Merge pull request #53 from vmuriart/Refactor-ticker_counter
Refactor ticker_counter.py
iam-abbas committed Feb 13, 2021
2 parents 9233d65 + 272f2b0 commit 59cd6d0
Showing 2 changed files with 41 additions and 57 deletions.
28 changes: 14 additions & 14 deletions back/config/config.ini
@@ -1,17 +1,17 @@
 [FilteringOptions]
-StopWords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "you're", "you've", "you'll", "you'd", "your", "yours",
-             "yourself", "yourselves", "he", "him", "his", "himself", "she", "she's", "her", "hers", "herself", "it", "it's", "its",
-             "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "that'll",
-             "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does",
-             "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for",
-             "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from",
-             "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when",
-             "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not",
-             "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "don't", "should", "should've",
-             "now", "d", "ll", "m", "o", "re", "ve", "y", "ain", "aren", "aren't", "couldn", "couldn't", "didn", "didn't", "doesn",
-             "doesn't", "hadn", "hadn't", "hasn", "hasn't", "haven", "haven't", "isn", "isn't", "ma", "mightn", "mightn't", "must",
-             "mustn", "mustn't", "needn", "needn't", "shan", "shan't", "shouldn", "shouldn't", "wasn", "wasn't", "weren", "weren't",
-             "won", "won't", "wouldn", "wouldn't"]
+StopWords = ["I", "ME", "MY", "MYSELF", "WE", "OUR", "OURS", "OURSELVES", "YOU", "YOU'RE", "YOU'VE", "YOU'LL", "YOU'D", "YOUR", "YOURS",
+             "YOURSELF", "YOURSELVES", "HE", "HIM", "HIS", "HIMSELF", "SHE", "SHE'S", "HER", "HERS", "HERSELF", "IT", "IT'S", "ITS",
+             "ITSELF", "THEY", "THEM", "THEIR", "THEIRS", "THEMSELVES", "WHAT", "WHICH", "WHO", "WHOM", "THIS", "THAT", "THAT'LL",
+             "THESE", "THOSE", "AM", "IS", "ARE", "WAS", "WERE", "BE", "BEEN", "BEING", "HAVE", "HAS", "HAD", "HAVING", "DO", "DOES",
+             "DID", "DOING", "A", "AN", "THE", "AND", "BUT", "IF", "OR", "BECAUSE", "AS", "UNTIL", "WHILE", "OF", "AT", "BY", "FOR",
+             "WITH", "ABOUT", "AGAINST", "BETWEEN", "INTO", "THROUGH", "DURING", "BEFORE", "AFTER", "ABOVE", "BELOW", "TO", "FROM",
+             "UP", "DOWN", "IN", "OUT", "ON", "OFF", "OVER", "UNDER", "AGAIN", "FURTHER", "THEN", "ONCE", "HERE", "THERE", "WHEN",
+             "WHERE", "WHY", "HOW", "ALL", "ANY", "BOTH", "EACH", "FEW", "MORE", "MOST", "OTHER", "SOME", "SUCH", "NO", "NOR", "NOT",
+             "ONLY", "OWN", "SAME", "SO", "THAN", "TOO", "VERY", "S", "T", "CAN", "WILL", "JUST", "DON", "DON'T", "SHOULD", "SHOULD'VE",
+             "NOW", "D", "LL", "M", "O", "RE", "VE", "Y", "AIN", "AREN", "AREN'T", "COULDN", "COULDN'T", "DIDN", "DIDN'T", "DOESN",
+             "DOESN'T", "HADN", "HADN'T", "HASN", "HASN'T", "HAVEN", "HAVEN'T", "ISN", "ISN'T", "MA", "MIGHTN", "MIGHTN'T", "MUST",
+             "MUSTN", "MUSTN'T", "NEEDN", "NEEDN'T", "SHAN", "SHAN'T", "SHOULDN", "SHOULDN'T", "WASN", "WASN'T", "WEREN", "WEREN'T",
+             "WON", "WON'T", "WOULDN", "WOULDN'T"]
 BlockWords = ["DIP", "", "$", "RH", "YOLO", "PORN", "BEST", "MOON", "HOLD", "FAKE", "WISH", "USD", "EV", "MARK", "RELAX", "LOL", "LMAO",
-              "LMFAO", "EPS", "DCF", "NYSE", "FTSE", "APE", "CEO", "CTO", "FUD", "DD", "AM", "PM", "FDD", "EDIT", "TA", "UK", "AMC", "GME"]
+              "LMFAO", "EPS", "DCF", "NYSE", "FTSE", "APE", "CEO", "CTO", "FUD", "DD", "AM", "PM", "FDD", "EDIT", "TA", "UK", "AMC", "GME"]
 Subreddits = ["robinhoodpennystocks","pennystocks"]
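For reference, the values in this section are JSON arrays embedded in an INI file: configparser joins the indented continuation lines into a single string, which json.loads then parses. Switching the stop words to uppercase lets them be subtracted directly from the uppercase ticker universe. A minimal sketch of how the refactored `__init__` reads this section, using the paths from this repo:

```python
import configparser
import json

# configparser merges the indented continuation lines, so each
# option value arrives as one JSON string ready for json.loads.
config = configparser.ConfigParser()
config.read('./config/config.ini')

stop_words = set(json.loads(config['FilteringOptions']['StopWords']))
block_words = set(json.loads(config['FilteringOptions']['BlockWords']))
subreddits = json.loads(config['FilteringOptions']['Subreddits'])

print(f'{len(stop_words)} stop words, {len(block_words)} block words, {subreddits}')
```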
70 changes: 27 additions & 43 deletions back/ticker_counts.py
@@ -3,8 +3,7 @@
 import json
 import re
 from collections import Counter, namedtuple
-from functools import reduce
-from operator import add
+from itertools import chain
 from pathlib import Path
 from typing import Set
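The import swap mirrors the counting change in get_data below: instead of building one Counter per title and merging them pairwise with `reduce(add, ...)`, the new code flattens all the per-title ticker sets with `chain.from_iterable` and counts in a single pass, avoiding an intermediate Counter per merge step. A quick sketch of the equivalence on toy data:

```python
from collections import Counter
from functools import reduce
from itertools import chain
from operator import add

rows = [['GME', 'AMC'], ['GME'], ['TSLA', 'GME']]  # toy per-title ticker lists

old = reduce(add, map(Counter, rows))     # one Counter per row, merged pairwise
new = Counter(chain.from_iterable(rows))  # flatten once, count once

assert old == new
print(new)  # Counter({'GME': 3, 'AMC': 1, 'TSLA': 1})
```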

@@ -16,39 +15,32 @@


 class TickerCounts:
-    WEBSCRAPER_LIMIT = 2000
-    config = configparser.ConfigParser()
-    config.read('./config/config.ini')
-    stop_words = json.loads(config['FilteringOptions']['StopWords'])
-    block_words = json.loads(config['FilteringOptions']['BlockWords'])
-    subreddits = json.loads(config['FilteringOptions']['Subreddits'])
-    with open('./config/tickers.json') as f:
-        tickers = json.load(f)
-
-    def verify_ticker(self, tick):
-        return tick in self.tickers
-
-    def extract_ticker(self, body: str, re_string: str = r'\$[A-Za-z]+|[A-Z]{2,}') -> Set[str]:
+    def __init__(self):
+        self.webscraper_limit = 2000
+        config = configparser.ConfigParser()
+        config.read('./config/config.ini')
+        self.subreddits = json.loads(config['FilteringOptions']['Subreddits'])
+
+        stop_words = set(json.loads(config['FilteringOptions']['StopWords']))
+        block_words = set(json.loads(config['FilteringOptions']['BlockWords']))
+        with open('./config/tickers.json') as f:
+            tickers = set(json.load(f))
+        exclude = stop_words | block_words
+        self.keep_tickers = tickers - exclude  # Remove words/tickers in exclude
+
+    def extract_ticker(self, text: str, pattern: str = r'(?<=\$)[A-Za-z]+|[A-Z]{2,}') -> Set[str]:
         """Simple Regex to get tickers from text."""
-        ticks = set(re.findall(re_string, str(body)))
-        res = set()
-        for item in ticks:
-            if item not in self.block_words and item.lower() not in self.stop_words and item:
-                try:
-                    tick = item.replace('$', '').upper()
-                    res.add(tick)
-                except Exception as e:
-                    print(e)
-        return res
+        ticks = set(re.findall(pattern, str(text)))
+        return ticks & self.keep_tickers  # Keep overlap

     def _get_posts(self):
-        # Scrape subreddits `r/robinhoodpennystocks` and `r/pennystocks`
-        # Current it does fetch a lot of additional data like upvotes, comments, awards etc but not using anything apart from title for now
+        # Scrape subreddits. Currently it fetches additional data, but only the title is used for now
         reddit = praw.Reddit('ClientSecrets')
         subreddits = '+'.join(self.subreddits)
-        new_bets = reddit.subreddit(subreddits).new(limit=self.WEBSCRAPER_LIMIT)
+        new_bets = reddit.subreddit(subreddits).new(limit=self.webscraper_limit)

-        for post in tqdm(new_bets, desc='Selecting relevant data from webscraper', total=self.WEBSCRAPER_LIMIT):
+        for post in tqdm(new_bets, desc='Selecting relevant data from webscraper', total=self.webscraper_limit):
             yield Post(
                 post.id,
                 post.title,
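The rewritten extract_ticker replaces the per-item loop, verify_ticker, and the try/except with one set intersection against a precomputed whitelist, and the `(?<=\$)` lookbehind means `$GME` matches as just `GME`, so the old `replace('$', '')` and `.upper()` steps go away. A standalone sketch of the behaviour, where the three-symbol whitelist is hypothetical and stands in for `self.keep_tickers`:

```python
import re
from typing import Set

# Hypothetical whitelist standing in for self.keep_tickers
# (valid tickers minus stop words and block words).
KEEP_TICKERS = {'GME', 'AMC', 'TSLA'}

def extract_ticker(text: str, pattern: str = r'(?<=\$)[A-Za-z]+|[A-Z]{2,}') -> Set[str]:
    # The lookbehind matches the symbol after '$' without capturing the '$' itself.
    ticks = set(re.findall(pattern, str(text)))
    return ticks & KEEP_TICKERS  # intersection drops stop words, block words, and noise

print(extract_ticker('YOLO $GME to the MOON, also holding AMC'))
# {'GME', 'AMC'} -- set order may vary; YOLO and MOON are not in the whitelist
```

One side effect worth noting: with `.upper()` gone, a lowercase cashtag like `$gme` no longer matches the uppercase whitelist.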
@@ -61,21 +53,13 @@ def _get_posts(self):
     def get_data(self):
         df_posts = pd.DataFrame(self._get_posts())

-        # Extract tickers from all titles and create a new column
-        df_posts['Tickers'] = df_posts['title'].apply(self.extract_ticker)
-        tickers = df_posts['Tickers']
-
-        # Count number of occurrences of the Ticker and verify id the Ticker exists
-        counts = reduce(add, map(Counter, tickers))
-
-        verified_ticks = {}
-        for ticker, ticker_count in tqdm(counts.items(), desc='Filtering verified ticks'):
-            # If ticker is found more than 3 times and ticker is valid
-            if ticker_count > 3 and self.verify_ticker(ticker):
-                verified_ticks[ticker] = ticker_count
+        # Extract tickers from titles & count them
+        tickers = df_posts['title'].apply(self.extract_ticker)
+        counts = Counter(chain.from_iterable(tickers))

-        # Create Datable of just mentions
-        df_tick = pd.DataFrame(verified_ticks.items(), columns=['Ticker', 'Mentions'])
+        # Create DataFrame of mentions & drop tickers mentioned 3 times or fewer
+        df_tick = pd.DataFrame(counts.items(), columns=['Ticker', 'Mentions'])
+        df_tick = df_tick[df_tick['Mentions'] > 3]
         df_tick = df_tick.sort_values(by=['Mentions'], ascending=False)

         data_directory = Path('./data')
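The verified_ticks loop becomes plain pandas in get_data: build a frame from the Counter, keep rows with more than 3 mentions, and sort. A toy run of that tail end of the pipeline, with made-up tallies:

```python
from collections import Counter

import pandas as pd

counts = Counter({'GME': 120, 'AMC': 45, 'TSLA': 3, 'NOK': 2})  # made-up tallies

df_tick = pd.DataFrame(counts.items(), columns=['Ticker', 'Mentions'])
df_tick = df_tick[df_tick['Mentions'] > 3]  # drop tickers mentioned 3 times or fewer
df_tick = df_tick.sort_values(by=['Mentions'], ascending=False)

print(df_tick)
#   Ticker  Mentions
# 0    GME       120
# 1    AMC        45
```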

