Merge pull request #53 from vmuriart/Refactor-ticker_counter
Refactor ticker_counter.py
iam-abbas committed Feb 13, 2021
2 parents 9233d65 + 272f2b0 commit 59cd6d0
Showing 2 changed files with 41 additions and 57 deletions.
28 changes: 14 additions & 14 deletions back/config/config.ini
@@ -1,17 +1,17 @@
 [FilteringOptions]
-StopWords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "you're", "you've", "you'll", "you'd", "your", "yours",
-             "yourself", "yourselves", "he", "him", "his", "himself", "she", "she's", "her", "hers", "herself", "it", "it's", "its",
-             "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "that'll",
-             "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does",
-             "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for",
-             "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from",
-             "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when",
-             "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not",
-             "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "don't", "should", "should've",
-             "now", "d", "ll", "m", "o", "re", "ve", "y", "ain", "aren", "aren't", "couldn", "couldn't", "didn", "didn't", "doesn",
-             "doesn't", "hadn", "hadn't", "hasn", "hasn't", "haven", "haven't", "isn", "isn't", "ma", "mightn", "mightn't", "must",
-             "mustn", "mustn't", "needn", "needn't", "shan", "shan't", "shouldn", "shouldn't", "wasn", "wasn't", "weren", "weren't",
-             "won", "won't", "wouldn", "wouldn't"]
+StopWords = ["I", "ME", "MY", "MYSELF", "WE", "OUR", "OURS", "OURSELVES", "YOU", "YOU'RE", "YOU'VE", "YOU'LL", "YOU'D", "YOUR", "YOURS",
+             "YOURSELF", "YOURSELVES", "HE", "HIM", "HIS", "HIMSELF", "SHE", "SHE'S", "HER", "HERS", "HERSELF", "IT", "IT'S", "ITS",
+             "ITSELF", "THEY", "THEM", "THEIR", "THEIRS", "THEMSELVES", "WHAT", "WHICH", "WHO", "WHOM", "THIS", "THAT", "THAT'LL",
+             "THESE", "THOSE", "AM", "IS", "ARE", "WAS", "WERE", "BE", "BEEN", "BEING", "HAVE", "HAS", "HAD", "HAVING", "DO", "DOES",
+             "DID", "DOING", "A", "AN", "THE", "AND", "BUT", "IF", "OR", "BECAUSE", "AS", "UNTIL", "WHILE", "OF", "AT", "BY", "FOR",
+             "WITH", "ABOUT", "AGAINST", "BETWEEN", "INTO", "THROUGH", "DURING", "BEFORE", "AFTER", "ABOVE", "BELOW", "TO", "FROM",
+             "UP", "DOWN", "IN", "OUT", "ON", "OFF", "OVER", "UNDER", "AGAIN", "FURTHER", "THEN", "ONCE", "HERE", "THERE", "WHEN",
+             "WHERE", "WHY", "HOW", "ALL", "ANY", "BOTH", "EACH", "FEW", "MORE", "MOST", "OTHER", "SOME", "SUCH", "NO", "NOR", "NOT",
+             "ONLY", "OWN", "SAME", "SO", "THAN", "TOO", "VERY", "S", "T", "CAN", "WILL", "JUST", "DON", "DON'T", "SHOULD", "SHOULD'VE",
+             "NOW", "D", "LL", "M", "O", "RE", "VE", "Y", "AIN", "AREN", "AREN'T", "COULDN", "COULDN'T", "DIDN", "DIDN'T", "DOESN",
+             "DOESN'T", "HADN", "HADN'T", "HASN", "HASN'T", "HAVEN", "HAVEN'T", "ISN", "ISN'T", "MA", "MIGHTN", "MIGHTN'T", "MUST",
+             "MUSTN", "MUSTN'T", "NEEDN", "NEEDN'T", "SHAN", "SHAN'T", "SHOULDN", "SHOULDN'T", "WASN", "WASN'T", "WEREN", "WEREN'T",
+             "WON", "WON'T", "WOULDN", "WOULDN'T"]
 BlockWords = ["DIP", "", "$", "RH", "YOLO", "PORN", "BEST", "MOON", "HOLD", "FAKE", "WISH", "USD", "EV", "MARK", "RELAX", "LOL", "LMAO",
-              "LMFAO", "EPS", "DCF", "NYSE", "FTSE", "APE", "CEO", "CTO", "FUD", "DD", "AM", "PM", "FDD", "EDIT", "TA", "UK", "AMC", "GME"]
+              "LMFAO", "EPS", "DCF", "NYSE", "FTSE", "APE", "CEO", "CTO", "FUD", "DD", "AM", "PM", "FDD", "EDIT", "TA", "UK", "AMC", "GME"]
 Subreddits = ["robinhoodpennystocks","pennystocks"]
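For reference, the values in this section are JSON arrays embedded in an INI file: configparser joins the indented continuation lines into a single string, which json.loads then parses. Switching the stop words to uppercase lets them be subtracted directly from the uppercase ticker universe. A minimal sketch of how the refactored `__init__` reads this section, using the paths from this repo:

```python
import configparser
import json

# configparser merges the indented continuation lines, so each
# option value arrives as one JSON string ready for json.loads.
config = configparser.ConfigParser()
config.read('./config/config.ini')

stop_words = set(json.loads(config['FilteringOptions']['StopWords']))
block_words = set(json.loads(config['FilteringOptions']['BlockWords']))
subreddits = json.loads(config['FilteringOptions']['Subreddits'])

print(f'{len(stop_words)} stop words, {len(block_words)} block words, {subreddits}')
```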
70 changes: 27 additions & 43 deletions back/ticker_counts.py
@@ -3,8 +3,7 @@
 import json
 import re
 from collections import Counter, namedtuple
-from functools import reduce
-from operator import add
+from itertools import chain
 from pathlib import Path
 from typing import Set
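The import swap mirrors the counting change in get_data below: instead of building one Counter per title and merging them pairwise with `reduce(add, ...)`, the new code flattens all the per-title ticker sets with `chain.from_iterable` and counts in a single pass, avoiding an intermediate Counter per merge step. A quick sketch of the equivalence on toy data:

```python
from collections import Counter
from functools import reduce
from itertools import chain
from operator import add

rows = [['GME', 'AMC'], ['GME'], ['TSLA', 'GME']]  # toy per-title ticker lists

old = reduce(add, map(Counter, rows))     # one Counter per row, merged pairwise
new = Counter(chain.from_iterable(rows))  # flatten once, count once

assert old == new
print(new)  # Counter({'GME': 3, 'AMC': 1, 'TSLA': 1})
```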

@@ -16,39 +15,32 @@


 class TickerCounts:
-    WEBSCRAPER_LIMIT = 2000
-    config = configparser.ConfigParser()
-    config.read('./config/config.ini')
-    stop_words = json.loads(config['FilteringOptions']['StopWords'])
-    block_words = json.loads(config['FilteringOptions']['BlockWords'])
-    subreddits = json.loads(config['FilteringOptions']['Subreddits'])
-    with open('./config/tickers.json') as f:
-        tickers = json.load(f)
-
-    def verify_ticker(self, tick):
-        return tick in self.tickers
-
-    def extract_ticker(self, body: str, re_string: str = r'\$[A-Za-z]+|[A-Z]{2,}') -> Set[str]:
+    def __init__(self):
+        self.webscraper_limit = 2000
+        config = configparser.ConfigParser()
+        config.read('./config/config.ini')
+        self.subreddits = json.loads(config['FilteringOptions']['Subreddits'])
+
+        stop_words = set(json.loads(config['FilteringOptions']['StopWords']))
+        block_words = set(json.loads(config['FilteringOptions']['BlockWords']))
+        with open('./config/tickers.json') as f:
+            tickers = set(json.load(f))
+        exclude = stop_words | block_words
+        self.keep_tickers = tickers - exclude  # Remove words/tickers in exclude
+
+    def extract_ticker(self, text: str, pattern: str = r'(?<=\$)[A-Za-z]+|[A-Z]{2,}') -> Set[str]:
         """Simple Regex to get tickers from text."""
-        ticks = set(re.findall(re_string, str(body)))
-        res = set()
-        for item in ticks:
-            if item not in self.block_words and item.lower() not in self.stop_words and item:
-                try:
-                    tick = item.replace('$', '').upper()
-                    res.add(tick)
-                except Exception as e:
-                    print(e)
-        return res
+        ticks = set(re.findall(pattern, str(text)))
+        return ticks & self.keep_tickers  # Keep overlap

     def _get_posts(self):
-        # Scrape subreddits `r/robinhoodpennystocks` and `r/pennystocks`
-        # Current it does fetch a lot of additional data like upvotes, comments, awards etc but not using anything apart from title for now
+        # Scrape subreddits. Currently it fetches additional data, but only the title is used for now
         reddit = praw.Reddit('ClientSecrets')
         subreddits = '+'.join(self.subreddits)
-        new_bets = reddit.subreddit(subreddits).new(limit=self.WEBSCRAPER_LIMIT)
+        new_bets = reddit.subreddit(subreddits).new(limit=self.webscraper_limit)

-        for post in tqdm(new_bets, desc='Selecting relevant data from webscraper', total=self.WEBSCRAPER_LIMIT):
+        for post in tqdm(new_bets, desc='Selecting relevant data from webscraper', total=self.webscraper_limit):
             yield Post(
                 post.id,
                 post.title,
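The rewritten extract_ticker replaces the per-item loop, verify_ticker, and the try/except with one set intersection against a precomputed whitelist, and the `(?<=\$)` lookbehind means `$GME` matches as just `GME`, so the old `replace('$', '')` and `.upper()` steps go away. A standalone sketch of the behaviour, where the three-symbol whitelist is hypothetical and stands in for `self.keep_tickers`:

```python
import re
from typing import Set

# Hypothetical whitelist standing in for self.keep_tickers
# (valid tickers minus stop words and block words).
KEEP_TICKERS = {'GME', 'AMC', 'TSLA'}

def extract_ticker(text: str, pattern: str = r'(?<=\$)[A-Za-z]+|[A-Z]{2,}') -> Set[str]:
    # The lookbehind matches the symbol after '$' without capturing the '$' itself.
    ticks = set(re.findall(pattern, str(text)))
    return ticks & KEEP_TICKERS  # intersection drops stop words, block words, and noise

print(extract_ticker('YOLO $GME to the MOON, also holding AMC'))
# {'GME', 'AMC'} -- set order may vary; YOLO and MOON are not in the whitelist
```

One side effect worth noting: with `.upper()` gone, a lowercase cashtag like `$gme` no longer matches the uppercase whitelist.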
@@ -61,21 +53,13 @@ def _get_posts(self):
     def get_data(self):
         df_posts = pd.DataFrame(self._get_posts())

-        # Extract tickers from all titles and create a new column
-        df_posts['Tickers'] = df_posts['title'].apply(self.extract_ticker)
-        tickers = df_posts['Tickers']
-
-        # Count number of occurrences of the Ticker and verify id the Ticker exists
-        counts = reduce(add, map(Counter, tickers))
-
-        verified_ticks = {}
-        for ticker, ticker_count in tqdm(counts.items(), desc='Filtering verified ticks'):
-            # If ticker is found more than 3 times and ticker is valid
-            if ticker_count > 3 and self.verify_ticker(ticker):
-                verified_ticks[ticker] = ticker_count
+        # Extract tickers from titles & count them
+        tickers = df_posts['title'].apply(self.extract_ticker)
+        counts = Counter(chain.from_iterable(tickers))

-        # Create Datable of just mentions
-        df_tick = pd.DataFrame(verified_ticks.items(), columns=['Ticker', 'Mentions'])
+        # Create DataFrame of mentions & drop tickers mentioned 3 times or fewer
+        df_tick = pd.DataFrame(counts.items(), columns=['Ticker', 'Mentions'])
+        df_tick = df_tick[df_tick['Mentions'] > 3]
         df_tick = df_tick.sort_values(by=['Mentions'], ascending=False)

         data_directory = Path('./data')
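The verified_ticks loop becomes plain pandas in get_data: build a frame from the Counter, keep rows with more than 3 mentions, and sort. A toy run of that tail end of the pipeline, with made-up tallies:

```python
from collections import Counter

import pandas as pd

counts = Counter({'GME': 120, 'AMC': 45, 'TSLA': 3, 'NOK': 2})  # made-up tallies

df_tick = pd.DataFrame(counts.items(), columns=['Ticker', 'Mentions'])
df_tick = df_tick[df_tick['Mentions'] > 3]  # drop tickers mentioned 3 times or fewer
df_tick = df_tick.sort_values(by=['Mentions'], ascending=False)

print(df_tick)
#   Ticker  Mentions
# 0    GME       120
# 1    AMC        45
```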

