In [1]:
import pandas as pd, numpy as np, sqlite3 as sql, datetime as dt, re, time, yfinance as yf, psutil
from dateutil.relativedelta import relativedelta
import nltk
import os, gc
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [6]:
from api_key import cryptor

In [1174]:
# Current tables 
with sql.connect('../data/interim/companies.db') as con:
        port = pd.read_sql(f"SELECT Date date, Open, High, Low, Close, Volume, Volatility, Turnover, symbol FROM daily ORDER BY date", con=con, parse_dates={'date': '%Y-%m-%d %H:%M:%S'})\
                .drop_duplicates(subset=['date', 'symbol'])
        recommends = pd.read_sql(f"SELECT Date date, symbol, Firm, new_grade, prev_grade, Action from recommendations ORDER BY Date", con=con, parse_dates={'date': '%Y-%m-%d %H:%M:%S'})
        arts =pd.read_sql("SELECT date, symbol, publisher, pos_sent, neu_sent, neg_sent, comp_sent FROM articles ORDER BY date", con=con, parse_dates={'date': '%Y-%m-%d %H:%M:%S'})
        crypt_arts = pd.read_sql("SELECT date, symbol, publisher,pos_sent, neu_sent, neg_sent, comp_sent  FROM news_sentiment ORDER BY date", con=con, parse_dates={'date': '%Y-%m-%d %H:%M:%S'})
        articles = pd.concat([arts, crypt_arts], axis=0, ignore_index=True)
        comments = pd.read_sql(f"SELECT DATE(timestamp) date, channel, symbols, pos_sent, neu_sent, neg_sent, comp_sent from symbol_comments ORDER BY timestamp", parse_dates={'date': '%Y-%m-%d'}, con=con)
        comments.loc[:, "symbols"] = comments.symbols.apply(lambda x: x.replace('BTC', 'BTC-USD'))
        companies = tuple(port.symbol.unique())
        c_data = pd.read_sql(f"SELECT * from mentions WHERE symbol IN {companies}", con=con, index_col='pk')

In [1175]:
# Comments 
symbols_re = re.compile(r"\[|\]|\'|\'")
last_index = comments.index.max()

In [1165]:
# Comments
# decompose for single symbol / use explode
for i, row in comments.iterrows():
    symbols = re.sub(symbols_re, "", row.symbols)
    symbols = symbols.split(',')
    for sym in symbols:
        last_index+=1
        comments.loc[last_index, ["symbols"]] = sym
        comments.loc[last_index, ["comment_index"]] = i
        comments.loc[last_index, ["date", "channel", "pos_sent", "neu_sent", "neg_sent", "comp_sent"]] = row.date, row.channel, row.pos_sent, row.neu_sent,  row.neg_sent, row.comp_sent


In [None]:
# Comments
comments.assign(sym = lambda x: x.symbols.apply(lambda x: re.sub(symbols_re, '', x)).apply(str.split, sep=',')).explode('sym').reset_index()

In [1166]:
# Comments
comments = comments[lambda x:~( x.comment_index.isnull())]
comments = comments[lambda x: x.symbols.isin(companies)]

In [1167]:
# Analyst Recommendations
recommendsDict = {"Very Bearish": 1, "Bearish": 2, "Neutral": 3, "Bullish": 4, "Very Bullish": 5}

In [1168]:
# Analyst Recommendations
recommends=recommends.assign(new_sent = lambda x: x.new_grade.apply(lambda g: recommendsDict[g]))\
    .assign(prev_sent = lambda x: x.prev_grade.apply(lambda g: recommendsDict[g]))

In [1169]:
# Financial Data
port.columns

Index(['date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Volatility',
       'Turnover', 'symbol'],
      dtype='object')

In [None]:
# Financial Data
# Add Recent Month's Financial Data 
def create_daily_data(ticker):
    tick = yf.ticker.Ticker(ticker)
    historical_data = tick.history("1mo")
    outstanding = tick.info.get("sharesOutstanding")
    if outstanding == None:
        outstanding = 1
    daily_close = historical_data["Close"]
    pct_change = daily_close.pct_change().fillna(0)
    periods = 2
    # calc volatility
    vola = (pct_change.rolling(periods).std() * np.sqrt(periods)).fillna(0)
    historical_data = historical_data.assign(Volatility = vola)
    historical_data = historical_data.assign(Turnover = lambda x: x.Volume / outstanding)
    historical_data = historical_data.assign(symbol = ticker)
    return historical_data.reset_index().round({"Volatility": 6, "Turnover": 6})

In [1170]:
# Custom Class: Originally labeled to be Evaluate Aggregate Trade, but ended up as Agg and Trade; Web App and its visualizations will be the evaluation tools  
# take aggregations over a frequency time period; make buying decisions based off the frequency of data points and sentiments
# return port with new information: shares and cost * shares
class EAT():
    def __init__(self, portfolio, articles, comments, recs, start, end):
        self.portfolio = portfolio.copy(deep=True)
        self.postions = []
        self.start = start
        self.end = end
        self.articles = articles[lambda x: (x.date >= start) & (x.date <= end)]
        self.comments =  comments[lambda x: (x.date >= start) & (x.date <= end)]
        self.recs = recs[lambda x: (x.date >= start) & (x.date <= end)]

        self.aggs = {}

    def aggregate(self):
        articles_agg = self.articles.groupby([pd.Grouper(key="date", freq="1Y"), 'symbol'])\
            .agg({'pos_sent': ['mean'], 'neg_sent': ['mean'], 'neu_sent': ['mean'], 'comp_sent': ['mean', 'count']}).assign(type=lambda x: 'News')
        comments_agg = self.comments.groupby([pd.Grouper(key="date", freq="1Y"), 'symbols'])\
            .agg({'pos_sent': ['mean'], 'neg_sent': ['mean'], 'neu_sent': ['mean'], 'comp_sent': ['mean', 'count']}).assign(type=lambda x: 'Chats')
        recommends_agg = self.recs.groupby([pd.Grouper(key="date", freq="1Y"), 'symbol'])\
            .agg({'new_sent': ['mean'], 'prev_sent': ['mean', 'count']}).assign(type=lambda x: 'Analysts')
        recommends_agg = recommends_agg.reset_index()
        comments_agg = comments_agg.reset_index()
        articles_agg = articles_agg.reset_index()
        recommends_agg.columns = recommends_agg.columns.droplevel(1)
        comments_agg.columns = comments_agg.columns.droplevel(1)
        articles_agg.columns = articles_agg.columns.droplevel(1)
        
        articles_agg.columns = ['date', 'symbol', 'pos_sent', 'neg_sent', 'neu_sent', 'comp_sent',
       'counts', 'type']
        comments_agg.columns = ['date', 'symbol', 'pos_sent', 'neg_sent', 'neu_sent', 'comp_sent',
       'counts', 'type']
        recommends_agg.columns = ['date', 'symbol', 'new_sent', 'prev_sent', 'counts', 'type']
        # comments_agg=comments_agg.assign(date = lambda x: x.date.apply(lambda x: x.date))
        self.aggs['recommendations'] = recommends_agg
        self.aggs['articles'] = articles_agg
        self.aggs['comments'] = comments_agg
        return None 


    def tradeSents(self, agg, label, min_samples, min_comp_sent, shares):
        # add action, shares, cost
        returns = self.aggs[agg][lambda x: (x.date >= self.start) & (x[label] >= min_comp_sent) & (x.counts >= min_samples)]
        # query portfolio for first cost add columns
        indexes = pd.Int64Index([])
        for date, sym in returns.loc[:, ['date', 'symbol']].values:
            # ns = returns[lambda x: x.date == date].shape[0]
            if sym not in self.postions:
                self.postions.append(sym)
                f1_date = (date + relativedelta(years=1)).to_pydatetime()
                indexes = self.portfolio[lambda x: ((x.date > date) & (x.symbol == sym) & (x.date <= f1_date))].index
                self.portfolio.loc[indexes, "shares"] = shares
            else:
                self.postions.append(sym)
                f1_date = (date + relativedelta(years=1)).to_pydatetime()
                indexes = self.portfolio[lambda x: ((x.date > date) & (x.symbol == sym) & (x.date <= f1_date))].index
                self.portfolio.loc[indexes, "shares"] = shares * self.postions.count(sym)
            
            i = returns[lambda x: (x.date == date) & (x.symbol == sym)].index
            if not indexes.empty:
                returns.loc[i, 'cost'] = shares * self.portfolio.loc[indexes[0], "Close"]
                returns.loc[i, 'returns'] = shares * self.portfolio.loc[indexes[-1], "Close"]
            else:
                indexes = self.portfolio[lambda x: (x.symbol == sym)].index
                returns.loc[i, 'cost'] = shares * self.portfolio.loc[indexes[-1], "Open"]
                returns.loc[i, 'returns'] = shares * self.portfolio.loc[indexes[-1], "Close"]

        return self.portfolio#returns
        


In [1171]:
# Custom Class
eat = EAT(port, articles, comments, recommends, dt.datetime(2018, 1, 1), dt.datetime(2022, 1, 30))

In [1172]:
# Custom Class; Must be called first
eat.aggregate()

In [1173]:
# Custom Class
# Trade Based on Text Data Sources
eat.tradeSents("comments", "comp_sent", min_samples=1, min_comp_sent=0.15, shares=10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


Unnamed: 0,date,Open,High,Low,Close,Volume,Volatility,Turnover,symbol,shares
0,2017-01-03,42.972000,44.066002,42.192001,43.397999,29616500,0.000000,2.950000e-02,TSLA,
1,2017-01-03,11.420000,11.650000,11.020000,11.430000,55182000,0.000000,4.570000e-02,AMD,
2,2017-01-03,27.250112,27.374832,27.005378,27.332474,115127600,0.000000,7.000000e-03,AAPL,
3,2017-01-03,20.639999,21.840000,20.532000,21.360001,73033,0.000000,4.000000e-04,ACB,
4,2017-01-03,89.000000,89.000000,88.080002,88.599998,8789400,0.000000,3.200000e-03,BABA,
...,...,...,...,...,...,...,...,...,...,...
98728,2022-01-28,24.003000,25.709999,22.809999,25.639999,21496600,0.101186,7.133600e-02,PTON,
98729,2022-01-28,20.660000,21.320000,19.309999,20.900000,96497500,0.058454,6.066900e-02,NIO,
98730,2022-01-28,1.850000,1.870000,1.750000,1.780000,2441000,0.035781,2.085600e-02,SOLO,
98731,2022-01-29,37780.714844,38576.261719,37406.472656,38138.179688,17194183075,0.008032,1.719418e+10,BTC-USD,


In [956]:
# Custom Class
ret = eat.tradeSents("articles", "comp_sent", 100, 0.5, 10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [957]:
# Custom Class
ret = eat.tradeSents("recommendations", "new_sent", 25, 4, 10)

In [958]:
# eat.tradeSents("articles", "comp_sent", 100, 0.5, 10)
# eat.tradeSents("recommendations", "new_sent", 25, 4, 10)

In [1152]:
# Custom Class
# Analyst Only Strategy Returns
ret.groupby('date').sum().assign(r_pct = lambda x: (x.returns - x.cost) / x.cost)

Unnamed: 0_level_0,pos_sent,neg_sent,neu_sent,comp_sent,counts,cost,returns,r_pct
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-12-31,5.265895,0.623934,15.110028,8.133532,93,42329.746435,67754.449959,0.600634
2020-12-31,4.164832,0.921737,24.733701,5.878339,1405,330494.633604,502545.053391,0.520585


In [1153]:
# Custom Class
# News Only Strategy Returns
ret = eat.tradeSents("articles", "comp_sent", 100, 0.5, 10)
ret.groupby('date').sum().assign(r_pct = lambda x: (x.returns - x.cost) / x.cost)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


Unnamed: 0_level_0,pos_sent,neg_sent,neu_sent,comp_sent,counts,cost,returns,r_pct
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-12-31,1.886672,0.725528,19.387871,12.806051,6245,363631.928234,537816.95425,0.479015
2021-12-31,2.63562,0.961372,26.403193,18.72214,9305,90168.539557,73207.400317,-0.188105


In [1157]:
# Custom Class
ret = eat.tradeSents("recommendations", "new_sent", 25, 4, 10)
ret.groupby('date').sum().assign(r_pct = lambda x: (x.returns - x.cost) / x.cost)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


Unnamed: 0_level_0,new_sent,prev_sent,counts,cost,returns,r_pct
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-12-31,29.418501,29.140315,280,21844.340649,27577.331295,0.262447
2019-12-31,33.888142,33.519803,355,30153.087006,47727.473297,0.582839
2020-12-31,67.845811,67.276012,993,57397.397575,62125.701008,0.082378
2021-12-31,58.988827,58.416372,694,67120.639572,56591.00069,-0.156876


In [47]:
with sql.connect('../data/interim/companies.db') as con:
        port =pd.read_sql("SELECT * FROM daily ORDER BY date", con=con, parse_dates={'date': '%Y-%m-%d %H:%M:%S'})
        arts =pd.read_sql("SELECT * FROM articles ORDER BY date", con=con, parse_dates={'date': '%Y-%m-%d %H:%M:%S'})
        crypt_arts = pd.read_sql("SELECT * FROM crypt_articles ORDER BY date", con=sql.connect('../data/raw/crypt.db'))
        #articles = pd.concat([arts, crypt_arts], axis=0, ignore_index=True)
        comments = pd.read_sql(f"SELECT DATE(timestamp) date, channel, symbols, pos_sent, neu_sent, neg_sent, comp_sent from symbol_comments ORDER BY timestamp", parse_dates={'date': '%Y-%m-%d'}, con=con)
        comments.loc[:, "symbols"] = comments.symbols.apply(lambda x: x.replace('BTC', 'BTC-USD'))
        companies = tuple(port.symbol.unique())
        c_data = pd.read_sql(f"SELECT * from mentions WHERE symbol IN {companies}", con=con, index_col='pk')

In [48]:
# News Articles 
# Decrypt Byte-Encoded Data
decrypt_cols = [x for x in crypt_arts.columns if x not in ['pk', 'pos_sent', "neu_sent", "neg_sent", "comp_sent"]]
for col in decrypt_cols:
    crypt_arts.loc[:, col] = crypt_arts.loc[:, col].apply(bytes).apply(cryptor.decrypt).apply(str, encoding='utf-8')
    if col == 'date':
        crypt_arts.loc[:, col] = crypt_arts.loc[:, col].apply(str.split, sep=" ").apply(lambda x: x[0]).apply(pd.to_datetime)

In [49]:
# News Articles
crypt_arts

Unnamed: 0,pk,article,headline,comments,date,link,symbol,publisher,pos_sent,neg_sent,neu_sent,comp_sent
0,9,The Transportation Security Administration say...,TSA extends COVID mask rule for U.S. transport...,49,2021-08-17,https://seekingalpha.com/news/3731674-tsa-exte...,AAL,Seeking Alpha,0.042,0.042,0.916,0.2544
1,5705,"Parler, the social network popular with conser...",Parler comes back online after a month off the...,249,2021-02-16,https://seekingalpha.com/news/3662032-parler-c...,FB,Seeking Alpha,0.082,0.039,0.879,0.9363
2,1709,Amazon (NASDAQ:AMZN): Q1 GAAP EPS of $15.79 be...,"Amazon EPS beats by $6.18, beats on revenue",115,2021-04-29,https://seekingalpha.com/news/3688132-amazon-e...,AMZN,Seeking Alpha,0.065,0.000,0.935,0.7184
3,3240,"Blink Charging (BLNK +7.4%), EVgo (EVGO +8.4%)...",Electric vehicle battery stocks break higher a...,28,2021-07-28,https://seekingalpha.com/news/3720809-electric...,BLNK,Seeking Alpha,0.047,0.038,0.916,0.1280
4,5968,Teenagers across the U.S. are looking to Redmo...,TikTok rescue by Microsoft could make sense - ...,93,2020-08-01,https://seekingalpha.com/news/3598676-tiktok-r...,FB,Seeking Alpha,0.114,0.033,0.853,0.8934
...,...,...,...,...,...,...,...,...,...,...,...,...
15321,11885,"The Associated Press, NBC, Edison Research, Fo...",Joe Biden wins U.S. presidential election,23654,2020-11-07,https://seekingalpha.com/news/3633355-joe-bide...,QQQ,Seeking Alpha,0.059,0.048,0.893,0.0258
15322,9323,Animal spirits are firing up again on red-hot ...,Electric vehicle names head higher led by Kand...,126,2020-11-23,https://seekingalpha.com/news/3638678-electric...,NIO,Seeking Alpha,0.033,0.046,0.921,-0.2263
15323,2573,Boeing (BA +3.1%) bounces sharply higher after...,Boeing sales outpaced cancellations last month...,10,2021-03-09,https://seekingalpha.com/news/3670770-boeing-s...,BA,Seeking Alpha,0.021,0.066,0.913,-0.7005
15324,7455,"Workers in Arizona, Oregon, and New Mexico hav...",Intel workers file coronavirus safety complaints,15,2020-05-08,https://seekingalpha.com/news/3572138-intel-wo...,INTC,Seeking Alpha,0.133,0.087,0.779,0.9112


In [50]:
# News Articles
arts = arts.assign(comments = lambda x: 0)
arts = arts.loc[:, ['pk', 'article', 'title', 'comments','date', 'link', 'symbol', 'publisher','pos_sent', 'neg_sent', 'neu_sent', 'comp_sent']]

In [51]:
# News Articles
# nongreedy
regex = re.compile(r"© Reuters.+?(-|—)")

In [52]:
# News Articles
arts = arts.assign(article = lambda x: x.article.apply(lambda w: re.sub(regex, "", w)).apply(str.strip))

In [53]:
# News Articles
# Conform Columns
arts = arts.rename(columns={'title': 'headline'})

In [54]:
# News Articles
# Dummy Site Variable (seeking alpha's site or not)
arted = pd.concat([crypt_arts.assign(sa=lambda x: 1), arts.assign(sa=lambda x: 0)], axis=0, ignore_index=True)

In [55]:
# NLP
stop_words = set(nltk.corpus.stopwords.words("english"))
stop_words = stop_words.union({'said', 'us', 'also', 'inc', 'could', 'word', 'b', 'q', })

In [56]:
# NLP
def wordCounts(df, comment_array):
    comment_list = []
    for comment in comment_array:
        comment_dictionary = {} 
        punc_regex = r"[\，\’\'\'\"\"\“\”!\?@#$%&\(\)\*,-.\\\{\}+~\/:;<>\[\]^`|=_]"
        reg_bad_quotes = re.compile(u"[\x94\x93\x92\x91]")
        comment = comment.lower()
        comment = re.sub(punc_regex, " ", comment)
        comment = re.sub(reg_bad_quotes, " ", comment)
        words = nltk.tokenize.word_tokenize(comment)
        for w in words:
            if (w in comment_dictionary.keys()) & (w not in stop_words):
                comment_dictionary[w] += 1
            elif (w not in stop_words):
                comment_dictionary[w] = 1
        comments_words = dict(sorted(comment_dictionary.items(), key=lambda x: x[1], reverse=True))
        comment_list.append(comments_words)
    
    df = df.assign(word_obj = pd.Series(comment_list))
    return df

# word counts sans stopwords

In [57]:
# NLP
wordy_df = wordCounts(arted, arted.article.values)

In [58]:
# NLP
wordy_df2 = wordy_df.copy(deep=True)

In [59]:
# NLP
wordy_df2 = wordy_df2.assign(article = lambda x: x.article.apply(lambda line: re.sub(r'(?<=[.,])(?=[^\s])', r' ', line)))

In [60]:
# NLP
wordy_df3 = wordy_df2.assign(word_obj = lambda x: x.word_obj.apply(str))

In [61]:
#wordy_df3.to_sql('temp_table', con=sql.connect('temp.db'), if_exists='replace')

In [62]:
# NLP
wordy_df3

Unnamed: 0,pk,article,headline,comments,date,link,symbol,publisher,pos_sent,neg_sent,neu_sent,comp_sent,sa,word_obj
0,9,The Transportation Security Administration say...,TSA extends COVID mask rule for U.S. transport...,49,2021-08-17,https://seekingalpha.com/news/3731674-tsa-exte...,AAL,Seeking Alpha,0.042,0.042,0.916,0.2544,1,"{'transportation': 2, 'administration': 2, 'sa..."
1,5705,"Parler, the social network popular with conser...",Parler comes back online after a month off the...,249,2021-02-16,https://seekingalpha.com/news/3662032-parler-c...,FB,Seeking Alpha,0.082,0.039,0.879,0.9363,1,"{'parler': 7, 'users': 4, 'platform': 3, 'skys..."
2,1709,Amazon (NASDAQ:AMZN): Q1 GAAP EPS of $15. 79 b...,"Amazon EPS beats by $6.18, beats on revenue",115,2021-04-29,https://seekingalpha.com/news/3688132-amazon-e...,AMZN,Seeking Alpha,0.065,0.000,0.935,0.7184,1,"{'billion': 6, '5': 4, 'revenue': 3, '108': 3,..."
3,3240,"Blink Charging (BLNK +7. 4%), EVgo (EVGO +8. 4...",Electric vehicle battery stocks break higher a...,28,2021-07-28,https://seekingalpha.com/news/3720809-electric...,BLNK,Seeking Alpha,0.047,0.038,0.916,0.1280,1,"{'infrastructure': 5, 'bill': 5, '7': 4, 'ev':..."
4,5968,Teenagers across the U. S. are looking to Redm...,TikTok rescue by Microsoft could make sense - ...,93,2020-08-01,https://seekingalpha.com/news/3598676-tiktok-r...,FB,Seeking Alpha,0.114,0.033,0.853,0.8934,1,"{'would': 5, 'tiktok': 4, 'microsoft': 4, 'mak..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26805,7584,Stock in gaming and e-commerce firm Sea (NYSE:...,Sea Slumps as Tencent Moves to Cut Voting Stak...,0,2022-01-04,https://www.investing.com/news/stock-market-ne...,SE,Investing.com,0.084,0.046,0.870,0.7867,0,"{'tencent': 5, 'sea': 4, 'voting': 4, 'stake':..."
26806,1625,"On Wednesday, Palantir Technologies Inc (NYSE:...",Palantir Technologies Announces Collaboration ...,0,2022-01-05,https://www.investing.com/news/stock-market-ne...,PLTR,Investing.com,0.143,0.015,0.842,0.9630,0,"{'palantir': 6, 'platform': 5, 'hhi': 4, 'big'..."
26807,7583,Asia Pacific stocks were down on Thursday morn...,"Asian Stocks Down, Positive Chinese Data Fails...",0,2022-01-05,https://www.investing.com/news/stock-market-ne...,SE,Investing.com,0.068,0.034,0.898,0.9201,0,"{'services': 4, '1': 4, 'day': 4, 'u': 3, '0':..."
26808,9547,"The S&P 500 closed down Friday, marking its wo...",S&P 500 in Big Weekly Loss as Tech Bulls Scatt...,0,2022-01-07,https://www.investing.com/news/stock-market-ne...,ON,Investing.com,0.096,0.049,0.856,0.9733,0,"{'stocks': 5, 'nasdaq': 5, 'tech': 4, 'rate': ..."


In [63]:
# NLP
# encrypted encoding was utf8 and the .htm file encodings were latin-1 
bad_encode_regex = re.compile(u"[\x94\x93\x92\x91]")
wordy_df2 = wordy_df2.assign(article = lambda x: x.article.apply(lambda line: re.sub(r'(?<=[.,])(?=[^\s])', r' ', line)))
wordy_df2 = wordy_df2.assign(article = lambda x: x.article.apply(lambda line: re.sub(bad_encode_regex, r' ', line)))

In [64]:
# NLP
def wordCounts(comment_array):
    comment_dictionary = {} 
    for comment in comment_array:
        punc_regex = r"[\，\’\'\'\"\"\“\”!\?@#$%&\(\)\*,-.\\\{\}+~\/:;<>\[\]^`|=_]"
        reg_bad_quotes = re.compile(u"[\x94\x93\x92\x91]")
        comment = comment.lower()
        comment = re.sub(punc_regex, " ", comment)
        comment = re.sub(reg_bad_quotes, " ", comment)
        comment = re.sub(r"[0-9]", "", comment)
        words = nltk.tokenize.word_tokenize(comment)
        for w in words:
            if (w in comment_dictionary.keys()) & (w not in stop_words):
                comment_dictionary[w] += 1
            elif (w not in stop_words):
                comment_dictionary[w] = 1
    
    comments_words = dict(sorted(comment_dictionary.items(), key=lambda x: x[1], reverse=True))
    # comment_list.append(comments_words)

    # df = df.assign(word_obj = pd.Series(comment_list))
    return pd.Series(comments_words)

# word counts sans stopwords

In [65]:
# News Articles
# NLP
positive_article_words = wordCounts(wordy_df2[lambda x: x.comp_sent >= 0].article.values)

In [66]:
# News Articles
# NLP
negative_article_words = wordCounts(wordy_df2[lambda x: x.comp_sent < 0].article.values)

In [67]:
# News Articles
# NLP
negative_article_words = negative_article_words.reset_index().rename(columns={'index': 'word', 0: 'neg_counts'})

In [68]:
# News Articles
# NLP
positive_article_words = positive_article_words.reset_index().rename(columns={'index': 'word', 0: 'pos_counts'})

In [69]:
# News Articles
# NLP
positive_article_words[lambda x: x.word=='draftkings']

Unnamed: 0,word,pos_counts
739,draftkings,1114


In [70]:
# News Articles
# NLP
# the media loves draftkings (DKNG)
negative_article_words[lambda x: x.word == 'draftkings']

Unnamed: 0,word,neg_counts
2779,draftkings,56


In [71]:
# News Articles
# NLP
positive_example = wordCounts(wordy_df2[lambda x: (x.symbol == 'DKNG') & (x.comp_sent >= 0)].article).reset_index().rename(columns={'index': 'word', 0: 'neg_counts'})

In [72]:
# News Articles
# NLP
wordy_df2[lambda x: (x.date <= dt.datetime(2021, 2, 1)) & (x.date >= dt.datetime(2021, 1, 1))].groupby('symbol').count().sort_values('date').loc['TSLA', :]

pk           115
article      115
headline     115
comments     115
date         115
link         115
publisher    115
pos_sent     115
neg_sent     115
neu_sent     115
comp_sent    115
sa           115
word_obj     115
Name: TSLA, dtype: int64

In [73]:
# News Articles
# NLP
positive_article_words.merge(how='outer', right=negative_article_words).fillna(0).\
    assign(pos_pct = lambda x: x.pos_counts / (x.pos_counts + x.neg_counts)).\
        assign(neg_pct = lambda x: x.neg_counts / (x.pos_counts + x.neg_counts)).sort_values('neg_pct').\
            assign(counts = lambda x: (x.pos_counts + x.neg_counts)).sort_values('counts', ascending=False)


Unnamed: 0,word,pos_counts,neg_counts,pos_pct,neg_pct,counts
0,nasdaq,57260.0,12043.0,0.826227,0.173773,69303.0
1,nyse,41399.0,8421.0,0.830971,0.169029,49820.0
4,u,20841.0,6841.0,0.752872,0.247128,27682.0
2,company,21977.0,3720.0,0.855236,0.144764,25697.0
3,year,21347.0,3856.0,0.847002,0.152998,25203.0
...,...,...,...,...,...,...
51564,synbiotic,1.0,0.0,1.000000,0.000000,1.0
51563,minnett,1.0,0.0,1.000000,0.000000,1.0
51562,ord,1.0,0.0,1.000000,0.000000,1.0
51561,wfm,1.0,0.0,1.000000,0.000000,1.0


In [74]:
# NLP
# Sentiment Scores
sia = SentimentIntensityAnalyzer()

In [75]:
# NLP
# Sentiment Scores
def sentiment_art(art):
    return sia.polarity_scores(art)

In [76]:
# News Articles
# NLP
wordy_df_news = wordy_df2.assign(sentiment_dict = lambda x : x.article.apply(sentiment_art))

In [77]:
# News Articles
# NLP
wordy_df_news= wordy_df_news.assign(pos_sent = lambda x: x.sentiment_dict.apply(lambda y: y['pos']))\
    .assign(neg_sent = lambda x: x.sentiment_dict.apply(lambda y: y['neg']))\
        .assign(neu_sent = lambda x: x.sentiment_dict.apply(lambda y: y['neu']))\
            .assign(comp_sent = lambda x: x.sentiment_dict.apply(lambda y: y['compound']))

In [78]:
# News Articles
# NLP
wordy_df_news.drop('sentiment_dict', axis=1, inplace=True)

In [79]:
# News Articles
# NLP
wordy_df_news.groupby(pd.Grouper(key='date', freq="1M")).count()

Unnamed: 0_level_0,pk,article,headline,comments,link,symbol,publisher,pos_sent,neg_sent,neu_sent,comp_sent,sa,word_obj
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2018-01-31,1,1,1,1,1,1,1,1,1,1,1,1,1
2018-02-28,0,0,0,0,0,0,0,0,0,0,0,0,0
2018-03-31,0,0,0,0,0,0,0,0,0,0,0,0,0
2018-04-30,0,0,0,0,0,0,0,0,0,0,0,0,0
2018-05-31,0,0,0,0,0,0,0,0,0,0,0,0,0
2018-06-30,1,1,1,1,1,1,1,1,1,1,1,1,1
2018-07-31,0,0,0,0,0,0,0,0,0,0,0,0,0
2018-08-31,1,1,1,1,1,1,1,1,1,1,1,1,1
2018-09-30,0,0,0,0,0,0,0,0,0,0,0,0,0
2018-10-31,2,2,2,2,2,2,2,2,2,2,2,2,2


In [88]:
# Comments
# Stock Comments
comments#.groupby(pd.Grouper(key='date', freq='1M')).count()

Unnamed: 0,date,channel,symbols,pos_sent,neu_sent,neg_sent,comp_sent
0,2019-08-08,wetlqd-ideas,['AMD'],0.293,0.707,0.000,0.4404
1,2019-08-13,wetlqd-ideas,['MSFT'],0.182,0.818,0.000,0.4404
2,2019-08-13,wetlqd-ideas,['MSFT'],0.294,0.706,0.000,0.3612
3,2019-08-14,wetlqd-ideas,"['ON', 'MA', 'ING']",0.000,0.597,0.403,-0.4019
4,2019-08-14,wetlqd-ideas,['CRM'],0.000,1.000,0.000,0.0000
...,...,...,...,...,...,...,...
3389,2020-12-08,option-trading,['OOOO'],0.000,1.000,0.000,0.0000
3390,2020-12-08,option-trading,['ING'],0.900,0.100,0.000,0.8402
3391,2020-12-08,trading,['FSLY'],0.000,1.000,0.000,0.0000
3392,2020-12-08,trading,['IT'],0.000,1.000,0.000,0.0000


In [846]:
with sql.connect('../data/interim/discord/discord.db') as con:
    full_comments =pd.read_sql(f"SELECT DATE(timestamp) date, timestamp,channel, id, server, pk, content, isBot, mentions, emojis, links, chat_emotes from comments ORDER BY timestamp", parse_dates={'date': '%Y-%m-%d'}, con=con)
with sql.connect('../data/interim/companies.db') as con:
    wanted_companies = pd.read_sql("SELECT DISTINCT symbol FROM daily WHERE symbol NOT IN ('PT', 'IT', 'ON', 'ING')", con=con).symbol.values
    wanted_companies = tuple(wanted_companies)
    mentions = pd.read_sql(f"SELECT * from mentions WHERE symbol IN {wanted_companies}", con=con, index_col='pk')

In [849]:
# Comments
full_comments = full_comments.assign(timestamp=lambda x: x.timestamp.apply(pd.to_datetime))

In [850]:
# Comments
userIdCatcher = r"(<@(!)?\d+>|@everyone)"
discord_emote_regex = r"<:.+:\d+>"
url_regex = r"https*\:.+|www\..+"
punc_regex = r"[\’\'\'\"\"\“\”!\?@#$%&\(\)\*,-.\\\{\}+~\/:;<>\[\]^`|=_’]"
contract_regex = r"[\'\’\’]"

In [5]:
# Parse Comments
# Comments
def clean_comment(comment):
    userIdCatcher = r"(<@(!)?\d+>|@everyone)"
    discord_emote_regex = r"<:.+:\d+>"
    url_regex = r"https*\:.+|www\..+"
    punc_regex = r"[\’\'\'\"\"\“\”!\?@#$%&\(\)\*,-.\\\{\}+~\/:;<>\[\]^`|=_’]"
    contract_regex = r"[\'\’\’]"
    comment = re.sub("|".join([x for x in [userIdCatcher, discord_emote_regex, url_regex]]), "", comment)
    comment = re.sub(contract_regex, "", comment)
    comment = re.sub(punc_regex, " ", comment)
    comment = comment.replace("\n", " ").replace("'s", "")
    return " " + comment + " "

In [852]:
# Comments
full_comments = full_comments.assign(cc_comment = lambda x: x.content.apply(clean_comment).apply(str.lower))
full_comments = full_comments.assign(st_comment = lambda x: x.cc_comment.apply(sentiment_art))
full_comments = full_comments.assign(pos_sent = lambda x: x.st_comment.apply(lambda y: y['pos']))\
    .assign(neg_sent = lambda x: x.st_comment.apply(lambda y: y['neg']))\
        .assign(neu_sent = lambda x: x.st_comment.apply(lambda y: y['neu']))\
            .assign(comp_sent = lambda x: x.st_comment.apply(lambda y: y['compound'])).drop(['st_comment'], axis=1)

In [853]:
# Comments
company_set = set(x.lower() for x in mentions.symbol.values)

In [854]:
# Comments
def list_to_symbols(list_obj, set=company_set):
    symbols = []
    for l in list_obj:
        if l in set:
            symbols.append(l)
    return symbols

In [855]:
# Comments
full_comments = full_comments.assign(symbols = lambda x: x.cc_comment.apply(nltk.tokenize.word_tokenize).apply(lambda z: list_to_symbols(z)).apply(str))

In [856]:
# Comments
emote_agg = {}
def agg_(comma_companies, agg_dict):
    comma_companies = re.sub(r'[\[\]\'\']', "", comma_companies)
    for e in comma_companies.split(","):
        e = e.strip()
        if e in agg_dict.keys():
            emote_agg[e] += 1
        else:
            emote_agg[e] = 1

In [857]:
# Comments
full_comments.symbols.apply(lambda x: agg_(x, emote_agg))

0        None
1        None
2        None
3        None
4        None
         ... 
31929    None
31930    None
31931    None
31932    None
31933    None
Name: symbols, Length: 31934, dtype: object

In [858]:
# Comments
new_mentions = pd.Series(emote_agg).reset_index().iloc[1:, :].sort_values(0, ascending=False).reset_index(drop=True).rename(columns={'index': 'symbol', 0:'counts'})

In [859]:
# Comments
# Initial Counts and Additional Data Garnered
mentions.merge(new_mentions.assign(symbol=lambda x: x.symbol.apply(str.upper)), 'left', on='symbol').iloc[:29, :]

Unnamed: 0,symbol,counts_x,counts_y
0,SPY,167,313
1,TSLA,166,253
2,NIO,151,232
3,AMD,133,174
4,AAPL,114,174
5,DKNG,75,86
6,PLTR,74,99
7,ACB,60,74
8,ZM,59,69
9,BABA,55,105


In [860]:
# Comments
# NLP
comment_token_dict = {}

In [861]:
# Comments
# NLP
tokenSeries = wordCounts(full_comments.cc_comment.values).reset_index().rename(columns={'index': 'word', 0:'counts'})

In [862]:
# Comments
# NLP
commentsPositiveSeries = wordCounts(full_comments[lambda x: x.comp_sent >= 0.0].cc_comment.values).reset_index().rename(columns={'index': 'word', 0:'counts'})
commentsNegativeSeries = wordCounts(full_comments[lambda x: x.comp_sent <= 0.0].cc_comment.values).reset_index().rename(columns={'index': 'word', 0:'counts'})

In [863]:
# Comments
# NLP
commentsNegativeSeries[lambda x: x.word == 'spy']

Unnamed: 0,word,counts
52,spy,176


In [864]:
# Comments
# NLP
commentsPositiveSeries[lambda x: x.word == 'spy']

Unnamed: 0,word,counts
62,spy,269


In [865]:
# Comments
# NLP
mentions.symbol.values

array(['SPY', 'TSLA', 'NIO', 'AMD', 'AAPL', 'DKNG', 'PLTR', 'ACB', 'ZM',
       'BABA', 'FB', 'BA', 'WKHS', 'SQ', 'BYND', 'NVDA', 'IDEX', 'AMZN',
       'PFE', 'MSFT', 'NKLA', 'GNUS', 'FSLY', 'PTON', 'RSI', 'ROKU',
       'PINS', 'CRSR', 'PLUG', 'WWR', 'HYLN', 'SNDL', 'JD', 'SNAP', 'RKT',
       'ES', 'JKS', 'NNDM', 'GME', 'SE', 'MA', 'SQQQ', 'KO', 'MARA',
       'HEAR', 'CRM', 'CGC', 'IZEA', 'CBAT', 'VXX', 'INTC', 'SPCE', 'AAL',
       'SPI', 'AREC', 'NFLX', 'QQQ', 'UONE', 'BLNK', 'HD', 'DIS', 'MO',
       'TLRY', 'ADTX', 'LMNL', 'JMIA', 'EXAS', 'SOLO', 'XERS', 'DPW',
       'GILD', 'VLDR'], dtype=object)

In [6]:
# Comments
# NLP
# Adding the single tokens that represent the security
more_terms = {'SPY': ['SPDR', 'SP', 'SP500', 'SPY'],
'TSLA': ['Tesla', 'TSLA'],
'NIO': ['Nio', 'NIO'], 'AMD': ['AMD'], 'AAPL': ['Apple', 'AAPL'],
'DKNG': ['DraftKings', 'DKNG'],
'PLTR': ['Palantir', 'PLTR'], 'ACB':['Aurora', 'Aurora', 'PLTR'], 'ZM':['ZM', 'Zoom'],
'BABA':['Alibaba', 'BABA'], 'FB':['Facebook'], 'BA':['Boeing', 'BA'],
'WKHS':['Workhorse', 'WKHS'],
'SQ':['Square', 'SQ'], 'BYND':['Beyond Meat', 'bynd'],
'NVDA':['NVDA', 'Nvidia'], 'IDEX':['IDEX', 'Ideanomics'], 'AMZN':['amazon', 'AMZN'],
'PFE':['PFE', 'Pfzier'], 'MSFT':['Microsoft', 'MSFT'], 'NKLA':['Nikola', 'NKLA'], 'GNUS':['GNUS', 'Genius'],
'FSLY':['Flastly', 'FSLY'], 'PTON':['PTON', 'Peleton'], 'RSI':['RSI'], 'ROKU':['Roku', 'ROKU'],
'PINS':['PINS', 'Pinterest'], 'CRSR':['Corsair', 'CRSR'], 'PLUG':['Plug', 'PLUG'], 'WWR':['Westwater', 'WWR'],
'HYLN':['HYLN', 'Hyliion'], 'SNDL':['Sundial', 'SNDL'], 'JD':['JD'], 'SNAP':['snap', 'snapchat'],
'RKT':['Rocket', 'RKT'],
'ES':['Eversource', 'ES'], 'JKS':['JinkoSolar', 'JKS'],
'NNDM':['NNDM'], 'GME':['Gamestop', 'GME'], 'SE':['SE'], 'MA':['Mastercard', 'MA'], 'SQQQ':['Invesco', 'sqqq'],
'KO':['KO', 'Coke'], 'MARA':['MARA'],
'HEAR':['HEAR'], 'CRM':['Salesforce', 'CRM'], 'CGC':['Canopy', 'CGC'], 'IZEA':['IZEA'],
'CBAT':['CBAK', 'CBAT'], 'VXX':['vxx'], 'INTC':['Intel', 'intc'],
'SPCE':['virgin galactic', 'SPCE'], 'AAL':['American Airlines', 'AAL'],
'SPI':['SPI'], 'AREC':['AREC'], 'NFLX':['Netflix', 'NFLX'], 'QQQ':['Invesco', 'QQQ'],
'UONE':['UONE'], 'BLNK':['Blink', 'BLNK'],
'HD':['Home Depot', 'HD'], 'DIS':['disney', 'DIS'], 'MO':['Altria', 'MO'],
'TLRY':['Tilray', 'TLRY'], 'ADTX':['Aditxt', 'ADTX'],
'LMNL':['Liminal', 'LMNL'], 'JMIA':['Jumia', 'JMIA'], 'EXAS':['Exact Sciences', 'EXAS'],
'SOLO':['Electrameccanica', 'SOLO'], 'XERS':['Xeris', 'XERS'], 'DPW':['Deutsche Post', 'DPW'],
'GILD':['GILD', 'Gilead'], 'VLDR':['Velodyne', 'VLDR'], 'BTC': ['Bitcoin', 'BTC', 'BTC-USD']}

In [7]:
# Comments
more_terms = {k: tuple(v) for k, v in more_terms.items()}

In [8]:
# Comments
additional_terms = {}
for k, vals in more_terms.items():
    for v in vals:
        additional_terms[v.lower()] = k

In [9]:
# Comments
def list_to_symbols(list_obj, set=additional_terms):
    symbols = []
    for l in list_obj:
        if l in set.keys():
            symbols.append(additional_terms[l])
    return symbols

In [870]:
# Comments
full_comments = full_comments.assign(more_symbols = lambda x: x.cc_comment.apply(nltk.tokenize.word_tokenize).apply(lambda z: list_to_symbols(z)).apply(str))

In [871]:
# Comments
full_comments = full_comments.assign(symbols = lambda x: x.symbols.apply(lambda s: re.sub(r'[\[\]\'\']', "", s)).apply(str.upper).apply(str.strip))\
    .assign(more_symbols = lambda x: x.more_symbols.apply(lambda s: re.sub(r'[\[\]\'\']', "", s)))\
        .assign(last_symbols = lambda x: x.symbols.apply(str.upper) + ',' + x.more_symbols)\
            .assign(last_symbols = lambda x: x.last_symbols.apply(lambda sym: set(sym.strip().split(',')) ))

In [872]:
# Comments
# Additional symbols. Full list
full_comments = full_comments.assign(symbols = lambda x: x.last_symbols.apply(lambda s: ",".join([c.strip() for c in s if c != ''])))

In [873]:
# Comments
full_comments_final = full_comments.drop(['more_symbols', 'last_symbols', 'cc_comment'], axis=1)

In [914]:
# full_comments_final[lambda x: x.symbols.str.contains(',')]

In [875]:
# Comments
with sql.connect('../data/interim/companies.db') as con:
    cs = pd.read_sql(f"SELECT DATE(timestamp) date, channel, symbols, pos_sent, neu_sent, neg_sent, comp_sent from symbol_comments ORDER BY timestamp", parse_dates={'date': '%Y-%m-%d'}, con=con)

In [876]:
# Comments
# sentiment and symbols
cs

Unnamed: 0,date,channel,symbols,pos_sent,neu_sent,neg_sent,comp_sent
0,2019-08-08,wetlqd-ideas,['AMD'],0.293,0.707,0.000,0.4404
1,2019-08-13,wetlqd-ideas,['MSFT'],0.182,0.818,0.000,0.4404
2,2019-08-13,wetlqd-ideas,['MSFT'],0.294,0.706,0.000,0.3612
3,2019-08-14,wetlqd-ideas,"['ON', 'MA', 'ING']",0.000,0.597,0.403,-0.4019
4,2019-08-14,wetlqd-ideas,['CRM'],0.000,1.000,0.000,0.0000
...,...,...,...,...,...,...,...
3389,2020-12-08,option-trading,['OOOO'],0.000,1.000,0.000,0.0000
3390,2020-12-08,option-trading,['ING'],0.900,0.100,0.000,0.8402
3391,2020-12-08,trading,['FSLY'],0.000,1.000,0.000,0.0000
3392,2020-12-08,trading,['IT'],0.000,1.000,0.000,0.0000


In [885]:
# Comments
# Hash Users IDs to anon them 
id_codes = {k: hash(k) for k in full_comments_final.id.unique()}

In [891]:
# Comments
# Hash Users IDs to anon them 
id_regex = r"|".join([key for key in id_codes.keys()])

In [907]:
# Comments
def new_string(string):
    regs = re.findall(id_regex, string)
    if regs != []:
        for r in regs:
            string = string.replace(r, str(hash(r)))
    return string

In [917]:
# Comments
# Replace Mentions Column and Chat Content to Conform to new Codes
full_comments_final = full_comments.assign(content = lambda x: x.content.apply(lambda s: new_string(s))).assign(mentions=lambda x: x.mentions.apply(lambda s: new_string(s)))\
    .assign(id=lambda x: x.id.apply(lambda s: new_string(s)))

In [921]:
# Comments
full_comments_final = full_comments_final.drop(['more_symbols', 'last_symbols'], axis=1)

In [923]:
# Comments
full_comments_final = full_comments_final.drop(['cc_comment'], axis=1)

In [925]:
# Comments
full_comments_final = full_comments_final.drop(['pk'], axis=1)

In [926]:
# Comments
full_comments_final

Unnamed: 0,date,timestamp,channel,id,server,content,isBot,mentions,emojis,links,chat_emotes,pos_sent,neg_sent,neu_sent,comp_sent,symbols
0,2019-08-02,2019-08-02 01:10:52.637000+00:00,wetlqd-ideas,1767074690797870795,Misc,Play account today,0,,,,,0.545,0.0,0.455,0.3400,
1,2019-08-02,2019-08-02 01:13:47.923000+00:00,wetlqd-ideas,1767074690797870795,Misc,@everyone today’s total. This doesn’t calculat...,0,everyone,,,,0.153,0.0,0.847,0.4404,
2,2019-08-02,2019-08-02 01:14:35.798000+00:00,wetlqd-ideas,9165658630182813742,Misc,Crack,0,,,,,0.000,0.0,1.000,0.0000,
3,2019-08-02,2019-08-02 01:14:39.682000+00:00,wetlqd-ideas,9165658630182813742,Misc,Big crack,0,,,,,0.000,0.0,1.000,0.0000,
4,2019-08-02,2019-08-02 01:14:44.367000+00:00,wetlqd-ideas,9165658630182813742,Misc,Heroin,0,,,,,0.000,1.0,0.000,-0.4939,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31929,2020-12-08,2020-12-08 20:38:45.351000+00:00,trading,-4958649595996145486,LSV,<@!7285578842926376802> all of my shorter vide...,0,7285578842926376802,,,,0.082,0.0,0.918,0.3400,
31930,2020-12-08,2020-12-08 20:39:17.246000+00:00,trading,-3368183788766211362,LSV,<@-4958649595996145486> have you done a video ...,0,-4958649595996145486,,,,0.182,0.0,0.818,0.2500,
31931,2020-12-08,2020-12-08 20:39:32.831000+00:00,trading,-4958649595996145486,LSV,"<@!-3368183788766211362> nope, not yet",0,-3368183788766211362,,,,0.000,0.0,1.000,0.0000,
31932,2020-12-08,2020-12-08 20:39:38.597000+00:00,trading,-4958649595996145486,LSV,they're the airline wifi company right?,0,,,,,0.000,0.0,1.000,0.0000,


In [927]:
# Comments
full_comments_final.loc[:, ['date', 'channel', 'symbols', 'pos_sent', 'neg_sent', 'neu_sent', 'comp_sent']]

Unnamed: 0,date,channel,symbols,pos_sent,neg_sent,neu_sent,comp_sent
0,2019-08-02,wetlqd-ideas,,0.545,0.0,0.455,0.3400
1,2019-08-02,wetlqd-ideas,,0.153,0.0,0.847,0.4404
2,2019-08-02,wetlqd-ideas,,0.000,0.0,1.000,0.0000
3,2019-08-02,wetlqd-ideas,,0.000,0.0,1.000,0.0000
4,2019-08-02,wetlqd-ideas,,0.000,1.0,0.000,-0.4939
...,...,...,...,...,...,...,...
31929,2020-12-08,trading,,0.082,0.0,0.918,0.3400
31930,2020-12-08,trading,,0.182,0.0,0.818,0.2500
31931,2020-12-08,trading,,0.000,0.0,1.000,0.0000
31932,2020-12-08,trading,,0.000,0.0,1.000,0.0000


In [930]:
# Comments
with sql.connect('../data/interim/companies.db') as con:
    full_comments_final.loc[:, ['date', 'channel', 'symbols', 'pos_sent', 'neg_sent', 'neu_sent', 'comp_sent']][lambda x: x.symbols != '']\
        .to_sql('symbol_comments', con=con, index_label='pk', if_exists='replace')

In [932]:
# Comments
with sql.connect('../data/interim/discord/discord.db') as con:
    full_comments_final.to_sql('comments', con=con, index_label='pk', if_exists='replace')

In [934]:
# News Articles
# NLP
# Average Combined Sentiment
wordy_df_news.comp_sent.mean()

0.5231923759791198

In [None]:
# crpyt.db = ["pk", "article", 'headline', "comments", "date", 'link', 'symbol', 'publisher', 'pos_sent', 'neg_sent', 'neu_sent', 'comp_sent']
# in bytes
# articles table = ['pk', 'id', 'title', 'link', 'date', 'publisher', 'symbol', 'article', 'pos_sent', neg_sent', 'neu_sent', 'comp_sent']
# news_sentiment of encrypted articles = ['index', 'symbol', 'publisher', 'pos_sent', 'neu_sent', 'neg_sent', 'comp_sent']

In [938]:
# News Articles
# NLP
wordy_df_news = wordy_df_news.drop('pk', axis=1)

In [941]:
# News Articles
wordy_df_news = wordy_df_news.assign(word_obj = lambda x: x.word_obj.apply(str))

In [953]:
# News Articles
with sql.connect('../data/raw/temp2.db') as con:
    wordy_df_news.to_sql('full_articles', con=con, index=True, index_label='pk', if_exists='fail')

In [962]:
# All the Data
# Revised Tables
with sql.connect('../data/interim/companies.db') as con:
    a_articles = pd.read_sql('SELECT * FROM articles', con=con) # 
    a_info  = pd.read_sql('SELECT * FROM info', con=con) ##
    a_news_sentiment = pd.read_sql('SELECT * FROM news_sentiment', con=con) #
    a_symbol_comments = pd.read_sql('SELECT * FROM symbol_comments', con=con) ##
    a_daily = pd.read_sql('SELECT * FROM daily', con=con) ##
    a_mentions = pd.read_sql('SELECT * FROM mentions', con=con)
    a_recommendations = pd.read_sql('SELECT * FROM recommendations', con=con)

In [986]:
# Financial Data
a_daily = a_daily.assign(Date = lambda x: x.Date.apply(pd.to_datetime)).drop('pk', axis=1)

In [990]:
# Analyst Recommendations
a_recommendations = a_recommendations.assign(Date = lambda x: x.Date.apply(pd.to_datetime)).drop('pk', axis=1)

In [993]:
# Comments
a_mentions

Unnamed: 0,pk,symbol,counts
0,0,SPY,167
1,1,TSLA,166
2,2,NIO,151
3,3,AMD,133
4,4,AAPL,114
...,...,...,...
2329,2329,NDS,1
2330,2330,FRIE,1
2331,2331,SAID,1
2332,2332,BEIN,1


In [994]:
# Comments
# Company Counts
mentions.merge(new_mentions.assign(symbol=lambda x: x.symbol.apply(str.upper)), 'left', on='symbol').info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72 entries, 0 to 71
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   symbol    72 non-null     object
 1   counts_x  72 non-null     int64 
 2   counts_y  72 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 2.2+ KB


In [1000]:
# Comments
# Company Counts
new_mentions = new_mentions.assign(symbol=lambda x: x.symbol.apply(str.upper))

In [1025]:
# News Articles
wordy_df_news.assign(article=lambda x: x.article.apply(bytes, encoding='utf-8').apply(cryptor.encrypt).apply(cryptor.decrypt).apply(str))[lambda x: x.article.str.contains('Mr. IPO')]

Unnamed: 0,article,headline,comments,date,link,symbol,publisher,pos_sent,neg_sent,neu_sent,comp_sent,sa,word_obj
4084,b'A University of Florida economist known as ...,Mr. IPO says biopharma IPOs prices arent r...,14,2021-01-27,https://seekingalpha.com/news/3655139-mr-ipo-s...,FB,Seeking Alpha,0.102,0.011,0.887,0.9906,1,"{'tech': 7, 'ritter': 7, 'ipo': 6, 'nasdaq': 6..."
7070,b'A University of Florida economist known as \...,âMr. IPOâ says biopharma IPOs prices arenâ...,14,2021-01-27,https://seekingalpha.com/news/3655139-mr-ipo-s...,AAPL,Seeking Alpha,0.088,0.011,0.901,0.9874,1,"{'tech': 7, 'â\x80\x9d': 6, 'nasdaq': 6, 'ipos..."
10672,b'A University of Florida economist known as ...,Mr. IPO says biopharma IPOs prices arent r...,14,2021-01-27,https://seekingalpha.com/news/3655139-mr-ipo-s...,MSFT,Seeking Alpha,0.102,0.011,0.887,0.9906,1,"{'tech': 7, 'ritter': 7, 'ipo': 6, 'nasdaq': 6..."
13186,b'A University of Florida economist known as ...,Mr. IPO says biopharma IPOs prices arent r...,14,2021-01-27,https://seekingalpha.com/news/3655139-mr-ipo-s...,ZM,Seeking Alpha,0.102,0.011,0.887,0.9906,1,"{'tech': 7, 'ritter': 7, 'ipo': 6, 'nasdaq': 6..."


In [1027]:
# Save New Tables
with sql.connect('../data/interim/temp_c.db') as con:
    a_info.drop(['pk', 'holdings'], axis=1).to_sql('info', index=True, index_label='pk', con=con)
    wordy_df_news.loc[:, ['pos_sent', 'neu_sent', 'neg_sent', 'comp_sent']].to_sql('news_sentiment', con=con, index=True, index_label='pk')
    wordy_df_news.drop(['pos_sent', 'neg_sent', 'neu_sent', 'comp_sent', 'article'], axis=1).to_sql('articles', con=con, index=True, index_label='pk')
    full_comments_final.loc[:, ['date', 'channel', 'server','symbols', 'pos_sent', 'neg_sent', 'neu_sent', 'comp_sent']][lambda x: x.symbols != ''].\
        reset_index().rename(columns={'index': 'comment_pk'}).to_sql('symbol_comments', con=con, index=True, index_label='pk')
    a_daily.to_sql('daily', con=con, index=True, index_label='pk')
    a_recommendations.to_sql('recommendations', con=con, index=True, index_label='pk')
    new_mentions.to_sql('mentions', con=con, index=True, index_label='pk')

  method=method,


In [2]:
# News Articles
with sql.connect('../data/processed/temp_c.db') as con:
    wordy_df_news = pd.read_sql('SELECT * FROM articles', con=con)

In [7]:
# News Articles
wordy_df_news = wordy_df_news.assign(comments = lambda x: x.comments.apply(float)).fillna(0).assign(comments=lambda x: x.comments.apply(int))

In [9]:
# News Articles
with sql.connect('../data/processed/temp_c.db') as con:
    wordy_df_news.to_sql('articles', con=con, index=False, if_exists='replace')

In [16]:
# News Articles
wordy_df_news = wordy_df_news.assign(date = lambda x: x.date.apply(pd.to_datetime))

In [25]:
# News Articles
wordy_df_news[lambda x: ((x.date > dt.datetime(2020, 5, 31))&(x.date <= dt.datetime(2021, 6, 30)))].groupby([pd.Grouper(key='date', freq='1M'), 'symbol']).agg({'comments': ['count', 'sum', 'mean']}).sort_index(level=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,comments,comments,comments
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum,mean
date,symbol,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2020-06-30,AAL,63,1821,28.904762
2020-07-31,AAL,79,1263,15.987342
2020-08-31,AAL,64,1494,23.343750
2020-09-30,AAL,59,726,12.305085
2020-10-31,AAL,78,2067,26.500000
...,...,...,...,...
2021-02-28,ZM,12,121,10.083333
2021-03-31,ZM,35,803,22.942857
2021-04-30,ZM,12,77,6.416667
2021-05-31,ZM,11,707,64.272727


In [57]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import json

In [22]:
# Comments
def clean_comment(comment):
    userIdCatcher = r"(<@(!)?\d+>|@everyone)"
    discord_emote_regex = r"<.+>"
    url_regex = r"https*\:.+|www\..+"
    punc_regex = r"[\’\'\'\"\"\“\”!\?@#$%&\(\)\*,-.\\\{\}+~\/:;<>\[\]^`|=_’]"
    contract_regex = r"[\'\’\’]"
    comment = re.sub("|".join([x for x in [userIdCatcher, discord_emote_regex, url_regex]]), "", comment)
    comment = re.sub(contract_regex, "", comment)
    comment = re.sub(punc_regex, " ", comment)
    comment = comment.replace("\n", " ").replace("'s", "")
    return " " + comment + " "


In [None]:
# Comments
# NLP
def wordCount(comment_array, pos):
    comment_dictionary = {}
    stop_words = set(stopwords.words('english'))
    stop_words = stop_words.union({'said', 'us', 'also', 'inc', 'could', 'word', 'b', 'q', 'https', 'com', 'lol', 'k', 'ta', 'thanks'})
    for comment in comment_array:
        punc_regex = r"[\，\’\'\'\"\"\“\”!\?@#$%&\(\)\*,-.\\\{\}+~\/:;<>\[\]^`|=_]"
        reg_bad_quotes = re.compile(u"[\x94\x93\x92\x91]")
        comment = comment.lower()
        comment = re.sub(punc_regex, " ", comment)
        comment = re.sub(reg_bad_quotes, " ", comment)
        comment = re.sub(r"[0-9]", "", comment)
        words = word_tokenize(comment)
        tags = pos_tag(words)
        for w, po in tags:
            if (w in comment_dictionary.keys()) & (w not in stop_words) & (po.startswith(pos)):
                comment_dictionary[w] += 1
            elif (w not in stop_words) & (pos.startswith(pos)):
                comment_dictionary[w] = 1
    
    comments_words = dict(sorted(comment_dictionary.items(), key=lambda x: x[1], reverse=True))
    # comment_list.append(comments_words)

    # df = df.assign(word_obj = pd.Series(comment_list))
    return pd.Series(comments_words)

In [183]:
# Comments
con = sql.connect('../data/processed/discord.db')
comments = pd.read_sql('select content, DATE(timestamp) date, comp_sent from comments', con=con, parse_dates={'timestamp': '%Y-%m-%d'})
con.close()

In [23]:
# Comments
# Gathering the Nouns and Verbs 
comment_nouns = wordCount(comments.content, 'N')

In [24]:
# Comments
comment_verbs = wordCount(comments.content, 'V')

In [25]:
# Comments
comment_nouns.head(30)

day          1024
today         782
week          712
time          652
market        639
stock         582
tomorrow      550
money         525
stocks        445
shares        374
news          373
earnings      351
people        335
everyone      332
options       324
anyone        311
term          303
gains         301
price         299
lot           298
days          282
way           261
profits       255
man           251
futures       208
morning       207
cash          195
year          191
something     181
company       178
dtype: int64

In [26]:
# Comments
comment_verbs.head(30)

going      618
sold       403
looking    400
made       282
getting    219
took       194
went       165
goes       164
trimmed    155
trying     151
done       147
reached    135
know       126
seems      121
using      108
called      92
makes       86
seen        83
saying      79
posted      78
keeping     75
gets        74
talking     72
seeing      67
based       65
wanted      58
make        57
came        57
wants       54
says        53
dtype: int64

In [31]:
# Comments
# Breakdown of Tokens by Comment Overall Sentiment 
comment_nouns_pos = wordCount(comments[lambda x: x.comp_sent > 0].content, 'N')
comment_verbs_pos = wordCount(comments[lambda x: x.comp_sent > 0].content, 'V')

In [41]:
# Comments
# Breakdown of Tokens by Negative Comments
comment_nouns_neg = wordCount(comments[lambda x: x.comp_sent < 0].content, 'N')
comment_verbs_neg = wordCount(comments[lambda x: x.comp_sent < 0].content, 'V')

In [67]:
# Comments
comment_verbs_pos.head(50)

going       252
looking     230
know        197
sold        166
made        153
reached     132
took        122
holding     109
getting      85
got          83
seems        77
trying       73
goes         72
went         71
done         66
using        55
make         50
trimmed      50
come         47
started      44
held         42
seeing       41
waiting      40
remember     39
called       38
makes        38
used         37
based        37
talking      35
want         33
saying       33
posted       32
playing      32
gets         31
played       31
learned      30
wanted       29
moving       29
found        28
came         27
says         26
looked       26
running      25
watching     24
gave         24
keeping      24
seen         24
alerted      23
take         23
wants        23
dtype: int64

In [68]:
# Comments
comment_nouns_neg.head(50)

day           148
market        136
today         135
time          122
money         111
people        111
week          101
stock          91
tomorrow       83
loss           78
stocks         72
news           69
lot            60
way            58
term           58
price          54
earnings       52
losses         45
options        41
man            39
days           38
morning        36
something      36
idea           35
cap            35
shares         34
everyone       33
bit            32
risk           32
year           31
cash           31
vix            31
thing          30
index          30
company        29
weeks          27
position       27
change         27
futures        27
election       27
gains          26
anything       26
entry          25
anyone         25
apple          24
reason         23
investment     23
someone        23
volume         22
strike         22
dtype: int64

In [113]:
# Analyst Recommendations 
con = sql.connect('../data/processed/temp_c.db')
recommendations_sent = pd.read_sql(f"SELECT DATE(Date) date, symbol, Firm, Action, new_grade, prev_grade FROM recommendations", con=con, parse_dates={'month': '%Y-%m-%d'})
port = pd.read_sql("SELECT DATE(Date) date, Open, Close, Volatility, symbol FROM daily WHERE symbol NOT IN ('IT', 'PT', 'ON', 'ING', 'VPU', 'VNQ', 'VAW', 'VGT', 'VIS', 'VHT', 'VFH', 'VDE', 'VDC', 'VCR', 'VOX')", con=con, parse_dates={'date': '%Y-%m-%d'})
con.close()

In [114]:
# Analyst Ratings
ratings_parse = {'Very Bearish': 1, 'Bearish': 2, 'Neutral': 3, 'Bullish': 4, 'Very Bullish': 5}

In [139]:
# Financial Data
# Calcualte Returns with Baseline Options (for S&P)
def calc_r(series, data, base=False):
    if base:
        symb = 'SPY'
    else:
        symb = series['symbol']
    d = data[lambda x: ((x.date >= series['date']) & (x.date <= dt.datetime(2022, 1, 31))&(x.symbol==symb))]
    if d.empty:
        return 0
    else:
        ret = (d.iloc[-1, 2] - d.iloc[0, 2]) / d.iloc[0, 2]
        return ret

In [116]:
# Analyst Ratings
analyst_returns_Series = recommendations_sent.loc[:, ['date', 'symbol']].apply(calc_r, data=port, axis=1)

In [117]:
# Analyst Ratings
analyst_returns_Series.name = "Returns_Post_Ratings"

In [118]:
# Analyst Ratings
recommendations_sent = recommendations_sent.merge(analyst_returns_Series, left_index=True, right_index=True).assign(new_grade=lambda x: x.new_grade.apply(lambda d: ratings_parse[d])).assign(prev_grade=lambda x: x.prev_grade.apply(lambda d: ratings_parse[d]))

In [123]:
# Analyst Ratings
# Financial Data
spy_returns_Series = recommendations_sent.loc[:, ['date', 'symbol']].apply(calc_r, data=port, base=True,axis=1)
spy_returns_Series.name = 'Market_Returns'

In [124]:
# Analyst Ratings

recommendations_sent = recommendations_sent.merge(spy_returns_Series, left_index=True, right_index=True)

In [125]:
# Analyst Ratings

recommendations_sent = recommendations_sent.assign(date=lambda x: x.date.apply(pd.to_datetime))
recommendations_sent.merge(analyst_returns_Series, left_index=True, right_index=True)
recs = recommendations_sent[lambda x: x.date >= dt.datetime(2017, 1, 1)]

In [128]:
# Analyst Ratings
# Creating a Valuation Mechanism to Evaluate Ratings vs. investing in the index fund on the same day of the ratings
# gives credit for high ratings matching higher returns since a given rating date
recs = recs.assign(g = lambda x: x.new_grade * x.Returns_Post_Ratings).assign(alpha = lambda x: x.Returns_Post_Ratings-x.Market_Returns)

In [136]:
# Analyst Ratings
recs

Unnamed: 0,date,symbol,Firm,Action,new_grade,prev_grade,Returns_Post_Ratings,Market_Returns,g,alpha
127,2017-01-19,TSLA,Morgan Stanley,up,4,3,13.474688,0.784487,53.898753,12.690201
128,2017-02-23,TSLA,RBC Capital,main,3,3,12.783155,0.705014,38.349464,12.078141
129,2017-02-27,TSLA,Goldman Sachs,down,1,3,13.329489,0.700196,13.329489,12.629293
130,2017-03-08,TSLA,Bernstein,init,3,3,13.292340,0.704149,39.877019,12.588191
131,2017-03-20,TSLA,Deutsche Bank,main,3,3,12.471098,0.695262,37.413294,11.775836
...,...,...,...,...,...,...,...,...,...,...
9363,2021-05-11,VLDR,Needham,main,5,5,0.000000,0.000000,0.000000,0.000000
9364,2021-07-20,VLDR,Baird,down,3,4,0.000000,0.000000,0.000000,0.000000
9365,2021-08-06,VLDR,Needham,main,5,5,0.000000,0.000000,0.000000,0.000000
9366,2021-11-05,VLDR,Needham,main,5,5,0.000000,0.000000,0.000000,0.000000


In [140]:
# Analyst Ratings
analyst_returns_Series_21 = recommendations_sent.loc[:, ['date', 'symbol']].apply(calc_r, data=port, axis=1)

In [141]:
# Analyst Ratings
analyst_returns_Series_21.name = 'Returns_Jan_2022'

In [143]:
# Analyst Ratings
# Financial Data
spy_returns_Series_22 = recommendations_sent.loc[:, ['date', 'symbol']].apply(calc_r, data=port, base=True, axis=1)
spy_returns_Series_22.name = 'Market_Returns_22'

In [144]:
# Analyst Ratings
# Merge it All Together
recs = recs.merge(analyst_returns_Series_21, left_index=True, right_index=True)\
    .merge(spy_returns_Series_22, left_index=True, right_index=True).assign(g2 = lambda x: x.new_grade * x.Returns_Jan_2022).assign(alpha2 = lambda x: x.Returns_Jan_2022-x.Market_Returns_22)

In [185]:
# Comments
# NLP
nouns = comment_nouns_pos.reset_index().rename(columns={0: 'positive_count', 'index': 'word'}).merge(comment_nouns_neg.reset_index().rename(columns={0: 'negative_count', 'index': 'word'}), on='word', how='outer').fillna(0)

In [186]:
# Comments
# NLP
verbs = comment_verbs_pos.reset_index().rename(columns={0: 'positive_count', 'index': 'word'}).merge(comment_verbs_neg.reset_index().rename(columns={0: 'negative_count', 'index': 'word'}), on='word', how='outer').fillna(0)

In [187]:
# Comments
# NLP
tokens = pd.concat([nouns.assign(total=lambda x: x.sum(axis=1)).assign(type=lambda x: 'noun'), verbs.assign(total=lambda x: x.sum(axis=1)).assign(type=lambda x: 'verb')], axis=0, ignore_index=True)

In [188]:
# Comments
# NLP
con=sql.connect('../data/processed/discord.db')
tokens.to_sql('tokens', index=True, index_label='pk', con=con, if_exists='replace')
con.close()

In [176]:
# Analyst Ratings
con=sql.connect('../data/processed/temp_c.db')
recs.to_sql('recsEvaluation', index=True, index_label='pk', con=con)
con.close()

In [180]:
# Comments
# NLP
verbs.sort_values('negative_count', ascending=False).head(20)

Unnamed: 0,word,positive_count,negative_count
1102,day,1.0,148.0
1113,market,1.0,136.0
1086,today,1.0,135.0
1172,time,1.0,122.0
1297,people,1.0,111.0
1131,money,1.0,111.0
1099,week,1.0,101.0
1262,stock,1.0,91.0
1092,tomorrow,1.0,83.0
1415,loss,1.0,78.0


In [178]:
# Comments
# NLP
# Tokens Table
tokens

Unnamed: 0,word,positive_count,negative_count,total,type
0,day,511.0,148.0,659.0,noun
1,today,387.0,135.0,522.0,noun
2,week,363.0,101.0,464.0,noun
3,shares,333.0,34.0,367.0,noun
4,time,326.0,122.0,448.0,noun
...,...,...,...,...,...
21155,martha,0.0,1.0,1.0,verb
21156,chest,0.0,1.0,1.0,verb
21157,hoe,0.0,1.0,1.0,verb
21158,wfh,0.0,1.0,1.0,verb


In [184]:
# Comments
# NLP
verbs = comment_verbs_pos.reset_index().rename(columns={0: 'positive_count', 'index': 'word'})\
    .merge(comment_verbs_neg.reset_index().rename(columns={0: 'negative_count', 'index': 'word'}), on='word', how='outer').fillna(0)

In [3]:
# Comments
with sql.connect('../data/processed/discord.db') as con:
    mentions = pd.read_sql('SELECT * FROM comments', con=con)

In [15]:
# Comments
mentions = mentions.assign(cc_comment=lambda x: x.content.apply(clean_comment).apply(str.lower))

In [16]:
# Comments
full_comments = mentions.assign(more_symbols = lambda x: x.cc_comment.apply(nltk.tokenize.word_tokenize).apply(lambda z: list_to_symbols(z)).apply(str))

In [24]:
# Comments
# Add on BTC Comment Mentions for Previously Created Tables
full_comments = full_comments[lambda x: x.more_symbols.str.contains('BTC')].loc[:, ['pk', 'date', 'channel', 'server', 'more_symbols', 'pos_sent', 'neg_sent', 'neu_sent', 'comp_sent']]\
    .assign(symbols= lambda x: x.more_symbols.apply(lambda s: re.sub(r'[\[\]\'\']', "", s)).apply(str.upper).apply(str.strip)).drop(['more_symbols'], axis=1)

In [40]:
# Comments
full_comments = full_comments.assign(symbols=lambda x: x.symbols.apply(str.split, sep=',').apply(lambda v: [a.strip() for a in v]).apply(set).apply(lambda j: ','.join(j)))

In [43]:
# Comments
full_comments = full_comments.assign(date=lambda x: x.date.apply(pd.to_datetime)).rename(columns={'pk': 'comment_pk'})

In [67]:
# Comments
fcc = full_comments.copy(deep=True)

In [113]:
# Comments
fcc.index = pd.RangeIndex(start=3852, stop=3852+134, step=1)

In [114]:
# Comments
full_comments = fcc.loc[:, ['comment_pk', 'date', 'channel', 'server', 'symbols', 'pos_sent', 'neg_sent', 'neu_sent','comp_sent']]

In [115]:
# Comments
btc_counts = {}
for string in full_comments.symbols:
    cs = string.split(',')
    for c in cs:
        if c in btc_counts.keys():
            btc_counts[c]+= 1
        else:
            btc_counts[c] =1

In [116]:
# Comments
btc_counts

{'BTC-USD': 134, 'MARA': 8, 'SQ': 3, 'AMD': 1, 'SPY': 2, 'AAPL': 1, 'HEAR': 1}

In [117]:
# Comments
# Need to Add to Mention Count Tables and Symbol Comments Tables; Do not need to re-add other tickers b/c they would be counted twice
con=sql.connect('../data/processed/temp_c.db')
m = pd.read_sql('SELECT * FROM mentions', con=con)
symc = pd.read_sql('SELECT * FROM symbol_comments', con=con)
con.close()

In [118]:
# Comments
m = pd.concat([m.drop('pk', axis=1), pd.DataFrame({'symbol': ['BTC-USD'], 'counts': [134]})], axis=0, ignore_index=True).sort_values('counts', ascending=False).reset_index(drop=True)

In [119]:
# Comments
symc = pd.concat([symc.drop('pk', axis=1), full_comments], axis=0, ignore_index=False)

In [124]:
# Comments
symc=symc.assign(date=lambda x: x.date.apply(pd.to_datetime))

In [125]:
# Save to DB
con=sql.connect('../data/processed/temp_c.db')
m.to_sql('mentions', index_label='pk', index=True, if_exists='replace', con=con)
symc.to_sql('symbol_comments', index_label='pk', index=True, if_exists='replace', con=con)
con.close()