In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import sys
sys.path.append('../../..')

In [34]:
import os
import pickle

import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer

from util.file_util import StockTwitsFileReader
from util.stockdata_helper import get_nday_returns_for_ticker
from nlp.twokenize import normalizeTextForTagger, tokenize
from nlp.text_processor import (
    token_is_cash_tag, token_is_punct, token_matches_ticker, twit_tokenize
)

In [5]:
# Date bounds passed to get_twit_and_return_df for every ticker when the
# twit and return frames are loaded.
start_date = '2017-01-01'
end_date = '2019-08-10'

### Index Twits by Date / Label

In [6]:
# Directory holding the per-ticker pickle caches used by
# collect_raw_twits_for_ticker. Set the STOCKTWITS_DATA_DIR environment
# variable to point the notebook at another location; the original
# machine-specific path is kept as the default so existing caches still resolve.
DATA_DIR = os.environ.get(
    'STOCKTWITS_DATA_DIR',
    '/Users/seung-jae_bang/Personal/Research/Stock_Sentiment/data/Stocktwits/processed/daily_twit_text/',
)

In [7]:
# Column renames applied to the raw twit frame: the
# 'entities.sentiment.basic' column becomes 'sentiment_label'.
RENAME_MAP = {
    'entities.sentiment.basic': 'sentiment_label',
}
# Sentiment labels to keep; collect_raw_twits_for_ticker drops every row
# whose 'sentiment_label' is not in this list.
SENTIMENT_LABEL_LIST = [
    'Bullish',
    'Bearish',
]

def collect_raw_twits_for_ticker(ticker,
                                 start_date,
                                 end_date, 
                                 save=True):
    """Return the sentiment-labelled twits for ``ticker``, using a pickle cache.

    The cache file lives in ``DATA_DIR`` and is named
    ``<ticker>_<start_date>_<end_date>.pkl``. If it exists it is loaded and
    returned as-is. Otherwise the twits are read through
    ``StockTwitsFileReader``, columns are renamed with ``RENAME_MAP`` and only
    rows whose ``sentiment_label`` is in ``SENTIMENT_LABEL_LIST`` are kept.

    Args:
        ticker: Ticker symbol; used for the reader call and the cache name.
        start_date: Start of the range, forwarded to the reader.
        end_date: End of the range, forwarded to the reader.
        save: When True (default), write a freshly built frame to the cache.

    Returns:
        DataFrame of twits restricted to the configured sentiment labels.
    """
    file_path = os.path.join(DATA_DIR,
                             '{}_{}_{}.pkl'.format(ticker, start_date, end_date))

    # Cache hit: nothing to rebuild or save.
    if os.path.exists(file_path):
        return pd.read_pickle(file_path)

    reader = StockTwitsFileReader()
    raw_twit_df = reader.read_twit_file_in_range(ticker,
                                                 start_date,
                                                 end_date,
                                                 cols='default')
    processed_df = raw_twit_df.rename(columns=RENAME_MAP)
    has_known_label = processed_df['sentiment_label'].isin(SENTIMENT_LABEL_LIST)
    processed_df = processed_df[has_known_label].copy()

    if save:
        # to_pickle does not create missing parent directories, so make sure
        # the cache directory exists before the first write.
        cache_dir = os.path.dirname(file_path)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        print('Saving result in: {}'.format(file_path))
        processed_df.to_pickle(file_path)

    return processed_df

def get_twit_and_return_df(ticker, start_date, end_date):
    """Pair every labelled twit for ``ticker`` with a shifted return row.

    Twits come from ``collect_raw_twits_for_ticker`` and are indexed by the
    parsed ``date_est`` column. Returns come from
    ``get_nday_returns_for_ticker``, are shifted down by one row, and are then
    joined to the twits on the date index. Rows with any missing value after
    the join are dropped, and a constant ``ticker`` column is added.

    NOTE(review): ``shift(1)`` moves each return onto the following row, so a
    date is paired with the previous row's value. Later cells name the result
    "next_day"; confirm that ``get_nday_returns_for_ticker`` already yields
    forward-looking returns.

    Args:
        ticker: Ticker symbol.
        start_date: Start of the range for both twits and returns.
        end_date: End of the range for both twits and returns.

    Returns:
        DataFrame indexed by ``date`` holding ``date_est``, ``body``,
        ``sentiment_label``, the return column(s) and ``ticker``.
    """
    raw_twits = collect_raw_twits_for_ticker(ticker, start_date, end_date)
    twits_by_date = (
        raw_twits[['date_est', 'body', 'sentiment_label']]
        .assign(date=lambda frame: pd.to_datetime(frame['date_est']))
        .set_index('date')
    )

    shifted_returns = get_nday_returns_for_ticker(
        ticker, start_date=start_date, end_date=end_date
    ).shift(1)

    joined = twits_by_date.merge(
        shifted_returns, left_index=True, right_index=True
    ).dropna()
    joined['ticker'] = ticker

    return joined

In [8]:
# all tickers with large twit count
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if it comes from a trusted source.
# The context manager closes the file handle, which the bare open() call leaked.
with open('./ticker_at_least_10_median.pkl', 'rb') as ticker_file:
    all_tickers = pickle.load(ticker_file)

In [9]:
# Build one twit/return frame per ticker (with a progress bar), then stack
# them into a single frame.
twit_dfs = [
    get_twit_and_return_df(ticker, start_date, end_date)
    for ticker in tqdm(all_tickers)
]

all_twit_df = pd.concat(twit_dfs)

100%|██████████| 32/32 [00:20<00:00,  1.11it/s]


In [1]:
# twit_dict = {}

# for t in tqdm(all_tickers):
#     twit_dict[t] = collect_raw_twits_for_ticker(t,
#                                                 start_date,
#                                                 end_date)

In [17]:
# Bounds of the analysis window used to subset all_twit_df by its date index.
analysis_start = '2017-01-01'
analysis_end = '2017-06-30'

In [18]:
# Restrict the frame to the analysis window with an explicit boolean mask.
# all_twit_df is a concatenation of per-ticker frames, so its date index is
# not sorted; string-label slicing on a non-monotonic DatetimeIndex raises
# KeyError in pandas >= 2.0 when a bound is not present in the index.
# Both bounds are inclusive, matching the original label slice: the upper
# bound is written as "before the day after analysis_end".
window_start = pd.Timestamp(analysis_start)
window_end_exclusive = pd.Timestamp(analysis_end) + pd.Timedelta(days=1)
in_window = (
    (all_twit_df.index >= window_start)
    & (all_twit_df.index < window_end_exclusive)
)
all_twit_df = all_twit_df[in_window].copy()

In [19]:
all_twit_df.sample(5)

Unnamed: 0_level_0,date_est,body,sentiment_label,adjusted close return,ticker
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-05-03,2017-05-03,$FB up a dollar,Bullish,0.002099,FB
2017-02-03,2017-02-03,$WMT stick to the cheaper retail giants.,Bullish,0.007096,WMT
2017-05-01,2017-05-01,$AMD Met exp and raised guidance; storm should...,Bullish,-0.023495,AMD
2017-04-10,2017-04-10,$GS,Bearish,-0.003324,GS
2017-03-10,2017-03-10,$AMD Here&#39;s a bit of historical data for t...,Bullish,0.008321,AMD


In [25]:
def get_tokenized_corpus(twit_df):
    """Tokenize every twit in ``twit_df`` and return the results as a list.

    Each ``body`` is passed to ``twit_tokenize`` together with the ``ticker``
    of the same row and ``normalize=True``. The output list has one entry per
    row, in row order; a progress bar is shown while tokenizing.

    Args:
        twit_df: DataFrame with ``body`` and ``ticker`` columns.

    Returns:
        List with the ``twit_tokenize`` result for each row.
    """
    bodies_and_tickers = zip(twit_df['body'], twit_df['ticker'])
    return [
        twit_tokenize(body, ticker=symbol, normalize=True)
        # total is given explicitly so the bar knows its length for a lazy zip.
        for body, symbol in tqdm(bodies_and_tickers, total=len(twit_df))
    ]

In [26]:
tokenized_corpus = get_tokenized_corpus(all_twit_df)

100%|██████████| 354386/354386 [00:53<00:00, 6633.74it/s]


In [66]:
def dummy(doc):
    """Identity function: return the document exactly as it was given."""
    return doc


# The corpus is passed in already tokenized, so the preprocessing and
# tokenizing hooks are both replaced by the identity function.
cv = CountVectorizer(
    preprocessor=dummy,
    tokenizer=dummy,
    min_df=50,           # drop n-grams found in fewer than 50 documents
    ngram_range=(1, 3),  # unigrams, bigrams and trigrams
)

In [67]:
# Learn the n-gram vocabulary from the tokenized corpus and build the
# document-term count matrix (one row per tokenized twit).
count_matrix = cv.fit_transform(tokenized_corpus)

In [68]:
count_matrix.shape, len(tokenized_corpus), all_twit_df.shape

((354386, 7268), 354386, (354386, 5))

In [69]:
# Row positions of twits whose merged return is strictly positive / strictly
# negative. Rows with a return of exactly zero fall into neither group.
merged_returns = all_twit_df['adjusted close return'].values
next_day_up_idxes = np.flatnonzero(merged_returns > 0)
next_day_down_idxes = np.flatnonzero(merged_returns < 0)

In [71]:
next_day_up_idxes.shape, next_day_down_idxes.shape

((193764,), (157131,))

In [72]:
# Mean count of every n-gram over the "up" rows and over the "down" rows,
# flattened from a 1 x vocabulary matrix to a 1-D array.
up_mean_counts = count_matrix[next_day_up_idxes].mean(axis=0)
down_mean_counts = count_matrix[next_day_down_idxes].mean(axis=0)
up_scores = np.asarray(up_mean_counts).squeeze()
down_scores = np.asarray(down_mean_counts).squeeze()

In [73]:
# Log ratio of the mean n-gram counts (up rows vs. down rows).
# An n-gram with a zero mean on one side gives +inf or -inf, and one with a
# zero mean on both sides gives NaN (0/0). Those results are expected here,
# so the NumPy floating-point warnings are silenced for this computation
# only; non-finite scores must be filtered out before ranking.
with np.errstate(divide='ignore', invalid='ignore'):
    log_odds = np.log(up_scores / down_scores)

  """Entry point for launching an IPython kernel.


In [74]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out() (available since 1.0). Use the new method
# when present and fall back to the old one on older installations.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# One row per n-gram: its score and its text.
log_odds_df = pd.DataFrame(
    {'score': log_odds,
     'vocab': feature_names})

In [75]:
# Keep only finite scores. Comparing against +/-inf alone lets NaN through
# (NaN != inf is True), and NaN appears when an n-gram has a zero mean count
# in both groups (0/0); np.isfinite rejects +inf, -inf and NaN.
mask = np.isfinite(log_odds_df['score'])

In [76]:
# Rows of log_odds_df selected by `mask`, copied into an independent frame.
log_odds_df_new = log_odds_df[mask].copy()

In [77]:
log_odds_df_new.sort_values('score', ascending=False)

Unnamed: 0,score,vocab
489,3.815791,175+ 190
488,2.896519,175+
1077,1.598728,baidu
496,1.515949,190
2504,1.502156,existing swing short
3622,1.476838,josh
741,1.388382,@amd
30,1.345069,#twtr
5129,1.339252,presentation
2290,1.306786,dropbox
