In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../../../')

In [3]:
import os
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import funcy

from config import STOCKTWITS_TICKER_LIST
from util.file_util import StockTwitsFileReader
from nlp.twokenize import normalizeTextForTagger, tokenize
from nlp.text_processor import (
    token_is_cash_tag, token_is_punct, token_matches_ticker, twit_tokenize
)

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [4]:
# Build the reader, then point data_dir at the 'processed/text_analysis'
# folder underneath the reader's root directory.
stock_twits_reader = StockTwitsFileReader()
data_dir = os.path.join(
    stock_twits_reader.root_dir, 'processed/text_analysis'
)

In [5]:
# Load the train / validation / test twit frames from their pickles in
# data_dir, in that order.
twit_train_df, twit_val_df, twit_test_df = [
    pd.read_pickle(os.path.join(data_dir, pickle_name))
    for pickle_name in ('train_twits.pkl', 'val_twits.pkl', 'test_twits.pkl')
]

In [6]:
twit_train_df.shape, twit_val_df.shape, twit_test_df.shape

((314276, 7), (78570, 7), (98212, 7))

In [7]:
twit_train_df.head()

Unnamed: 0,date_est,created_at_est,body,symbols,entities.sentiment.basic,links,ticker
53193,2018-12-06,2018-12-06 16:53:51,$MSFT .,"[{'id': 2735, 'symbol': 'MSFT', 'title': 'Micr...",Bullish,,MSFT
610784,2019-07-11,2019-07-11 10:19:04,$TSLA $400 is coming,"[{'id': 8660, 'symbol': 'TSLA', 'title': 'Tesl...",Bullish,,TSLA
571364,2019-06-05,2019-06-05 17:49:20,$TSLA they are just so beautiful looking on th...,"[{'id': 8660, 'symbol': 'TSLA', 'title': 'Tesl...",Bullish,,TSLA
8447,2019-05-13,2019-05-13 10:20:45,$UBER I guess Saudis long algos are activated ...,"[{'id': 11554, 'symbol': 'UBER', 'title': 'Ube...",Bullish,,UBER
70872,2018-05-03,2018-05-03 09:13:28,$TSLA wow poor bulls that have been buying the...,"[{'id': 8660, 'symbol': 'TSLA', 'title': 'Tesl...",Bearish,,TSLA


In [8]:
# Name of the twit-frame column that carries the sentiment label
# ('Bullish' / 'Bearish' in the sample rows shown above).
SENTIMENT_COLUMN = 'entities.sentiment.basic'

# Label -> binary target encoding: Bullish is the positive class (1),
# Bearish the negative class (0).
SENTIMENT_MAP = {'Bullish': 1, 'Bearish': 0}

In [9]:
def get_tokenized_corpus(twit_df):
    """Tokenize every twit in ``twit_df`` with ``twit_tokenize``.

    Each row's ``body`` is tokenized together with its ``ticker`` (passed as
    the ``ticker`` keyword) and with ``normalize=True``. A tqdm progress bar
    tracks the pass over the rows.

    Returns:
        A list holding one ``twit_tokenize`` result per row, in row order.
    """
    body_ticker_pairs = list(zip(twit_df['body'], twit_df['ticker']))
    return [
        twit_tokenize(body, ticker=symbol, normalize=True)
        for body, symbol in tqdm(body_ticker_pairs)
    ]

def extract_target_values(twit_df):
    """Return the sentiment column of ``twit_df`` encoded via ``SENTIMENT_MAP``.

    The result is the ``.values`` array of the mapped column. Labels that are
    not keys of ``SENTIMENT_MAP`` come out as NaN (pandas ``Series.map``
    behaviour with a dict).
    """
    sentiment_labels = twit_df[SENTIMENT_COLUMN]
    encoded_labels = sentiment_labels.map(SENTIMENT_MAP)
    return encoded_labels.values

In [10]:
# Tokenize each split in turn (train, then validation, then test).
tokenized_train, tokenized_val, tokenized_test = [
    get_tokenized_corpus(split_df)
    for split_df in (twit_train_df, twit_val_df, twit_test_df)
]

100%|██████████| 314276/314276 [00:51<00:00, 6065.89it/s]
100%|██████████| 78570/78570 [00:12<00:00, 6329.67it/s]
100%|██████████| 98212/98212 [00:15<00:00, 6174.09it/s]


In [11]:
# Encode the sentiment labels of each split as target arrays.
y_train, y_val, y_test = [
    extract_target_values(split_df)
    for split_df in (twit_train_df, twit_val_df, twit_test_df)
]

In [12]:
np.bincount(y_train), np.bincount(y_val), np.bincount(y_test)

(array([117634, 196642]), array([29491, 49079]), array([36803, 61409]))

In [13]:
len(tokenized_train), twit_train_df.shape, y_train.shape

(314276, (314276, 7), (314276,))

In [14]:
len(tokenized_val), twit_val_df.shape, y_val.shape

(78570, (78570, 7), (78570,))

In [15]:
len(tokenized_test), twit_test_df.shape, y_test.shape

(98212, (98212, 7), (98212,))

In [16]:
tokenized_train[:5]

[[],
 ['$400', 'coming'],
 ['beautiful',
  'looking',
  'on',
  'local',
  'roads',
  'city',
  'seems',
  'like',
  'every',
  'house',
  'one',
  'waiting',
  'turn',
  'on',
  'model',
  'standard',
  'range',
  'fit',
  'families',
  'needs',
  'price',
  'point'],
 ['guess', 'saudis', 'long', 'algos', 'activated', '😁', '❤', '️🙏🏼🌸🍻💰🎡🥂'],
 ['wow',
  'poor',
  'bulls',
  'buying',
  'dips',
  'calls',
  'er',
  'almost',
  'got',
  'suckered',
  'buy',
  'bs',
  '200',
  'easy',
  'july']]

In [17]:
def dummy(doc):
    """Identity function: hand ``doc`` back unchanged.

    Supplied to ``TfidfVectorizer`` as both ``tokenizer`` and
    ``preprocessor`` so that already-tokenized documents pass through as-is.
    """
    return doc

# The corpus is already tokenized (each document is a list of tokens), so the
# vectorizer's own preprocessing and tokenization are bypassed with `dummy`.
tfidf = TfidfVectorizer(
    tokenizer=dummy,
    preprocessor=dummy,
    # Neither option below has any effect once `preprocessor` / `tokenizer`
    # are overridden. Setting them explicitly documents that and avoids
    # scikit-learn's "token_pattern will not be used" UserWarning on newer
    # releases.
    lowercase=False,
    token_pattern=None,
    ngram_range=(1, 2),  # unigrams and bigrams
    min_df=5,            # drop terms found in fewer than 5 documents
)

In [18]:
# Fit the vectorizer (vocabulary and IDF weights) on the training split only.
X_train = tfidf.fit_transform(tokenized_train)
# Validation and test are only transformed with the already-fitted vectorizer,
# so all three matrices share the same feature columns.
X_val = tfidf.transform(tokenized_val)
X_test = tfidf.transform(tokenized_test)

In [19]:
X_train.shape, X_val.shape, X_test.shape

((314276, 80179), (78570, 80179), (98212, 80179))

In [20]:
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out`. Support both so this cell runs on old and
# new releases; the result is a plain list either way.
if hasattr(tfidf, 'get_feature_names_out'):
    vocab_list = list(tfidf.get_feature_names_out())
else:
    vocab_list = tfidf.get_feature_names()

In [21]:
# Term -> feature-column index lookup, plus its inverse (index -> term).
vocab_to_idx = {term: column for column, term in enumerate(vocab_list)}
idx_to_vocab = funcy.flip(vocab_to_idx)

In [22]:
def vocab_to_feature_df(feature_vec, vocab_list):
    """Pair each vocabulary term with its score and rank the pairs.

    Args:
        feature_vec: iterable of scores, aligned position-by-position with
            ``vocab_list``.
        vocab_list: iterable of vocabulary terms.

    Returns:
        DataFrame with columns ``vocab`` and ``score``, sorted by ``score``
        in descending order. The index keeps each term's original position.
    """
    vocab_score_pairs = list(zip(vocab_list, feature_vec))
    unsorted_df = pd.DataFrame(vocab_score_pairs, columns=['vocab', 'score'])
    return unsorted_df.sort_values('score', ascending=False)

In [23]:
tokenized_train[2]

['beautiful',
 'looking',
 'on',
 'local',
 'roads',
 'city',
 'seems',
 'like',
 'every',
 'house',
 'one',
 'waiting',
 'turn',
 'on',
 'model',
 'standard',
 'range',
 'fit',
 'families',
 'needs',
 'price',
 'point']

In [24]:
vocab_to_feature_df(X_train[2].toarray().flatten(), vocab_list).head(10)

Unnamed: 0,vocab,score
41483,looking on,0.279086
65411,standard range,0.263934
27442,families,0.246436
39928,like every,0.239116
55118,price point,0.234914
72944,turn on,0.232182
59170,roads,0.223286
28259,fit,0.221288
65409,standard,0.213022
50670,on model,0.209417


In [25]:
# Sum every term's TF-IDF weight over all training documents, then rank terms.
tfidf_score_df = vocab_to_feature_df(
    np.asarray(X_train.sum(axis=0)).ravel(),
    vocab_list,
)

In [26]:
tfidf_score_df.head(20)

Unnamed: 0,vocab,score
50084,on,3681.619268
73723,up,3264.188054
68596,tesla,2716.681859
62465,short,2525.798327
14901,buy,2473.448083
30789,go,2371.623439
71076,today,2371.468902
31169,going,2354.359237
48703,not,2306.600466
11130,bears,2211.345551


## Train model

### (a) Logistic Regression

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
from scipy.sparse import hstack, vstack
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [28]:
lr = LogisticRegression(C=1.0, solver='lbfgs')

In [29]:
lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [30]:
# y_val_pred = lr.predict(X_val)

# accuracy_score(y_val, y_val_pred), f1_score(y_val, y_val_pred)

# print(classification_report(y_val, y_val_pred))

In [31]:
y_test_pred = lr.predict(X_test)

In [32]:
accuracy_score(y_test, y_test_pred), f1_score(y_test, y_test_pred)

(0.796755997230481, 0.8468461556167663)

In [33]:
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.79      0.63      0.70     36803
           1       0.80      0.90      0.85     61409

    accuracy                           0.80     98212
   macro avg       0.79      0.76      0.77     98212
weighted avg       0.80      0.80      0.79     98212



In [34]:
coef_df = vocab_to_feature_df(lr.coef_.flatten(), vocab_list)

In [35]:
# coef_df.head(20)
coef_df.head(20)

Unnamed: 0,vocab,score
11130,bears,10.566667
62996,shorts,9.298578
15729,calls,5.862485
62919,shorties,4.702733
80042,🚀🚀🚀,4.697823
80029,🚀,4.696454
79624,🏎🚀,4.456905
29188,fud,4.396343
13776,breakout,4.265959
28468,fly,4.234172


In [36]:
def get_non_zero_idxes_for_vocab(X_train, vocab, vocab_index=None):
    """Return the row indices of ``X_train`` whose ``vocab`` feature is non-zero.

    Args:
        X_train: sparse document-term matrix (any matrix whose column slice
            supports ``.toarray()``).
        vocab: term to look up.
        vocab_index: optional mapping from term to column index. Defaults to
            the notebook-level ``vocab_to_idx`` so existing two-argument calls
            keep working. Passing it explicitly removes the dependency on
            kernel state and lets the helper be used with any matrix.

    Returns:
        1-D integer array of row positions, in ascending order.

    Raises:
        KeyError: if ``vocab`` is not a key of the mapping.
    """
    if vocab_index is None:
        vocab_index = vocab_to_idx
    column = X_train[:, vocab_index[vocab]]
    # The column slice is (n_rows, 1), so flat non-zero positions are row ids.
    return np.flatnonzero(column.toarray())

In [39]:
# Pick up to 10 training rows that contain the term 'bulls'. The draw is
# seeded and made without replacement so the displayed examples are
# reproducible across runs and never repeat the same twit.
bulls_row_idxes = get_non_zero_idxes_for_vocab(X_train, 'bulls')
sample_rng = np.random.RandomState(42)
sample_idxes = sample_rng.choice(
    bulls_row_idxes,
    size=min(10, len(bulls_row_idxes)),  # fewer than 10 matches is allowed
    replace=False,
)

In [42]:
twit_train_df.iloc[sample_idxes]['body'].values

array(['$TSLA Going to get worse bulls. It’s got to go to $180',
       '$TSLA Bulls two hole cards ...(1) Saudis; and (2) profitability in current quarter ... that could be crumbling fast!',
       '$BYND ethan and his family would like to thank you for donation to buy his new mansion!! he and his family are going to have a really wonderful Christmas this year!! thanks again bulls -',
       '$TSLA Muskie fooled the bulls but not the 🐻',
       '$TSLA bulls just get out rn. Clearly it was a bull Trap',
       '$SBUX Let the bears have one day, onward for the bulls.🚀',
       '$TSLA  Elon to Bulls: I’m inviting you to a Party at my Mansion. I invited a great COOK named Tim, and we will dine BUFFET style 😆😆😆🚀🚀',
       '$BA Don&#39;t listen to all the Pumpers, they will Kill your cash today Bulls! Sometimes you have to know when to FOLD&#39;EM!!! 🃏🀄 Don&#39;t be so shallow and not see the Price action sentiment! Big money dumping on Retail Gophers - Go for anything! this is headed where

In [None]:
X_all = vstack((X_train, X_val))
y_all = np.concatenate((y_train, y_val))

In [None]:
X_all.shape, y_all.shape

In [None]:
# Fold labels for PredefinedSplit: -1 keeps a row out of every test fold
# (always training), 0 puts it in the single validation fold. X_all stacks
# X_train first, so the leading X_train.shape[0] rows get -1.
splits = np.where(np.arange(y_all.shape[0]) < X_train.shape[0], -1, 0)

In [None]:
splits.sum(), X_train.shape

In [None]:
ps = PredefinedSplit(splits)

In [None]:
ps.get_n_splits()

In [None]:
for train_index, test_index in ps.split():
    print("TRAIN:", train_index, "TEST:", test_index)

In [None]:
lr_param_grid = {'C': [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]}

In [None]:
# Tune C on the predefined train/validation split. The solver is pinned to
# 'lbfgs', matching the baseline `lr` model, so the tuned and baseline results
# are comparable and do not depend on the installed scikit-learn's default
# solver (which changed from 'liblinear' to 'lbfgs' in 0.22).
grid_search = GridSearchCV(
    LogisticRegression(solver='lbfgs'),
    lr_param_grid,
    cv=ps,
)

In [None]:
grid_search.fit(X_all, y_all)

In [None]:
grid_search.best_params_

### (b) SVM

In [None]:
from sklearn.svm import SVC

In [None]:
svm_param_grid = {'C': [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]}

In [None]:
# SVC(kernel='linear') is backed by libsvm, whose fit time grows at least
# quadratically with the number of samples; the scikit-learn docs recommend
# LinearSVC for large datasets. X_all stacks the train and validation splits
# (hundreds of thousands of rows), so the linear SVM is fitted with LinearSVC.
# NOTE(review): LinearSVC is not numerically identical to SVC(kernel='linear')
# (squared-hinge loss by default, regularised intercept); it is a linear SVM
# searched over the same C grid.
from sklearn.svm import LinearSVC

grid_search = GridSearchCV(
    LinearSVC(),
    svm_param_grid,
    cv=ps,
)

In [None]:
grid_search.fit(X_all, y_all)

In [None]:
grid_search.best_params_