In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys
# Make the directory three levels above the working directory importable.
# NOTE(review): presumably this is the project root that holds `config` and
# `util` -- confirm. The path is relative, so it only resolves correctly when
# the kernel is started from this notebook's own directory.
sys.path.append('../../../')

In [24]:
import os
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from config import STOCKTWITS_TICKER_LIST
from util.file_util import StockTwitsFileReader

In [None]:
# Seed NumPy's global (legacy) random generator so that code drawing from it
# behaves the same on every fresh run of the notebook.
RANDOM_SEED = 828
np.random.seed(RANDOM_SEED)

In [None]:
# Reader object used below to load per-ticker twit files
# (read_twit_file_in_range) and to locate the data root (root_dir).
stock_twits_reader = StockTwitsFileReader()

In [23]:
# Display the data root directory the reader is using; the output directory
# for the processed splits is built from this path further down.
stock_twits_reader.root_dir

'/Users/seung-jae_bang/Personal/Research/Stock_Sentiment/data/Stocktwits'

In [None]:
# Date window passed to the reader when loading twits for each ticker.
start_date, end_date = '2018-01-01', '2019-08-10'

In [None]:
# Column holding the sentiment label; rows where it is missing are dropped
# when the labeled data set is built below.
SENTIMENT_LABEL_COLUMN = 'entities.sentiment.basic'

In [None]:
# Build one DataFrame per ticker containing only the twits that carry a
# sentiment label, tagging every row with the ticker it was loaded for.
labeled_twits = []

for ticker in STOCKTWITS_TICKER_LIST:
    started_at = time.time()

    ticker_twits = stock_twits_reader.read_twit_file_in_range(
        ticker, start_date, end_date, cols='default'
    )
    # Keep labeled rows only; assign() returns a new frame with the extra
    # 'ticker' column, so the frame returned by the reader is left untouched.
    labeled_twits.append(
        ticker_twits
        .dropna(subset=[SENTIMENT_LABEL_COLUMN])
        .assign(ticker=ticker)
    )

    elapsed_mins = (time.time() - started_at) / 60
    print('Running for {} took {} mins'.format(ticker, elapsed_mins))

In [10]:
# Stack the per-ticker frames row-wise into a single DataFrame.
# ignore_index is not passed, so each frame keeps its own index labels.
labeled_twits_df = pd.concat(labeled_twits)

In [14]:
# Number of labeled twits per ticker, showing the most frequent tickers first.
labeled_twits_df['ticker'].value_counts().head()

TSLA    331586
BYND     55759
MSFT     36020
LYFT     17544
SBUX      9766
Name: ticker, dtype: int64

In [15]:
# (rows, columns) of the combined labeled data set.
labeled_twits_df.shape

(491058, 7)

In [18]:
# Class balance: how many twits carry each sentiment label.
labeled_twits_df[SENTIMENT_LABEL_COLUMN].value_counts()

Bullish    307130
Bearish    183928
Name: entities.sentiment.basic, dtype: int64

In [19]:
# Hold out 20% of the labeled twits as the test set.
# random_state is passed explicitly (same value as the notebook's NumPy seed)
# so the split is identical no matter which cells ran, or how many times,
# before this one. Without it the split draws from the global NumPy RNG state.
train_twits, test_twits = train_test_split(
    labeled_twits_df, test_size=0.2, random_state=828
)

In [20]:
# Sanity check: sizes of the train and test partitions.
train_twits.shape, test_twits.shape

((392846, 7), (98212, 7))

In [21]:
# Carve a validation set (20%) out of the training partition.
# This cell rebinds train_twits from itself, so running it a second time would
# silently shrink the training set again. The guard below fails loudly in that
# case: train + test only add up to the full data set before the first run.
assert len(train_twits) + len(test_twits) == len(labeled_twits_df), (
    'train_twits has already been split into train/val; '
    're-run the train/test split cell before running this cell again'
)
# Explicit random_state keeps the split independent of the global RNG state.
train_twits, val_twits = train_test_split(
    train_twits, test_size=0.2, random_state=828
)

In [22]:
# Sanity check: sizes of the final train and validation partitions.
train_twits.shape, val_twits.shape

((314276, 7), (78570, 7))

In [25]:
# Output directory for the processed splits, located under the reader's data
# root. Path components are passed separately so os.path.join inserts the
# platform's separator instead of relying on a hard-coded '/'.
save_dir = os.path.join(stock_twits_reader.root_dir,
                        'processed', 'text_analysis')
save_dir

'/Users/seung-jae_bang/Personal/Research/Stock_Sentiment/data/Stocktwits/processed/text_analysis'

In [28]:
# Peek at the first few training rows before writing the splits to disk.
train_twits.head()

Unnamed: 0,date_est,created_at_est,body,symbols,entities.sentiment.basic,links,ticker
53193,2018-12-06,2018-12-06 16:53:51,$MSFT .,"[{'id': 2735, 'symbol': 'MSFT', 'title': 'Micr...",Bullish,,MSFT
610784,2019-07-11,2019-07-11 10:19:04,$TSLA $400 is coming,"[{'id': 8660, 'symbol': 'TSLA', 'title': 'Tesl...",Bullish,,TSLA
571364,2019-06-05,2019-06-05 17:49:20,$TSLA they are just so beautiful looking on th...,"[{'id': 8660, 'symbol': 'TSLA', 'title': 'Tesl...",Bullish,,TSLA
8447,2019-05-13,2019-05-13 10:20:45,$UBER I guess Saudis long algos are activated ...,"[{'id': 11554, 'symbol': 'UBER', 'title': 'Ube...",Bullish,,UBER
70872,2018-05-03,2018-05-03 09:13:28,$TSLA wow poor bulls that have been buying the...,"[{'id': 8660, 'symbol': 'TSLA', 'title': 'Tesl...",Bearish,,TSLA


In [27]:
# Persist the three splits as pickles named '<split>_twits.pkl' in save_dir.
# The directory is created first so the writes also succeed on a machine
# where the processed-data folder does not exist yet.
os.makedirs(save_dir, exist_ok=True)

for split_name, split_df in (('train', train_twits),
                             ('val', val_twits),
                             ('test', test_twits)):
    split_df.to_pickle(os.path.join(save_dir, '{}_twits.pkl'.format(split_name)))