In [29]:
import numpy as np
from glob import glob
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import re

In [25]:
np.random.seed(828)

In [None]:
TWEETS_DIRECTORY = '/home/sjb/Projects/Research/Stock_Sentiment/data/twitter/'

In [None]:
tweet_file_paths = glob(TWEETS_DIRECTORY + '*.json')

In [4]:
def read_tweet_file_path(tweet_file_path):
    cash_tag = tweet_file_path.replace('.json', '').split('_')[-1]
    df = pd.read_json(tweet_file_path, dtype=False)
    df['cash_tag'] = cash_tag
    return df

In [5]:
dfs = []

for fp in tweet_file_paths:
    dfs.append(read_tweet_file_path(fp))
    
tweet_df = pd.concat(dfs)

In [6]:
del(dfs)

In [7]:
tweet_df.shape

(545762, 16)

#### Can tweets be duplicated? - yes, just b/c I'm querying individual cash tags separately

In [8]:
duplicates = tweet_df[tweet_df['id'].duplicated(keep=False)].sort_values('id').copy()

In [9]:
duplicates.shape

(119655, 16)

In [10]:
duplicates.head(3)

Unnamed: 0,EST_date,UTC_date,author_id,favorites,geo,hashtags,id,mentions,permalink,replies,retweets,text,to,urls,username,cash_tag
203,"2019-05-27, 20:26","2019-05-28, 00:26",1018324467758465024,13,,,1133167689533276160,,https://twitter.com/EliteOptions2/status/11331...,0,2,$AMZN - Trade Idea - If 1800 breaks - 1800P --...,,,EliteOptions2,$GOOG
1168,"2019-05-27, 20:26","2019-05-28, 00:26",1018324467758465024,13,,,1133167689533276160,,https://twitter.com/EliteOptions2/status/11331...,0,2,$AMZN - Trade Idea - If 1800 breaks - 1800P --...,,,EliteOptions2,$AAPL
638,"2019-05-27, 20:26","2019-05-28, 00:26",1018324467758465024,13,,,1133167689533276160,,https://twitter.com/EliteOptions2/status/11331...,0,2,$AMZN - Trade Idea - If 1800 breaks - 1800P --...,,,EliteOptions2,$AMZN


#### Filters that I am currently trying (since there are so much noise in tweets)
- Remove duplicates
- Only select those with retweets >= 3 (no particular rationale - just that more shared tweets are likely not 'spam' tweets)
- Only select those with 1 cash_tag (so that tweet is only talking about single company)

In [11]:
tweet_df = tweet_df.drop_duplicates(subset='id')
tweet_df = tweet_df[tweet_df['retweets'] >= 3].copy()

In [12]:
reg_pattern = re.compile(r'\$[a-zA-Z]+')

In [13]:
sample_texts = tweet_df['text'].sample(5).values

In [14]:
sample_texts

array(['$ARIA $SDVI $HEMP $PHOT $CNAB $HPNN $MSFT $MFST $DGRI $EFIR $EAPH $PXYN $SANP $BTDG $BTC $CCTL $TXHD $TWLO $VALE $SING $RGBP $AUY $PTAH',
       'Which #Companies Make The Most #Revenue Per Employee? http://sumo.ly/Bx9d #ENERGY #HEALTHCARE #UTILITIES #TECH $VLO $CVX $FB. $NFLXpic.twitter.com/MlhawTfGf5',
       '"Never regret missing a trade." - @Trader_Dante #TGIF $SPY $AAPL $FB $TSLA $AMZN $NFLX $GLDpic.twitter.com/ToatT6AzOf',
       'NASDAQ JIMPS IN SHORTS: $AMD $OCLR $SBUX $MSFT $CAR $AUPH $ASNA $NFLX $YHOO $BBBY $NVDA $URBN $EBAY $JBLU $VIAB $ESRX $MYL $JBLU $ETSYpic.twitter.com/TZtH9mjgib',
       "We won't live to see another company like this. $AAPL pic.twitter.com/E3ngJA5OI9"],
      dtype=object)

In [15]:
[reg_pattern.findall(t) for t in sample_texts]

[['$ARIA',
  '$SDVI',
  '$HEMP',
  '$PHOT',
  '$CNAB',
  '$HPNN',
  '$MSFT',
  '$MFST',
  '$DGRI',
  '$EFIR',
  '$EAPH',
  '$PXYN',
  '$SANP',
  '$BTDG',
  '$BTC',
  '$CCTL',
  '$TXHD',
  '$TWLO',
  '$VALE',
  '$SING',
  '$RGBP',
  '$AUY',
  '$PTAH'],
 ['$VLO', '$CVX', '$FB', '$NFLXpic'],
 ['$SPY', '$AAPL', '$FB', '$TSLA', '$AMZN', '$NFLX', '$GLDpic'],
 ['$AMD',
  '$OCLR',
  '$SBUX',
  '$MSFT',
  '$CAR',
  '$AUPH',
  '$ASNA',
  '$NFLX',
  '$YHOO',
  '$BBBY',
  '$NVDA',
  '$URBN',
  '$EBAY',
  '$JBLU',
  '$VIAB',
  '$ESRX',
  '$MYL',
  '$JBLU',
  '$ETSYpic'],
 ['$AAPL']]

In [16]:
tweet_df['all_cashtags'] = tweet_df['text'].apply(lambda t: reg_pattern.findall(t))

In [17]:
tweet_df = tweet_df[tweet_df['all_cashtags'].str.len() == 1].copy()

In [18]:
tweet_df.sample(10)

Unnamed: 0,EST_date,UTC_date,author_id,favorites,geo,hashtags,id,mentions,permalink,replies,retweets,text,to,urls,username,cash_tag,all_cashtags
873,"2017-06-12, 06:16","2017-06-12, 10:16",397230151,14,,,874208721697730560,,https://twitter.com/OMillionaires/status/87420...,2,14,"$AMZN price target raised to $1,200 from $1,05...",,,OMillionaires,$AMZN,[$AMZN]
694,"2017-06-27, 22:31","2017-06-28, 02:31",263812431,11,,,879889947280453632,,https://twitter.com/2kaykim/status/87988994728...,1,3,"*NEW POST* ""Apple (AAPL) Analysis - PART 2"" $A...",,http://2tradersclub.com/2017/06/27/apple-aapl-...,2kaykim,$AAPL,[$AAPL]
5,"2017-04-26, 19:48","2017-04-26, 23:48",943845800,0,,,857380773090938880,,https://twitter.com/marketexclusive/status/857...,0,4,Insider Trading Activity Facebook Inc (NASDAQ:...,,https://marketexclusive.com/insider-trading-ac...,marketexclusive,$FB,[$FB]
628,"2017-04-04, 20:20","2017-04-05, 00:20",784564195987783680,3,,,849416418617696256,,https://twitter.com/Ascend_Trading/status/8494...,0,3,"Swing Trading Courses - ""Scanning and Planning...",,"http://www.ascendtrading.net/swing-trading/,ht...",Ascend_Trading,$FB,[$FB]
101,"2017-05-03, 18:26","2017-05-03, 22:26",16451932,8,,,859897026129129474,@Tim_Cook,https://twitter.com/MadMoneyOnCNBC/status/8598...,3,7,$AAPL CEO @Tim_Cook says rumors hurt iPhone sa...,,http://cnb.cx/2pJ3aNe,MadMoneyOnCNBC,$AAPL,[$AAPL]
669,"2017-07-11, 09:52","2017-07-11, 13:52",704210282126712832,4,,,884772503477944320,,https://twitter.com/xcar9x/status/884772503477...,4,6,$AMZN 1000 call 4.75$,,,xcar9x,$AMZN,[$AMZN]
552,"2017-02-28, 12:14","2017-02-28, 17:14",19546277,1,,,836625750384082944,@Pogue,https://twitter.com/YahooFinance/status/836625...,0,3,LIVE: @Pogue's best wireless earbuds - Apple A...,,http://yhoo.it/2lkJGIE,YahooFinance,$AAPL,[$AAPL]
277,"2017-02-05, 02:23","2017-02-05, 07:23",973980920,8,,#Amazon #tech #stocks #investing #innovation #...,828141913555529728,,https://twitter.com/EcommerceTop/status/828141...,0,8,Is Anyone Safe From #Amazon? ... #tech #stocks...,BColwell_Invest,https://twitter.com/BColwell_Invest/statuses/8...,EcommerceTop,$AMZN,[$amzn]
447,"2017-05-30, 11:17","2017-05-30, 15:17",20562637,14,,,869573395968602112,,https://twitter.com/businessinsider/status/869...,0,15,Amazon launches curbside pickup for groceries ...,,http://read.bi/2r7b3fu,businessinsider,$AMZN,[$AMZN]
59,"2017-06-04, 16:10","2017-06-04, 20:10",3410575617,28,,,871459134197116928,,https://twitter.com/johnscharts/status/8714591...,5,6,$AAPL tight price action hitting support on an...,,,johnscharts,$AAPL,[$AAPL]


In [19]:
tweet_df.shape

(6699, 17)

In [21]:
tweet_df.to_pickle('./tweets_sampled_0602.pkl')

In [30]:
tweet_df_train, tweet_df_test = train_test_split(tweet_df, test_size=500)

In [31]:
tweet_df_train.shape, tweet_df_test.shape

((6199, 17), (500, 17))

In [32]:
tweet_df_train.to_pickle('./tweets_sampled_train_0602.pkl')
tweet_df_test.to_pickle('./tweets_sampled_test_0602.pkl')

In [35]:
tweet_df_test[['id', 'text', 'permalink']].to_csv('./tweets_sampled_test_0602.csv', index=False)

In [37]:
tweet_df_train[['id', 'text', 'permalink']].to_csv('./tweets_sampled_train_0602.csv', index=False)