In [None]:
%load_ext autoreload
%autoreload 2

In [7]:
import sys
sys.path.append('../../')

In [8]:
from glob import glob
import os
import time

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm

from util.file_util import StockTwitsFileReader

In [9]:
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_colwidth', 150)

In [10]:
file_reader = StockTwitsFileReader()

In [11]:
date = '2019-08-09'
ticker = 'SHAK'

In [12]:
temp_df = file_reader.read_twit_file(ticker, date)

In [13]:
temp_df.head(2).T

Unnamed: 0,0,1
body,$SHAK CLDR Boys next call on Monday after the drop . One to look at anyways 👍,$SHAK new insider selling: 40987 shares. http://insiderbuyingselling.com/?t=SHAK
conversation.in_reply_to_message_id,,
conversation.parent,,
conversation.parent_message_id,,
conversation.replies,,
created_at,2019-08-10 03:46:47,2019-08-10 00:41:37
created_at_est,2019-08-09 23:46:47,2019-08-09 20:41:37
date_est,2019-08-09,2019-08-09
entities.chart.large,,
entities.chart.original,,


#### Let's think about the user-information question later (e.g., is this user reliable?)
- For now, just focus on the message

In [14]:
# Columns to focus on for message-level analysis: the EST date and timestamp,
# the message text, the tagged symbols, the sentiment label, and any links.
# NOTE(review): the frame read with cols='default' further down shows exactly
# these six columns; whether the reader consumes this list is not visible
# here -- confirm.
COLS_OF_INTEREST = [
    'date_est',
    'created_at_est',
    'body',
    'symbols',
    'entities.sentiment.basic',
    'links',
]

In [15]:
# Collect every column of temp_df whose name contains "sentiment".
sentiment_cols = list(filter(lambda col_name: 'sentiment' in col_name, temp_df.columns))

In [16]:
temp_df[sentiment_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 4 columns):
entities.sentiment                                  0 non-null float64
entities.sentiment.basic                            22 non-null object
reshare_message.message.entities.sentiment          0 non-null float64
reshare_message.message.entities.sentiment.basic    0 non-null float64
dtypes: float64(3), object(1)
memory usage: 1.8+ KB


In [17]:
temp_df_new = file_reader.read_twit_file(ticker, date, cols='default')

In [18]:
temp_df_new.head()

Unnamed: 0,date_est,created_at_est,body,symbols,entities.sentiment.basic,links
0,2019-08-09,2019-08-09 23:46:47,$SHAK CLDR Boys next call on Monday after the drop . One to look at anyways 👍,"[{'id': 12378, 'symbol': 'SHAK', 'title': 'Shake Shack', 'aliases': [], 'is_following': False, 'watchlist_count': 9548}]",,
1,2019-08-09,2019-08-09 20:41:37,$SHAK new insider selling: 40987 shares. http://insiderbuyingselling.com/?t=SHAK,"[{'id': 12378, 'symbol': 'SHAK', 'title': 'Shake Shack', 'aliases': [], 'is_following': False, 'watchlist_count': 9547}]",,"[{'title': 'Tracking insider buying and selling stocks for Shake Shack Inc. ( SHAK )', 'url': 'http://insiderbuyingselling.com/?t=SHAK', 'shortene..."
2,2019-08-09,2019-08-09 20:14:08,$SHAK how is institutional ownership 106%? Educate me dont belittle me please.,"[{'id': 12378, 'symbol': 'SHAK', 'title': 'Shake Shack', 'aliases': [], 'is_following': False, 'watchlist_count': 9547}]",,
3,2019-08-09,2019-08-09 19:56:51,"$SHAK So far Zach Koff was the insider who dumped at the highest price. If they are exercising their options and dumping them immediately, why are...","[{'id': 679, 'symbol': 'SPX', 'title': 'S&P 500 Index', 'aliases': [], 'is_following': False, 'watchlist_count': 42321}, {'id': 4979, 'symbol': 'D...",Bearish,
4,2019-08-09,2019-08-09 19:26:32,$SHAK all these big dumps and price goes up? What gives?,"[{'id': 12378, 'symbol': 'SHAK', 'title': 'Shake Shack', 'aliases': [], 'is_following': False, 'watchlist_count': 9548}]",Bearish,


## Read for time range

In [None]:
ticker = 'SHAK'

start_date = '2018-01-01'
end_date = '2019-08-01'

Problem: the read below takes too long, so its result should be cached.

In [None]:
%%time

data_df = file_reader.read_twit_file_in_range(ticker, start_date, end_date, cols='default')

In [None]:
data_df.shape

In [None]:
data_df['entities.sentiment.basic'].value_counts(dropna=False)

In [None]:
root_dir = file_reader.get_root_dir()
scrap_data_dir = os.path.join(root_dir, 'scrap_data')
scrap_data_dir

In [None]:
TICKER_LIST = [
    'SHAK',
    'MSFT',
    'TSLA',
    'SBUX'
]

In [None]:
# Cache each ticker's range read as a pickle under scrap_data_dir so the slow
# read only has to happen once per (ticker, start_date, end_date).
for t in tqdm(TICKER_LIST):
    cache_path = os.path.join(
        scrap_data_dir,
        '{ticker}_{start_date}_{end_date}.pkl'.format(ticker=t,
                                                      start_date=start_date,
                                                      end_date=end_date))
    # Already cached: skip the expensive read. Delete the file to force a refresh.
    if os.path.exists(cache_path):
        continue
    df = file_reader.read_twit_file_in_range(t, start_date, end_date, cols='default')
    df.to_pickle(cache_path)

In [None]:
data_df.loc[data_df['entities.sentiment.basic'] == 'Bearish', 'body'].sample(10).to_frame()

In [None]:
data_df.loc[data_df['entities.sentiment.basic'] == 'Bullish', 'body'].sample(10).to_frame()

## Summarize Stats

In [None]:
cache_file_format = '{ticker}_{start_date}_{end_date}.pkl'

In [None]:
data_dict = dict()

In [None]:
# Read each ticker's cached pickle, tag its rows with the ticker symbol, and
# store the frame in data_dict under that ticker.
for t in TICKER_LIST:
    cache_name = cache_file_format.format(ticker=t, start_date=start_date, end_date=end_date)
    cache_path = os.path.join(scrap_data_dir, cache_name)
    data_dict[t] = pd.read_pickle(cache_path).assign(ticker=t)

In [None]:
# Stack the per-ticker frames into one long frame. ignore_index=True gives the
# result a fresh 0..n-1 index; without it each frame keeps its own row labels,
# which may repeat across tickers and make label-based lookups ambiguous.
all_df = pd.concat(list(data_dict.values()), ignore_index=True)

In [None]:
n_days = all_df['date_est'].nunique()
n_days

Twit count statistics

In [None]:
# Per-ticker message totals plus the average number of messages per day,
# where n_days is the number of distinct dates across all tickers.
count_df = (
    all_df.groupby('ticker')['date_est']
    .count()
    .to_frame('count')
    .assign(count_per_day=lambda counts: (counts['count'] / n_days).round(2))
)
count_df

In [None]:
# Count sentiment labels per ticker: one row per ticker, one column per label.
# Grouping on both keys and unstacking always yields this wide layout, whereas
# groupby().apply(value_counts) can return either a Series or a DataFrame
# depending on the data. Ticker/label combinations that never occur are filled
# with 0 instead of NaN. Messages without a label are left out because groupby
# drops missing keys by default.
sentiment_count_df = (
    all_df.groupby(['ticker', 'entities.sentiment.basic'])
    .size()
    .unstack(fill_value=0)
)
sentiment_count_df.columns.name = None

# Number of messages per ticker that carry either label.
sentiment_count_df['total_sentiment_count'] = sentiment_count_df['Bearish'] + sentiment_count_df['Bullish']
sentiment_count_df

In [None]:
# Attach the sentiment counts to the per-ticker totals. Selecting the two base
# columns first keeps this cell safe to re-run: merging into an already-merged
# count_df would create *_x / *_y duplicates and break the lookup below.
count_df = count_df[['count', 'count_per_day']].merge(
    sentiment_count_df, left_index=True, right_index=True)
# Share of each ticker's messages that carry a Bullish/Bearish label.
count_df['pct_sentiment'] = count_df['total_sentiment_count'] / count_df['count']
count_df

### Investigate Hyperlinks

In [None]:
pd.set_option('display.max_colwidth', 500)

In [None]:
def extract_link_info(row):
    """Return the first link attached to a message as a Series.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide a 'links' entry holding a sequence of link records.
        NOTE(review): each record is presumably a dict of link fields
        (title, url, ...) -- confirm against the reader's output.

    Returns
    -------
    pd.Series
        The first link record wrapped in a Series, or an empty Series when
        the sequence of links is empty.
    """
    links = row['links']
    # An empty list used to raise IndexError; return an empty record instead so
    # a row-wise apply yields NaN for that message rather than aborting.
    if len(links) == 0:
        return pd.Series(dtype=object)
    return pd.Series(links[0])

In [None]:
link_df = all_df.dropna(subset=['links'])[['body', 'links']].sample(3)

In [None]:
# Expand each sampled row's first link into columns next to the message body.
# NOTE(review): the final column selection raises KeyError when none of the
# sampled links provides one of these fields (e.g. 'description') -- confirm
# that every link record carries them.
link_df.assign(**link_df.apply(extract_link_info, axis=1))[['body', 'title', 'url', 'description']]

#### Look at Labeled Samples

In [None]:
N_SAMPLES = 4

In [None]:
bullish_dfs = []
bearish_dfs = []

In [None]:
all_bullish_df = all_df[all_df['entities.sentiment.basic'] == 'Bullish']
all_bearish_df = all_df[all_df['entities.sentiment.basic'] == 'Bearish']

In [None]:
# Draw up to N_SAMPLES labeled messages per ticker for each sentiment.
# The lists are rebuilt from scratch here, so re-running the cell no longer
# appends duplicate samples. The sample size is capped at the group size
# because DataFrame.sample raises when asked for more rows than exist.
SAMPLE_COLS = ['body', 'ticker', 'entities.sentiment.basic']

bullish_dfs = [
    ticker_rows[SAMPLE_COLS].sample(min(N_SAMPLES, len(ticker_rows)))
    for _, ticker_rows in all_bullish_df.groupby('ticker')
]

bearish_dfs = [
    ticker_rows[SAMPLE_COLS].sample(min(N_SAMPLES, len(ticker_rows)))
    for _, ticker_rows in all_bearish_df.groupby('ticker')
]

In [None]:
bullish_df = pd.concat(bullish_dfs)
bearish_df = pd.concat(bearish_dfs)

In [None]:
bearish_df

#### Bullish / Bearish Counts

In [None]:
import matplotlib.dates as md
from matplotlib.dates import date2num

In [None]:
# Directory the stock price pickles are read from. Set the STOCK_DATA_DIR
# environment variable to point elsewhere on another machine; the original
# local path remains the default so existing setups keep working.
STOCK_DATA_DIR = os.environ.get(
    'STOCK_DATA_DIR',
    '/Users/seung-jae_bang/Personal/Research/Stock_Sentiment/data/AlphaVantage/scrap')

In [None]:
ticker = 'MSFT'

In [None]:
stock_df = pd.read_pickle(os.path.join(STOCK_DATA_DIR,
                                       'stock_price_{}.pkl'.format(ticker)))

In [None]:
stock_df = stock_df.loc['2018-01-01':'2019-08-10', ['close']]

In [None]:
# Load the cached twits for the current `ticker`.
# NOTE(review): despite the `shak_` prefix this frame holds whichever ticker is
# set at this point -- 'MSFT' per the assignment a few cells earlier.
shak_df = pd.read_pickle(os.path.join(scrap_data_dir, 
                                      cache_file_format.format(ticker=ticker,
                                                               start_date=start_date,
                                                               end_date=end_date,)))

# Parse the date column into datetimes (it becomes the index that is resampled
# further down) and keep only messages that have a sentiment label.
shak_df['date_est'] = pd.to_datetime(shak_df['date_est'])
shak_df = shak_df.dropna(subset=['entities.sentiment.basic'])

In [None]:
# Count sentiment labels per day in long form: one row per (date, label) with
# columns date_est / level_1 / count.
# Grouping on both keys gives the same columns on every pandas version. The
# earlier groupby().apply(value_counts) depended on reset_index naming the
# label level 'level_1', which newer pandas names after the source column.
# The label column is still called 'level_1' because the pivot that follows
# selects it by that name.
shak_df = (
    shak_df.groupby(['date_est', 'entities.sentiment.basic'])
    .size()
    .rename_axis(['date_est', 'level_1'])
    .reset_index(name='count')
)

In [None]:
shak_df = shak_df.pivot(index='date_est', columns='level_1', values='count').fillna(0)

In [None]:
# Store both label counts as integers (the pivot + fillna(0) step may leave
# them as floats) and order the rows by date.
shak_df = shak_df.astype({'Bearish': int, 'Bullish': int}).sort_index()

In [None]:
shak_weely_df = shak_df.resample('W-FRI').sum()

In [None]:
num_dates = date2num(shak_weely_df.index)

In [None]:
myFmt = md.DateFormatter('%Y-%m-%d')

fig, ax = plt.subplots(figsize=(14, 7))

# Weekly counts as side-by-side bars: the bullish bar is shifted two days to
# the left of each weekly date so the two 2-day-wide bars sit next to each
# other instead of overlapping.
b1 = ax.bar(num_dates - 2, shak_weely_df['Bullish'], label='bullish', color='g', width=2, align='center')
b2 = ax.bar(num_dates, shak_weely_df['Bearish'], label='bearish', color='orange', width=2, align='center')
ax.set_ylabel('weekly twit count')

# One major x tick per month, labelled as YYYY-MM-DD.
ax.xaxis.set_major_locator(matplotlib.dates.MonthLocator())
ax.xaxis.set_major_formatter(myFmt)
ax.xaxis.set_tick_params(rotation=45)

# Overlay the closing price on a secondary y axis.
ax2 = ax.twinx()
l1 = ax2.plot(stock_df.index, stock_df['close'], linewidth=0.7, linestyle='--')
ax2.set_ylabel('close')

# The location has to be given as `loc=`: the first positional argument of
# Axes.legend is read as handles/labels, so a positional 'upper right' next to
# handles= never set the legend position.
ax.legend(handles=[b1, b2], loc='upper right')
plt.title('Weekly Bullish / Bearish Twit Counts for {}'.format(ticker))
plt.show()

In [None]:
myFmt = md.DateFormatter('%Y-%m-%d')

fig, ax = plt.subplots(figsize=(14, 7))
shak_weely_df.plot(kind='bar', ax=ax)

In [None]:
ax.xaxis.get_major_formatter()

In [None]:
shak_weely_df.plot(kind='bar')