In [12]:
import pandas as pd
import numpy as np

# Load data

In [13]:
# Directory holding the saved CSV tables; every read_csv call below builds its path from it.
local_dir = 'input_data/saved_data_tables'

In [14]:
def _read_table(csv_path):
    """Read one saved table: the first row is the header and no column is used as the index."""
    return pd.read_csv(csv_path, index_col=None, header=0)


# Reddit-derived tables.
author_data = _read_table(f'{local_dir}/author_data.csv')
comment_data = _read_table(f'{local_dir}/comment_data.csv')
submission_data = _read_table(f'{local_dir}/submission_data.csv')
text_data = _read_table(f'{local_dir}/text_data.csv')
sentiment_data = _read_table(f'{local_dir}/sentiment_data.csv')
ticker_data = _read_table(f'{local_dir}/ticker_data.csv')
submission_comment_data = _read_table(f'{local_dir}/submission_comment_data.csv')
flatten_data = _read_table(f'{local_dir}/flatten_data.csv')

# Sentiment joined with price data.
sentiment_price_data = _read_table(f'{local_dir}/sentiment_price_data.csv')


# Every plain timestamp column is parsed with the same 'YYYY-MM-DD HH:MM:SS' format,
# so the (table, columns) pairs are listed once instead of repeating the call per column.
timestamp_format = '%Y-%m-%d %H:%M:%S'
timestamp_columns_by_table = [
    (author_data, ['created_utc', 'update_dt']),
    (comment_data, ['created_utc', 'update_dt']),
    (submission_data, ['created_utc', 'update_dt']),
    (text_data, ['update_dt']),
    (submission_comment_data, ['created_utc', 'update_dt']),
    (flatten_data, ['created_utc', 'update_dt']),
]
for table, columns in timestamp_columns_by_table:
    for column in columns:
        table[column] = pd.to_datetime(table[column], format=timestamp_format)

# 'Date' is converted to US Eastern time, which requires timezone-aware values.
# The fixed '%Y-%m-%d %H:%M:%S' format has no UTC-offset directive, so it either rejects
# offset-bearing strings or produces naive timestamps that tz_convert refuses to convert.
# Parsing with utc=True covers both cases: offsets in the text are honoured and naive
# values are interpreted as UTC before the conversion.
# NOTE(review): confirm that any naive 'Date' values in the CSV really are UTC.
sentiment_price_data['Date'] = (
    pd.to_datetime(sentiment_price_data['Date'], utc=True)
    .dt.tz_convert('America/New_York')
)


# Aggregate Sentiment Data with Ticker Data
# Fill in the latest date available in the data below

In [15]:
import datetime as dt
# End of every look-back window used by get_agg_ticker_sentiment_top_N: the date filters
# keep rows created within N hours (or 365 days) before this timestamp.
# Set it to the most recent date covered by the loaded CSV files.
latest_date_file_available = dt.datetime(2021, 12, 1)

In [16]:
def get_agg_ticker_sentiment_top_N(df_source, submission_type = 'all', score_min = 0, date_window = 'all', top_records = 10, reference_date = None):
    """Rank tickers by number of mentions and break the counts down by sentiment.

    Parameters
    ----------
    df_source : pandas.DataFrame
        Must provide the columns 'id', 'type', 'score', 'created_utc', 'ticker'
        and 'sentiment'.
    submission_type : str
        'submission' or 'comment' keeps only rows of that type; any other value
        (e.g. 'all') keeps both types.
    score_min : number
        Rows whose 'score' is below this value are dropped.
    date_window : str
        '6h', '12h', '24h', '48h' or '60h' keeps rows created within that many
        hours before ``reference_date``; any other value (e.g. 'all') looks
        back 365 days.
    top_records : int
        Maximum number of tickers returned.
    reference_date : datetime.datetime, optional
        End of the look-back window. When omitted, the module-level
        ``latest_date_file_available`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per ticker with the columns 'ticker', one count column per
        sentiment (always including 'negative', 'positive' and 'neutral') and
        'mentions' (rows with a non-null sentiment), sorted by 'mentions' in
        descending order and truncated to ``top_records`` rows.
    """
    if reference_date is None:
        # Looked up lazily so callers that pass reference_date do not need the global.
        reference_date = latest_date_file_available

    if submission_type in ('submission', 'comment'):
        filter_type = [submission_type]
    else:
        filter_type = ['submission', 'comment']

    # Look-back per window label; '60h' previously reused the 48-hour delta by mistake.
    window_hours = {'6h': 6, '12h': 12, '24h': 24, '48h': 48, '60h': 60}
    hours_back = window_hours.get(date_window)
    if hours_back is None:
        lookback = dt.timedelta(days=365)
    else:
        lookback = dt.timedelta(hours=hours_back)
    filter_date = reference_date - lookback

    keep_rows = (
        df_source['type'].isin(filter_type)
        & (df_source['score'] >= score_min)
        & (df_source['created_utc'] >= filter_date)
    )
    df_filtered = df_source[keep_rows]

    # Total mentions per ticker (counts rows whose sentiment is not null).
    mentions_per_ticker = (
        df_filtered[['ticker', 'sentiment']]
        .groupby(['ticker'])
        .count()
        .reset_index()
        .rename(columns={'sentiment': 'mentions'})
    )

    # Mentions per (ticker, sentiment), spread into one column per sentiment.
    sentiment_counts = (
        df_filtered.groupby(['ticker', 'sentiment'], as_index=False)['id']
        .count()
        .pivot(index='ticker', columns='sentiment', values='id')
    )

    # Guarantee the three expected sentiment columns even when one never occurs,
    # and turn the gaps left by the pivot into zero counts.
    sentiment_columns = sentiment_counts.columns.union(['negative', 'positive', 'neutral'], sort=False)
    sentiment_counts = (
        sentiment_counts
        .reindex(sentiment_columns, axis=1, fill_value=0)
        .replace(np.nan, 0)
        .reset_index()
    )

    ranked = (
        sentiment_counts
        .merge(mentions_per_ticker, on=['ticker'])
        .sort_values(by=['mentions'], ascending=False)
    )

    return ranked.head(top_records)


#  

In [17]:
# Example: the three most-mentioned tickers among comments in the 48-hour window.
df_agg_ticker_sentiment_top_N = get_agg_ticker_sentiment_top_N(
    flatten_data,
    submission_type='comment',
    score_min=0,
    date_window='48h',
    top_records=3,
)
df_agg_ticker_sentiment_top_N.head()


Unnamed: 0,ticker,negative,neutral,positive,mentions
1,AAPL,9.0,21.0,11.0,41
58,CRM,8.0,18.0,9.0,35
13,AMD,7.0,17.0,11.0,35


In [18]:
# Ticker symbols of the ranked rows, in the same order as the table above.
tickers_top_N = df_agg_ticker_sentiment_top_N['ticker'].tolist()
tickers_top_N

['AAPL', 'CRM', 'AMD']

# Build Bar Plot with Filters

In [20]:
from ipywidgets import interact, interactive
import ipywidgets as widgets
import numpy as np

from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, FactorRange

# Tell Bokeh to render its plots inline in the notebook output.
output_notebook()

In [21]:
def get_dataset(src, submission_type = 'all', score_min = 0, date_window = 'all', top_records = 10):
    """Build the Bokeh data source for the sentiment bar chart.

    Runs get_agg_ticker_sentiment_top_N on ``src`` with the given filters and
    wraps the 'ticker', 'positive', 'negative' and 'neutral' columns of the
    result, as lists, in a ColumnDataSource.
    """
    top_tickers = get_agg_ticker_sentiment_top_N(
        src,
        submission_type=submission_type,
        score_min=score_min,
        date_window=date_window,
        top_records=top_records,
    )
    plot_columns = ['ticker', 'positive', 'negative', 'neutral']
    return ColumnDataSource(data=top_tickers[plot_columns].to_dict("list"))
    

In [22]:
def make_plot(source):
    """Create the stacked bar chart of sentiment counts per ticker.

    ``source`` is a ColumnDataSource with the columns 'ticker', 'positive',
    'negative' and 'neutral'. Returns the configured Bokeh figure; the figure
    keeps reading from ``source``, so later updates to it change the bars.
    """
    # Stack order (bottom to top) and the fill colour of each layer.
    stack_colors = {'positive': "#718dbf", 'negative': "#e84d60", 'neutral': "#c9d9d3"}
    stack_order = list(stack_colors)

    chart = figure(
        x_range=FactorRange(),
        width=600,
        height=400,
        title="Mentions by sentiment",
        toolbar_location=None,
        tools="",
    )
    chart.vbar_stack(
        stack_order,
        x='ticker',
        width=0.9,
        color=list(stack_colors.values()),
        source=source,
        legend_label=stack_order,
    )

    # Categorical x axis: one slot per ticker currently in the source.
    chart.x_range.factors = source.data['ticker']
    chart.x_range.range_padding = 0.1
    chart.y_range.start = 0

    # Cosmetics: no vertical grid lines, minor ticks or outline.
    chart.xgrid.grid_line_color = None
    chart.axis.minor_tick_line_color = None
    chart.outline_line_color = None

    chart.legend.location = "top_right"
    chart.legend.orientation = "horizontal"

    return chart

In [23]:
# Initial chart built with get_dataset's default filters.
# `source` and `plot` stay at module level because update_plot updates both on each widget change.
source = get_dataset(flatten_data)
plot = make_plot(source)

In [24]:
def update_plot(submission_type, date_window, score_min, top_records):
    """Recompute the aggregation for the chosen filters and refresh the displayed chart.

    Reads the module-level ``flatten_data`` and updates the module-level
    ``plot`` and ``source`` in place, then pushes the change to the notebook.
    """
    refreshed = get_dataset(
        flatten_data,
        submission_type=submission_type,
        score_min=score_min,
        date_window=date_window,
        top_records=top_records,
    )
    new_columns = refreshed.data

    # Update the categorical axis first, then swap in the new column values.
    plot.x_range.factors = new_columns['ticker']
    source.data.update(new_columns)
    push_notebook()


In [25]:
from ipywidgets import Layout, Button, Box, FloatText, Textarea, Dropdown, Label, IntSlider

# Layout shared by every form row: label on the left, control on the right.
form_item_layout = Layout(display='flex', flex_flow='row', justify_content='space-between')

submission_type_options = ['all', 'submission', 'comment']
date_window_options = ['all', '6h', '12h', '24h', '48h', '60h']

# One control per update_plot argument.
w_s_t = Dropdown(options=submission_type_options)
w_d_w = Dropdown(options=date_window_options)
w_s_m = IntSlider(min=0, max=5)
w_t_r = IntSlider(min=1, max=30)

labelled_controls = [
    ('Submission Type', w_s_t),
    ('Date Window', w_d_w),
    ('Content Score Min', w_s_m),
    ('Top Most Mentions Tickers ', w_t_r),
]
form_items = [
    Box([Label(value=caption), control], layout=form_item_layout)
    for caption, control in labelled_controls
]

form_layout = Layout(
    display='flex',
    flex_flow='column',
    border='solid 2px',
    align_items='stretch',
    width='50%',
)
form = Box(form_items, layout=form_layout)

# Show the plot with a notebook handle before wiring the widgets, so that the
# push_notebook call inside update_plot has a displayed plot to refresh.
show(plot, notebook_handle=True)
out = widgets.interactive_output(
    update_plot,
    {
        'submission_type': w_s_t,
        'date_window': w_d_w,
        'score_min': w_s_m,
        'top_records': w_t_r,
    },
)

display(form, out)

Box(children=(Box(children=(Label(value='Submission Type'), Dropdown(options=('all', 'submission', 'comment'),…

Output()