In [None]:
import pandas as pd
import numpy as np
import seaborn as sb
from matplotlib import pylab as plt

***Load and clean stock market data***

In [None]:
# Load the stock price table and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; point it at a
# configurable data directory before sharing the notebook.
prices_csv = "/Users/vincent/Development/CSS/4chanAnalysis/usa_indicators/stock_prices_latest.csv"
prices = pd.read_csv(prices_csv)
prices.head()

In [None]:
# Keep only the ticker symbol, the date and the adjusted closing price.
wanted_columns = ["symbol", "date", "close_adjusted"]
prices = prices.loc[:, wanted_columns]

In [None]:
# Collapse every date to its calendar month so rows can be grouped per month.
# Plain column assignment replaces the column outright, so it ends up with a
# period[M] dtype. `prices.loc[:, "date"] = ...` instead writes into the
# existing column under pandas >= 2.0, which keeps the original dtype.
prices["date"] = pd.to_datetime(prices["date"]).dt.to_period("M")

In [None]:
# Split the price table into one frame per ticker; the symbol column is
# constant within each frame afterwards, so it is dropped.
is_aapl = prices["symbol"] == "AAPL"
is_msft = prices["symbol"] == "MSFT"
aapl = prices.loc[is_aapl].drop(columns="symbol")
msft = prices.loc[is_msft].drop(columns="symbol")

***Calculate the monthly variance and spread (max − min) of the adjusted close***

In [None]:
# Group each ticker's adjusted close by month once, then derive both statistics:
# variance within the month, and spread = monthly maximum minus monthly minimum.
aapl_close_by_month = aapl.groupby("date")["close_adjusted"]
msft_close_by_month = msft.groupby("date")["close_adjusted"]

aapl_var = aapl_close_by_month.var()
aapl_spread = aapl_close_by_month.max() - aapl_close_by_month.min()
msft_var = msft_close_by_month.var()
msft_spread = msft_close_by_month.max() - msft_close_by_month.min()

In [None]:
# Preview the first rows of the monthly AAPL spread.
aapl_spread.head()

***Load hate counts***

In [None]:
# Locations of the per-board count files (relative to the notebook's working
# directory). The pol counts are split across two part files.
sci_counts = "./Counts/sci_counts.csv"
news_counts = "./Counts/news_counts.csv"
adv_counts = "./Counts/adv_counts.csv"
pol_counts = [
    "./Counts/pol_counts_part0.csv",
    "./Counts/pol_counts_part1.csv",
]

In [None]:
# Read every count file, using the first CSV column as the index.
# NOTE: each *_counts name is rebound here from a path (or list of paths) to the
# loaded DataFrame, so the cell defining the paths must be re-run before this
# cell can be executed a second time.
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the pol part
# files in the same order and works for any number of parts.
pol_counts = pd.concat([pd.read_csv(part_path, index_col=0) for part_path in pol_counts])
sci_counts = pd.read_csv(sci_counts, index_col=0)
news_counts = pd.read_csv(news_counts, index_col=0)
adv_counts = pd.read_csv(adv_counts, index_col=0)

In [None]:
def countWords(text):
    """Return 1 if ``text`` is a string containing at least one word, else 0.

    Despite its name this is a 0/1 indicator rather than a word count: the
    result is capped at 1, so summing it over posts counts posts, not words.

    Parameters
    ----------
    text : object
        Usually a string; anything that is not a ``str`` (for example the
        float NaN pandas uses for a missing cell, or ``None``) yields 0.

    Returns
    -------
    int
        1 for a string with at least one non-whitespace token, otherwise 0.
    """
    # Explicit type check instead of swallowing every exception from .split().
    if not isinstance(text, str):
        return 0
    # Whitespace-aware split returns [] for empty or blank strings, so those
    # are not counted as a match (splitting on " " always yields >= 1 piece).
    return min(1, len(text.split()))

In [None]:
# Tag every row with the name of the board its frame was loaded for.
for board_name, board_frame in (
    ("pol", pol_counts),
    ("sci", sci_counts),
    ("adv", adv_counts),
    ("news", news_counts),
):
    board_frame.loc[:, "board"] = board_name

In [None]:
# Add three derived columns to every board frame (mutates the frames in place):
#   hate_count - 0/1 flag produced by countWords on the matched_vocab column
#   date       - timestamp column interpreted as seconds since the Unix epoch
#   month      - calendar month of that date, used as the grouping key later
for board_frame in (pol_counts, sci_counts, adv_counts, news_counts):
    posted_at = pd.to_datetime(board_frame.loc[:, "timestamp"], unit="s")
    board_frame.loc[:, "hate_count"] = board_frame.loc[:, "matched_vocab"].map(countWords)
    board_frame.loc[:, "date"] = posted_at
    board_frame.loc[:, "month"] = posted_at.dt.to_period("M")

In [None]:
def _monthly_hate_share(board_counts):
    """Return, per month, the sum of ``hate_count`` divided by the number of
    rows with non-null ``content``.

    Only the two columns that are needed get aggregated. Calling
    ``groupby("month").sum()`` on the whole frame would also try to sum the
    text and datetime columns, which pandas >= 2.0 no longer drops silently.
    """
    by_month = board_counts.groupby("month")
    return by_month["hate_count"].sum() / by_month["content"].count()


pol_hate_share_monthly = _monthly_hate_share(pol_counts)
sci_hate_share_monthly = _monthly_hate_share(sci_counts)
adv_hate_share_monthly = _monthly_hate_share(adv_counts)
news_hate_share_monthly = _monthly_hate_share(news_counts)

In [None]:
# Combine the per-board monthly shares into one frame with one column per board.
# Rows are the months present in the sci series; the other boards are aligned
# to that index (months they lack become NaN, months sci lacks are dropped).
# Building the frame directly from the Series keeps the columns numeric; filling
# an empty, pre-declared frame through .loc starts from object-dtype columns.
hate_share_monthly = pd.DataFrame(
    {
        "sci": sci_hate_share_monthly,
        "news": news_hate_share_monthly,
        "adv": adv_hate_share_monthly,
        "pol": pol_hate_share_monthly,
    },
    index=sci_hate_share_monthly.index,
)

***Select the date range of interest (months with both hate-share and stock data)***

In [None]:
# Months for which every board's hate share and the AAPL variance are all present.
# NOTE(review): only AAPL is checked here; the MSFT series are assumed to cover
# the same months - confirm.
hate_with_aapl_var = hate_share_monthly.join(aapl_var)
shared_index = hate_with_aapl_var.dropna().index

In [None]:
# Display the months shared by the hate-share table and the AAPL variance.
shared_index

In [None]:
# Trim every series to the shared months so they all cover the same rows.
# (.loc raises a KeyError if a series lacks one of the shared months.)
hate_share_monthly = hate_share_monthly.loc[shared_index]
aapl_var, aapl_spread = aapl_var.loc[shared_index], aapl_spread.loc[shared_index]
msft_var, msft_spread = msft_var.loc[shared_index], msft_spread.loc[shared_index]

***Standardize data***

In [None]:
def standardize(data: pd.Series) -> pd.Series:
    """Centre a series on its median and scale it by its standard deviation.

    Note that the centre is the median, not the mean, so the result is not a
    classical z-score; its standard deviation is still 1.

    Parameters
    ----------
    data : pd.Series
        Values to rescale.

    Returns
    -------
    pd.Series
        ``(data - median) / std`` with the same index as ``data``.
    """
    centre = data.median()
    scale = data.std()
    return data.sub(centre).div(scale)

In [None]:
# Put every series on a comparable scale: each hate-share column and each
# stock statistic is standardized independently.
hate_share_monthly = hate_share_monthly.apply(standardize)
aapl_var, aapl_spread = standardize(aapl_var), standardize(aapl_spread)
msft_var, msft_spread = standardize(msft_var), standardize(msft_spread)

In [None]:
# Collect the standardized statistics per ticker into two-column frames
# ("var", "spread") indexed like the corresponding spread series.
# NOTE: this rebinds aapl / msft, which held the raw price rows until now.
aapl = pd.DataFrame(
    {"var": aapl_var, "spread": aapl_spread},
    index=aapl_spread.index,
)
msft = pd.DataFrame(
    {"var": msft_var, "spread": msft_spread},
    index=msft_spread.index,
)

In [None]:
# Line plot of the standardized MSFT variance and spread over the shared months.
msft.plot.line()

***Calculate covariances***

In [None]:
# Covariance matrix of each ticker's statistics together with the hate shares
# of all four boards (frames are joined on their month index first).
msft_with_hate = msft.join(hate_share_monthly)
aapl_with_hate = aapl.join(hate_share_monthly)

msft_cov = msft_with_hate.cov()
aapl_cov = aapl_with_hate.cov()

***Plot heatmaps***

In [None]:
def heatmap(data):
    """Plot the cells of ``data`` below its main diagonal as a heatmap.

    The diagonal and everything above it are masked out. The colour scale is
    fixed to [-1, 1] and centred on 0, and cells are drawn square.

    Parameters
    ----------
    data : pd.DataFrame
        Matrix to plot; columns label the x axis and the index labels the
        y axis.

    Returns
    -------
    None
        The figure is shown via ``plt.show()``.
    """
    values = data.to_numpy()
    # Boolean mask, True = hidden: the upper triangle including the diagonal.
    hidden = np.triu(np.ones(values.shape, dtype=bool))
    with sb.axes_style("white"):
        sb.heatmap(
            values,
            mask=hidden,
            vmax=1.0,
            vmin=-1,
            center=0,
            square=True,
            # The x axis runs over columns, so label it from the columns.
            xticklabels=data.columns,
            yticklabels=data.index,
        )
        plt.show()

In [None]:
# Heatmap of the AAPL statistics / hate-share covariance matrix.
heatmap(aapl_cov)

In [None]:
# Heatmap of the MSFT statistics / hate-share covariance matrix.
heatmap(msft_cov)