In [None]:
import pandas as pd
import numpy as np
import seaborn as sb
from matplotlib import pylab as plt

***Load and filter weather data***

In [None]:
# Directory holding the per-station weather CSV exports. Kept in one place so
# the notebook can be pointed at another copy of the data with a single edit.
WEATHER_DIR = "/Users/vincent/Development/CSS/4chanAnalysis/usa_indicators"

# One raw frame per weather station.
weather_denver = pd.read_csv(f"{WEATHER_DIR}/weather_denver.csv")
weather_san_fran = pd.read_csv(f"{WEATHER_DIR}/weather_san_fran.csv")
weather_washington = pd.read_csv(f"{WEATHER_DIR}/weather_washington.csv")

In [None]:
# Stack the three station frames (same order as before) and drop columns that
# are empty for every station. pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0.
weather = pd.concat(
    [weather_denver, weather_washington, weather_san_fran]
).dropna(axis=1, how='all')

In [None]:
# Preview the first rows of the combined weather frame.
weather.head()

In [None]:
# List the columns of the combined weather frame.
weather.columns

In [None]:
# Station metadata columns to discard.
station_meta_cols = ['STATION', 'LATITUDE', 'LONGITUDE', 'ELEVATION']
# Every column whose name contains 'ATTRIBUTES'.
attribute_cols = [col for col in weather.columns if 'ATTRIBUTES' in col]
weather = weather.drop(columns=station_meta_cols + attribute_cols)

In [None]:
# Inspect the columns left after removing station metadata and attribute columns.
weather.columns

In [None]:
# Select columns with lists (a tuple can be read as a single MultiIndex key)
# and take explicit copies so later column assignments do not target a slice
# of `weather`.
temperatures = weather.loc[:, ["DATE", "NAME", "TAVG", "TMIN", "TMAX"]].copy()
sunshine = weather.loc[:, ["DATE", "NAME", "TSUN", "PSUN"]].copy()

In [None]:
# Collapse dates to monthly periods and cast the measurements to float64.
# Whole columns are replaced (instead of written through `.loc[:, col] = ...`)
# so each column takes the dtype of its new values.
temperatures = temperatures.assign(
    DATE=pd.to_datetime(temperatures["DATE"]).dt.to_period("M")
).astype({"TAVG": np.float64, "TMIN": np.float64, "TMAX": np.float64})
# Convert the sunshine measurements the same way as the temperature columns.
sunshine = sunshine.assign(
    DATE=pd.to_datetime(sunshine["DATE"]).dt.to_period("M")
).astype({"TSUN": np.float64, "PSUN": np.float64})

In [None]:
# Preview the temperature frame after the date and dtype conversion.
temperatures.head()

Create averages

In [None]:
# Average the measurements across stations for each month. numeric_only=True
# keeps the string NAME column out of the mean; without it pandas >= 2.0
# raises a TypeError.
temperatures_avg = temperatures.groupby("DATE").mean(numeric_only=True)
temperatures_avg.head()

In [None]:
# Show the distinct station names present in the NAME column.
sunshine.loc[:,"NAME"].unique()

Separate by region

In [None]:
# Station labels as they appear in the NAME column.
denver_station = 'DENVER CENTRAL PARK, CO US'
washington_station = 'WASHINGTON REAGAN NATIONAL AIRPORT, VA US'
san_francisco_station = 'SAN FRANCISCO INTERNATIONAL AIRPORT, CA US'

# One temperature frame per station, without the NAME column.
temperatures_denv = temperatures[temperatures["NAME"] == denver_station].drop(columns="NAME")
temperatures_wash = temperatures[temperatures["NAME"] == washington_station].drop(columns="NAME")
temperatures_sanfran = temperatures[temperatures["NAME"] == san_francisco_station].drop(columns="NAME")

# One sunshine frame per station, without the NAME column.
sunshine_denv = sunshine[sunshine["NAME"] == denver_station].drop(columns="NAME")
sunshine_wash = sunshine[sunshine["NAME"] == washington_station].drop(columns="NAME")
sunshine_sanfran = sunshine[sunshine["NAME"] == san_francisco_station].drop(columns="NAME")

In [None]:
# Preview the Denver sunshine frame.
sunshine_denv.head()

In [None]:
# Move DATE from a column into the index of every per-station frame.
temperatures_denv = temperatures_denv.set_index("DATE")
temperatures_wash = temperatures_wash.set_index("DATE")
temperatures_sanfran = temperatures_sanfran.set_index("DATE")
sunshine_denv = sunshine_denv.set_index("DATE")
sunshine_wash = sunshine_wash.set_index("DATE")
sunshine_sanfran = sunshine_sanfran.set_index("DATE")


In [None]:
# Preview the Denver sunshine frame, now indexed by DATE.
sunshine_denv.head()

In [None]:
# Rebind `sunshine` and `temperatures` to per-station lookups keyed by a short
# city code; from here on both names refer to dicts of DataFrames.
sunshine = dict(denv=sunshine_denv, sanfran=sunshine_sanfran, wash=sunshine_wash)
temperatures = dict(denv=temperatures_denv, sanfran=temperatures_sanfran, wash=temperatures_wash)

***Import counts***

In [None]:
# Paths to the per-board count CSVs, relative to the working directory.
# The pol board is split across two part files.
pol_counts = ["./Counts/pol_counts_part0.csv","./Counts/pol_counts_part1.csv"]
news_counts = "./Counts/news_counts.csv"
sci_counts = "./Counts/sci_counts.csv"
adv_counts = "./Counts/adv_counts.csv"

In [None]:
# Replace each path variable with the frame loaded from it; the first CSV
# column becomes the index. The pol part files are read in list order and
# stacked with pd.concat (DataFrame.append was removed in pandas 2.0).
pol_counts = pd.concat([pd.read_csv(part, index_col=0) for part in pol_counts])
sci_counts = pd.read_csv(sci_counts, index_col=0)
news_counts = pd.read_csv(news_counts, index_col=0)
adv_counts = pd.read_csv(adv_counts, index_col=0)

In [None]:
def countWords(text):
    """Return 1 if `text` is a string containing at least one word, else 0.

    Despite the name, the result is capped at 1, so it acts as a per-row
    indicator rather than a word count.

    Parameters
    ----------
    text : object
        Usually a string of space-separated words; any non-string value
        (for example NaN from an empty CSV cell) yields 0.

    Returns
    -------
    int
        1 when at least one word is present, otherwise 0.
    """
    if not isinstance(text, str):
        return 0
    # split() without an argument ignores runs of whitespace, so empty or
    # whitespace-only strings produce no words and are not counted.
    return min(1, len(text.split()))

In [None]:
# Label each frame's rows with its board name.
for board_name, board_frame in (
    ("pol", pol_counts),
    ("sci", sci_counts),
    ("adv", adv_counts),
    ("news", news_counts),
):
    board_frame["board"] = board_name

In [None]:
# Add the per-row match indicator, the timestamp as a datetime, and the
# calendar month to every board frame.
for board_frame in (pol_counts, sci_counts, adv_counts, news_counts):
    posted_at = pd.to_datetime(board_frame["timestamp"], unit='s')
    board_frame["hate_count"] = board_frame["matched_vocab"].map(countWords)
    board_frame["date"] = posted_at
    board_frame["month"] = posted_at.dt.to_period('M')

In [None]:
def _monthly_hate_share(counts):
    """Return, per month, the sum of `hate_count` divided by the number of non-null `content` rows."""
    by_month = counts.groupby("month")
    # Select the column before summing: summing the whole frame would also try
    # to aggregate the string and datetime columns.
    return by_month["hate_count"].sum() / by_month["content"].count()


pol_hate_share_monthly = _monthly_hate_share(pol_counts)
sci_hate_share_monthly = _monthly_hate_share(sci_counts)
adv_hate_share_monthly = _monthly_hate_share(adv_counts)
news_hate_share_monthly = _monthly_hate_share(news_counts)

In [None]:
# Combine the per-board monthly shares into one frame aligned to the months
# present for the sci board. Building the frame directly from the Series keeps
# the columns numeric instead of filling an empty object-dtype frame.
hate_share_monthly = pd.DataFrame(
    {
        "sci": sci_hate_share_monthly,
        "news": news_hate_share_monthly,
        "adv": adv_hate_share_monthly,
        "pol": pol_hate_share_monthly,
    },
    index=sci_hate_share_monthly.index,
)

Select the date range of interest

In [None]:
# Keep only the months covered by the hate-share data, then drop rows with
# missing measurements.
wanted_months = hate_share_monthly.index
for k, temp in temperatures.items():
    # `.loc` with a list of labels raises a KeyError if any label is missing,
    # so look up only the months this station actually has.
    available_months = wanted_months[wanted_months.isin(temp.index)]
    temperatures[k] = temp.loc[available_months].dropna()

In [None]:
# Align the cross-station averages to the hate-share months. reindex yields
# NaN rows for months without weather data, where `.loc` would raise a KeyError.
temperatures_avg = temperatures_avg.reindex(hate_share_monthly.index)

In [None]:
# Number of non-null values per column remaining for Denver.
temperatures["denv"].count()

***Standardize data***

In [None]:
def standardize(data: pd.Series) -> pd.Series:
    """Center `data` on its median and scale it by its standard deviation.

    Note that the centre is the median, not the mean, so the result is not a
    classical z-score unless the two coincide.
    """
    center = data.median()
    spread = data.std()
    return (data - center) / spread

In [None]:
# Standardize every column of the hate-share frame, of each per-station
# temperature frame, and of the cross-station average.
hate_share_monthly = hate_share_monthly.apply(standardize, axis=0)
temperatures = {
    city: frame.apply(standardize, axis=0) for city, frame in temperatures.items()
}
temperatures_avg = temperatures_avg.apply(standardize)

***Plot temperatures***

In [None]:
# Line plot of the standardized Denver temperature columns.
temperatures["denv"].plot.line()

***Calculate covariances***

In [None]:
# Covariance matrix of each station's temperature columns joined with the
# monthly hate shares (joined on the shared index).
temp_denv_cov, temp_wash_cov, temp_sanfran_cov = (
    temperatures[city].join(hate_share_monthly).cov()
    for city in ("denv", "wash", "sanfran")
)

In [None]:
# Covariance matrix of the cross-station averages joined with the hate shares.
avg_with_hate_share = temperatures_avg.join(hate_share_monthly)
temperatures_avg_cov = avg_with_hate_share.cov()

***Plot covariances***

In [None]:
def heatmap(data):
    """Show the strict lower triangle of a square matrix as a heatmap.

    Parameters
    ----------
    data : pd.DataFrame
        Two-dimensional frame (for example a covariance matrix). Column
        labels are used for the x axis and index labels for the y axis.

    The colour scale is fixed to [-1, 1] and centred on 0. The diagonal and
    everything above it are masked out. The figure is shown immediately and
    nothing is returned.
    """
    values = data.to_numpy()
    # Boolean mask where True hides a cell: hide the diagonal and upper triangle.
    mask = np.zeros_like(values, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    with sb.axes_style("white"):
        sb.heatmap(
            values,
            mask=mask,
            vmax=1.0,
            vmin=-1,
            center=0,
            square=True,
            xticklabels=data.columns,
            yticklabels=data.index,
        )
        plt.show()

In [None]:
# Heatmap of the Denver temperature / hate-share covariance matrix.
heatmap(temp_denv_cov)

In [None]:
# Line plot of the standardized cross-station average temperatures.
temperatures_avg.plot.line()

In [None]:
# Heatmap of the average-temperature / hate-share covariance matrix.
heatmap(temperatures_avg_cov)