In [38]:
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import numpy as np
import pandas as pd
import pickle

In [69]:
n_posts = pd.read_csv("Counts/n_posts.csv")
n_posts.month = pd.to_datetime(n_posts.month, format="%d/%m/%Y")
n_posts = n_posts.drop("Unnamed: 0", axis=1)
n_posts.set_index('month', inplace=True)
print(n_posts)

            n_posts_sci  n_posts_news  n_posts_pol  n_posts_adv
month                                                          
2010-05-01         4104             0            0            0
2010-06-01         6883             0            0            0
2010-07-01         6323             0            0            0
2010-08-01         4547             0            0            0
2010-09-01         4701             0            0            0
...                 ...           ...          ...          ...
2021-08-01         3148          2407        12439         2205
2021-09-01         3996          1370        16578         5097
2021-10-01         2994          1152        10748         5984
2021-11-01         2953          1939        15141         4096
2021-12-01          849           382         5951         1148

[140 rows x 4 columns]


In [71]:
fig = go.Figure()
normalize = True

for i, board in enumerate(['sci', 'news', 'pol', 'adv']):
    
    # load file
    if board=='pol':
        df0 = pd.read_csv("Counts/pol_counts_part0.csv")
        df1 = pd.read_csv("Counts/pol_counts_part1.csv")
        
        df = pd.concat([df0, df1], axis=0, ignore_index=True)
        
    else:
        df = pd.read_csv(f"Counts/{board}_counts.csv")
        
    # remove post if no hate words
    df = df.dropna(how='any', axis=0)
        
    # matched vocabulary into a list
    df['matched_vocab'] = df.matched_vocab.apply(lambda x: x.split(','))
        
    # add column with length of that list
    df['length'] = df.matched_vocab.str.len()
        
    # change unix timestamp to pandas readable date and add column for month
    df['date'] = pd.to_datetime(df.timestamp, unit='s')
    df['month'] = pd.to_datetime(df.date).dt.to_period('M')
        
    # group posts by month and plot histogram
    grouped = df.groupby('month')['length'].count()
    grouped = grouped.reset_index()
    
    dates = grouped['month'].dt.to_timestamp()  # from period object back to timestamp for plotly compatibility
    data = grouped['length']
    
    df_final = pd.concat([dates, data], axis=1)
    if normalize:
        df_final.set_index('month', inplace=True)
        df_final['divisor'] = n_posts[f"n_posts_{board}"]
        print(df_final)
        df_final.length /= df_final.divisor
        data = df_final.length
        print(data)
    
    # Add to plot. Hidden until selected from legend 
    fig.add_trace(go.Bar(x=dates, y=data, name=board, visible='legendonly'))
    
# Add slider to select range of x axis
fig.update_layout(
    xaxis=dict(
        rangeselector=dict(
            buttons=list([
                dict(count=1,
                     label="1m",
                     step="month",
                     stepmode="backward"),
                dict(count=6,
                     label="6m",
                     step="month",
                     stepmode="backward"),
                dict(count=1,
                     label="YTD",
                     step="year",
                     stepmode="todate"),
                dict(count=1,
                     label="1y",
                     step="year",
                     stepmode="backward"),
                dict(step="all")
            ])
        ),
        rangeslider=dict(
            visible=True
        ),
        type="date"
    ),
    yaxis_title='Hateful words count'
)

fig.show()
if normalize:
    fig.write_html("normalized_histograms.html")
else:
    fig.write_html("unnormalized_histograms.html")

            length  divisor
month                      
2010-05-01     265     4104
2010-06-01     417     6883
2010-07-01     475     6323
2010-08-01     318     4547
2010-09-01     364     4701
...            ...      ...
2021-08-01     336     3148
2021-09-01     378     3996
2021-10-01     329     2994
2021-11-01     329     2953
2021-12-01      82      849

[135 rows x 2 columns]
month
2010-05-01    0.064571
2010-06-01    0.060584
2010-07-01    0.075123
2010-08-01    0.069936
2010-09-01    0.077430
                ...   
2021-08-01    0.106734
2021-09-01    0.094595
2021-10-01    0.109886
2021-11-01    0.111412
2021-12-01    0.096584
Name: length, Length: 135, dtype: float64
            length  divisor
month                      
2015-11-01      19      203
2015-12-01      66      654
2016-01-01      85      865
2016-02-01      59      541
2016-04-01      28      285
...            ...      ...
2021-08-01     297     2407
2021-09-01     171     1370
2021-10-01     171     1152
202