In [76]:
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import numpy as np
import pandas as pd
import pickle
import json
import itertools

In [2]:
# Total number of posts per month and per board, indexed by month
n_posts = (
    pd.read_csv("Counts/n_posts.csv")
    # the 'month' column is stored as day/month/year text
    .assign(month=lambda frame: pd.to_datetime(frame["month"], format="%d/%m/%Y"))
    # discard the "Unnamed: 0" column that comes along with the CSV
    .drop(columns="Unnamed: 0")
    .set_index("month")
)
print(n_posts)

            n_posts_sci  n_posts_news  n_posts_pol  n_posts_adv
month                                                          
2010-05-01         4104             0            0            0
2010-06-01         6883             0            0            0
2010-07-01         6323             0            0            0
2010-08-01         4547             0            0            0
2010-09-01         4701             0            0            0
...                 ...           ...          ...          ...
2021-08-01         3148          2407        12439         2205
2021-09-01         3996          1370        16578         5097
2021-10-01         2994          1152        10748         5984
2021-11-01         2953          1939        15141         4096
2021-12-01          849           382         5951         1148

[140 rows x 4 columns]


In [111]:
# Read the Hatebase.org dictionary from disk
with open('Data/hate_vocabulary.json', 'r') as f:
    hate_dict = json.load(f)

# The JSON is nested one level deep: flatten it into a single list of entries
hate_dict = [entry for group in hate_dict for entry in group]

# Spellings found in the matched posts -> spelling used for the dictionary lookup
_TERM_ALIASES = {
    'sol': 'sole',
    'ar': 'ars',
    'dome': 'domes',
    'chimpout': 'chimp-out',
    'chimpouts': 'chimp-outs',
    'beany': 'beaney',
    'moxie': 'moxies',
    'cooly': 'coolies',
}

# (flag, label) pairs in priority order: the first truthy flag decides the category
_CATEGORY_FLAGS = (
    ('is_about_nationality', 'Nationality'),
    ('is_about_ethnicity', 'Ethnicity'),
    ('is_about_religion', 'Religion'),
    ('is_about_gender', 'Gender'),
    ('is_about_sexual_orientation', 'Sexuality'),
    ('is_about_disability', 'Disability'),
    ('is_about_class', 'Class'),
)


def category(hate_term, vocabulary=None):
    '''Search a term in the hate-speech dictionary and return its category.

    Parameters
    ----------
    hate_term : str
        Term to look up. It is compared as-is against the lower-cased
        ``"term"`` field of every entry, after a handful of spellings have
        been mapped to the spelling used for the lookup.
    vocabulary : iterable of dict, optional
        Entries to search; each needs a ``"term"`` key and may carry the
        ``is_about_*`` flags. Defaults to the module-level ``hate_dict``.

    Returns
    -------
    str
        'Nationality', 'Ethnicity', 'Religion', 'Gender', 'Sexuality',
        'Disability' or 'Class' -- decided by the first truthy flag in that
        order -- or 'Other' when no flag is set.

    Raises
    ------
    KeyError
        If no entry of the vocabulary matches the term.
    '''
    if vocabulary is None:
        vocabulary = hate_dict

    hate_term = _TERM_ALIASES.get(hate_term, hate_term)

    # first matching entry wins; a default avoids leaking a bare StopIteration
    entry = next(
        (item for item in vocabulary if item["term"].lower() == hate_term),
        None,
    )
    if entry is None:
        raise KeyError(f"term {hate_term!r} not found in the hate vocabulary")

    # The flags must be read from the entry itself: testing a non-empty list
    # literal such as ['is_about_disability'] is always true and would make
    # 'Class' and 'Other' unreachable.
    for flag, label in _CATEGORY_FLAGS:
        if entry.get(flag):
            return label
    return "Other"
    

In [None]:
# One bar trace per board goes into `fig`; a per-category histogram is
# produced, shown and saved for every board inside the loop.
fig = go.Figure()
normalize = True  # True: plot percentage of all posts, False: plot raw counts

for board in ['sci', 'news', 'pol', 'adv']:

    # load file ('pol' is stored in two parts)
    if board == 'pol':
        df = pd.concat(
            [
                pd.read_csv("Counts/pol_counts_part0.csv"),
                pd.read_csv("Counts/pol_counts_part1.csv"),
            ],
            axis=0,
            ignore_index=True,
        )
    else:
        df = pd.read_csv(f"Counts/{board}_counts.csv")

    # remove unnecessary column, then drop every row with a missing value
    # NOTE(review): presumably this keeps only posts with matched hate words -- confirm
    df = df.drop("content", axis=1).dropna(how='any', axis=0)

    # change unix timestamp to pandas readable date and add column for month
    df['date'] = pd.to_datetime(df['timestamp'], unit='s')
    df['month'] = df['date'].dt.to_period('M')

    # number of remaining posts per month
    data = df.groupby('month')['id'].count()
    # from period back to timestamp for plotly compatibility
    data.index = data.index.to_timestamp()
    dates = data.index

    # if we want percentage, divide by the board's monthly total
    if normalize:
        total_posts = n_posts[f"n_posts_{board}"].reindex(dates)
        data = data / total_posts * 100

    # Add to figure. Hidden until selected from legend
    fig.add_trace(go.Bar(x=dates, y=data, name=board, visible='legendonly'))

    ### For second plot, we look into the category of each slur
    # matched vocabulary into a list, and split list into different rows
    df['matched_vocab'] = df['matched_vocab'].str.split(' ')
    df = df.explode('matched_vocab')

    # add column for category of hate speech; category() scans the whole
    # dictionary, so resolve each distinct term once and map the result back
    term_to_category = {term: category(term) for term in df['matched_vocab'].unique()}
    df['category'] = df['matched_vocab'].map(term_to_category)
    df['dates'] = df['month'].dt.to_timestamp()

    # plot histogram
    fig2 = px.histogram(df, x='dates', color='category', labels={'dates': 'Time', 'category': 'Category'}, title=f'{board}')
    fig2.update_layout(title_x=0.5)
    fig2.show()
    fig2.write_html(f"{board}_in_categories.html")
    
# Range-selector buttons and a range slider for the date x axis
fig.update_layout(
    xaxis={
        "rangeselector": {
            "buttons": [
                {"count": 1, "label": "1m", "step": "month", "stepmode": "backward"},
                {"count": 6, "label": "6m", "step": "month", "stepmode": "backward"},
                {"count": 1, "label": "YTD", "step": "year", "stepmode": "todate"},
                {"count": 1, "label": "1y", "step": "year", "stepmode": "backward"},
                {"step": "all"},
            ],
        },
        "rangeslider": {"visible": True},
        "type": "date",
    }
)

# Axis title and output file depend on whether the counts were normalized
fig.update_layout(
    yaxis_title='Percentage of hateful posts' if normalize else 'Number hateful posts'
)
fig.show()
fig.write_html("normalized_histograms.html" if normalize else "unnormalized_histograms.html")
