In [None]:
# import statements
import ast
import json
from collections import Counter
from io import StringIO

import pandas as pd
from nltk.tokenize import word_tokenize

## File Overview

RQ2 involves tracking the toxicity levels of the subreddit over time.

To do so, we will use the [lexicon dictionaries](https://github.com/miriamfs/WebSci2019) from the paper "Exploring Misogyny across the Manosphere in Reddit" by Farrell et al., published in *WebSci '19: Proceedings of the 10th ACM Conference on Web Science*. 

We will compare the comments in the collected dataset against the lexicon dictionaries by counting how often each comment contains words that are part of each dictionary. Tracking these counts over time will allow us to see the impact of the quarantine on toxicity. 

We must first prepare the data.

The overall process is as follows: 

    1. Load the lexicon dictionaries
    2. Tokenize the comments
    3. Count frequency of dictionary terms in comments dataset
    4. Convert counts to a usable dataframe
    5. Merge count dataframe with comments dataset
    6. Export dataframe to be used in analysis/visualization

###  1. Load the lexicon dictionaries

In [None]:
# get dict of dictionary number: dictionary names from lexicon dictionary.
# The file holds a Python literal, parsed safely with ast.literal_eval.
# Use a context manager so the file handle is closed as soon as it is read.
with open("Lexicon_names.txt", "r") as file:
    contents = file.read()
lexicon_names = ast.literal_eval(contents)

In [None]:
# get dict of dictionary number: words in dictionary from lexicon dictionary.
# The file holds a Python literal, parsed safely with ast.literal_eval.
# Use a context manager so the file handle is closed as soon as it is read.
with open("Lexicon_values.txt", "r") as file:
    contents = file.read()
lexicon_values = ast.literal_eval(contents)

### 2. Tokenize the comments, 3. Count frequency of dictionary terms in comments dataset

In [None]:
# load the filtered Reddit comments into a dataframe
filtered_comments = pd.read_json(path_or_buf='filtered_comments.json')

In [None]:
# For each Reddit comment, count how many of its tokens also appear in each
# of the lexicon dictionaries. The result is one Counter per comment, keyed
# by lexicon dictionary number, in the same order as filtered_comments.

# Testing membership against a list re-scans the whole list for every token.
# Build a set per lexicon once, outside the loops, so each lookup is O(1).
# Values that are not plain word collections are left untouched so the
# original `in` semantics are preserved for them.
lexicon_sets = {
    key: frozenset(value) if isinstance(value, (list, tuple, set)) else value
    for key, value in lexicon_values.items()
}

counts = []

# for each comment, tokenize to the word level
for row in filtered_comments['body']:
    cnt = Counter()

    # for each word, see if it is in each dictionary and increment the counter
    # NOTE(review): matching is case-sensitive and token-by-token; confirm the
    # lexicon casing matches the comment text and has no multi-word terms.
    for word in word_tokenize(row):
        for key, terms in lexicon_sets.items():
            if word in terms:
                cnt[key] += 1

    # save the count for each row to be merged later
    counts.append(cnt)
               

### 4. Convert counts to a usable dataframe

In [None]:
# convert the list of counts into a dataframe using json:
# one row per comment, one column per lexicon key that was matched at least once
count_test = json.dumps(counts)

# pandas >= 2.1 deprecates passing a literal JSON string to read_json,
# so hand it a file-like object instead
count_df = pd.read_json(StringIO(count_test))

In [None]:
# preview the first rows of the raw counts
count_df.head(n=5)

In [None]:
# Reorganize the counted data so it merges cleanly with filtered_comments:
#   1. order the columns,
#   2. expose the row index as a regular 'index' column,
#   3. rename the lexicon dictionary numbers to the dictionary names.
count_df = (
    count_df
    .sort_index(axis=1)
    .reset_index()
    .rename(columns=lexicon_names)
)
count_df.head()

### 5. Merge count dataframe with comments dataset

In [None]:
# Reshape the counts from wide to long so they merge more easily with the
# comments df; a longer df is better than a wider df for analysis.

# every column except the first is melted into variable/value pairs
value_columns = count_df.columns[1:]
counted = count_df.melt(id_vars=['index'], value_vars=value_columns)

In [None]:
# replace missing counts in the 'value' column with 0 so they can be used in analysis
counted = counted.fillna(value={'value': 0})

In [None]:
# total number of matches per lexicon dictionary across all comments
counted[['variable', 'value']].groupby('variable').sum()

In [None]:
# get filtered_comments ready for merge and then merge.
# The 'index' values in `counted` come from the default 0..n-1 index of
# count_df, i.e. the position of each comment in filtered_comments. Discard
# whatever index labels were loaded from the JSON file (drop=True) and expose
# the positional index as the 'index' column so both sides share the same key.
filtered_comments = filtered_comments.reset_index(drop=True).reset_index()

# join explicitly on 'index' rather than implicitly on every column name the
# two frames happen to share
comments_rq2 = filtered_comments.merge(right=counted, on='index', how='inner')

### 6. Export dataframe to be used in analysis/visualization

In [None]:
# write the merged comments + lexicon counts to disk for the analysis notebooks
comments_rq2.to_json(path_or_buf='./comments_rq2.json')