## Reddit comments
### Generate PCA based "toxicity" score

In this notebook, I create a new comment variable "pca_score" by running a PCA on each comment's log-transformed vote score, number of replies, and the author's comment karma.

pca_score is a standardized (-5 to 5 range) score based on comment features that indicates whether the comment was positively or negatively received by other users. A highly negative comment score could be deemed "toxic".

pca_score will be used to create the toxic vs non-toxic comment label for supervised training of the toxic comment classifiers.

The comments were downloaded from the target subreddits using PRAW and a custom script. Each comment has the following associated features:

- comment ID#
- subreddit name
- post ID#
- parent ID#
- comment timestamp
- comment age since post time (secs)
- comment age since now (secs)
- user ID#
- user name
- user account created date
- user account comment karma
- user account link karma
- #replies to the comment
- controversial flag state
- comment vote score
- comment text (converted to ascii)



### Setup notebook and load reddit comment data

In [1]:
# remove warnings
import warnings
warnings.filterwarnings('ignore')
# ---

%matplotlib inline
from matplotlib import pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')

import pandas as pd
pd.options.display.max_columns = 100

import numpy as np
import datetime
import time
import csv
import seaborn as sns

In [9]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression

def get_best_PCA_score(df, cols2use):
    """Return the principal component that best tracks the vote score.

    A PCA with ``len(cols2use) - 1`` components is fitted on ``df[cols2use]``.
    Each component is then regressed on ``df['score']`` and the component with
    the highest R^2 is kept.  If the slope of that regression is negative the
    component is sign-flipped, so larger returned values go together with
    larger vote scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in ``cols2use`` plus ``'score'``.
    cols2use : list of str
        Feature columns fed to the PCA (at least two).

    Returns
    -------
    numpy.ndarray
        Column vector of shape ``(len(df), 1)`` holding the selected component.

    Raises
    ------
    ValueError
        If ``cols2use`` names fewer than two columns, since no component
        would be left to choose from.
    """
    numdims = len(cols2use) - 1
    if numdims < 1:
        raise ValueError('cols2use must name at least two columns')

    pcascores = PCA(n_components=numdims).fit_transform(df[cols2use])
    X = df['score'].values.reshape(-1, 1)

    bestR2 = 0
    slopesign = 0
    bestPCA = None
    for i in range(numdims):
        y = pcascores[:, i].reshape(-1, 1)
        reg = LinearRegression().fit(X, y)
        R2 = abs(reg.score(X, y))
        # The first component is always accepted as the initial candidate so
        # that an array is returned even when no component correlates with
        # the score (previously an empty list was returned in that case).
        # Later components only replace it on a strictly better fit.
        if bestPCA is None or bestR2 < R2:
            bestR2 = R2
            slopesign = np.sign(reg.coef_[0][0])
            bestPCA = y

    # orient the component so that it increases with the vote score
    if slopesign < 0:
        return -bestPCA
    return bestPCA


def _signed_log(values):
    """Sign-preserving natural log: log(x) for x > 0, -log(|x|) for x < 0, 0 stays 0."""
    arr = np.array(values, dtype=float)  # float copy, the input is left untouched
    pos = arr > 0
    neg = arr < 0
    arr[pos] = np.log(arr[pos])
    arr[neg] = -np.log(-arr[neg])
    return arr


def create_comment_score(df, subname, outputpath, verbose=True):
    """Compute a standardized ``pca_score`` per comment and write it to CSV.

    Steps:
      1. add ``score_sign`` and ``u_days`` columns to ``df``;
      2. log-transform comment karma, reply count and vote score on a copy;
      3. give the reply count the sign of the vote score;
      4. pick the best principal component via ``get_best_PCA_score``;
      5. divide it by its standard deviation, clip to [-5, 5] and save.

    Parameters
    ----------
    df : pandas.DataFrame
        Comment table.  The columns read here are ``score``, ``time``,
        ``u_created``, ``u_comment_karma`` and ``num_replies``
        (``u_link_karma`` as well when ``verbose`` is true).
        NOTE: ``score_sign`` and ``u_days`` are added to this frame in place.
    subname : str
        Subreddit name, used only in plot titles.
    outputpath : str
        Destination CSV path for the labeled comments.
    verbose : bool, default True
        When true, show the diagnostic histograms, correlation heatmap,
        score-vs-PCA scatter plot and final score histogram.

    Returns
    -------
    None
        The result is written to ``outputpath``.
    """
    # get sign of vote score
    df['score_sign'] = (df.score < 0).map({True: 'negative', False: 'positive'})

    # age of the user account at the time of the comment, converted from secs to days
    df['u_days'] = (df.time - df.u_created) / 86400

    # specify which feature columns to analyze
    cols2compare = ['u_comment_karma', 'u_link_karma', 'num_replies',
                    'u_days', 'score']

    # plot freq histograms for selected features
    if verbose:
        plt.figure(figsize=(15, 8))
        for plotnum, colname in enumerate(cols2compare):
            std3 = 3*df[colname].std()
            plt.tight_layout()
            plt.subplot(2, 3, plotnum+1)
            plt.hist(df[colname], range=(-std3, std3), log=True)
            plt.title(colname)
        plt.tight_layout(rect=[0, 0, 1, 0.90]);
        plt.suptitle(subname+'- original feature histograms, log y scale', fontsize=20);
        plt.show();

    # log transform features on a copy so the original values survive for the output
    df_log = df.copy()

    # Whole-column assignment instead of chained `df_log.col[mask] = ...`:
    # the chained form writes floats into integer columns and is a silent
    # no-op when pandas copy-on-write is active.
    # comment karma and vote score can be negative -> sign-preserving log
    df_log['u_comment_karma'] = _signed_log(df_log['u_comment_karma'])
    df_log['score'] = _signed_log(df_log['score'])

    # reply counts: only strictly positive values are log transformed
    replies = np.array(df_log['num_replies'], dtype=float)
    has_replies = replies > 0
    replies[has_replies] = np.log(replies[has_replies])
    df_log['num_replies'] = replies

    # make numreplies negative if vote score is negative.
    # The mask is taken from the ORIGINAL score: after the log transform a
    # score of -1 becomes -0.0, which does not compare as < 0, so those
    # comments would otherwise keep a positive reply count.
    negative_vote = (df['score'] < 0).values
    df_log['num_replies_fixed'] = np.where(negative_vote, -replies, replies)

    # plot correlation matrix of the selected features
    if verbose:
        # Generate a mask for the upper triangle
        # (builtin bool: the np.bool alias was removed from NumPy)
        mask = np.zeros([len(cols2compare), len(cols2compare)], dtype=bool)
        mask[np.triu_indices_from(mask)] = True
        # Generate a custom diverging colormap
        cmap = sns.diverging_palette(220, 10, as_cmap=True)
        plt.figure(figsize=(7, 7))
        sns.heatmap(df_log[cols2compare].corr(), mask=mask, cmap=cmap,
                    center=0, annot=True, fmt='1.3f',
                    square=True, linewidths=.5, cbar_kws={"shrink": .5})
        plt.title('Subreddit '+subname+': selected feature correlations, all samples\n');
        plt.show();

    # do PCA analysis
    cols2use = ['score', 'num_replies_fixed', 'u_comment_karma']
    bestscore = get_best_PCA_score(df_log, cols2use)

    if verbose:
        plt.figure(figsize=(5, 5))
        plt.scatter(df_log['score'], bestscore, color='blue', s=1)
        plt.tight_layout();
        plt.xlabel('Log vote score')
        plt.ylabel('PCA2 score')
        plt.title('Vote score vs PCA2 score');
        plt.show();

    # create output dataframe
    df_labeled = df.copy()  # copy original (non log-transformed) dataframe

    # use the selected principal component as the score; it arrives as an
    # (n, 1) column vector, so flatten it before storing it as a column
    raw = np.asarray(bestscore, dtype=float).ravel()

    # standardize by the sample standard deviation (pandas default, ddof=1).
    # Exact zeros are left untouched so a degenerate all-zero component does
    # not turn into NaN through 0/0.
    std = pd.Series(raw).std()
    scaled = raw.copy()
    nonzero = raw != 0
    scaled[nonzero] = raw[nonzero] / std

    # threshold scores so that we have a range of +/-5
    df_labeled['pca_score'] = np.clip(scaled, -5, 5)

    if verbose:
        plt.figure(figsize=(10, 5))
        df_labeled.pca_score.hist(bins=40)
        plt.title(subname+'- standardized and ranged PCA score distribution')
        plt.show();

    df_labeled.to_csv(outputpath)


In [10]:
import glob

# the subreddits I'll be analyzing
subnames = ['aww', 'funny', 'todayilearned','askreddit',
           'photography', 'gaming', 'videos', 'science',
           'politics', 'politicaldiscussion',             
           'conservative', 'the_Donald']

srcdir = './data_collected/'
destdir = './data_scored/'

# For every subreddit: merge all of its collected comment files, clean them,
# then compute and save the PCA-based comment score.
for subname in subnames:
    print('\nscoring',subname)

    # sorted() makes the load order (and so the output row order) deterministic
    filenames = sorted(glob.glob(srcdir+'comment_sample_'+subname+'*'))
    if not filenames:
        # pd.concat raises on an empty list; skip instead of aborting the whole run
        print('   no comment files found, skipping')
        continue

    dfs = []
    for filename in filenames:
        print('  ',filename)
        dfs.append(pd.read_csv(filename))

    # drop rows that are exact duplicates across files, then rows with missing values
    df = pd.concat(dfs).drop_duplicates().dropna()
    # remove any deleted or removed comments; .copy() because
    # create_comment_score adds columns to the frame it receives
    df = df[(df.text!='[deleted]') & (df.text!='[removed]')].copy()
    if df.empty:
        print('   no usable comments after cleaning, skipping')
        continue

    outputpath = destdir+'comment_sample_'+subname+'_scored.csv'
    create_comment_score(df, subname, outputpath, verbose=False)
    


scoring aww
   ./data_collected\comment_sample_aww190309_213310.csv
   ./data_collected\comment_sample_aww190310_215841.csv

scoring funny
   ./data_collected\comment_sample_funny190329_191231.csv

scoring todayilearned
   ./data_collected\comment_sample_todayilearned190316_224053.csv

scoring askreddit
   ./data_collected\comment_sample_askreddit190330_083946.csv

scoring photography
   ./data_collected\comment_sample_photography190311_124954.csv
   ./data_collected\comment_sample_photography190311_171008.csv
   ./data_collected\comment_sample_photography190405_200018.csv
   ./data_collected\comment_sample_photography190405_220404.csv

scoring gaming
   ./data_collected\comment_sample_gaming190401_091528.csv
   ./data_collected\comment_sample_gaming190401_201437.csv

scoring videos
   ./data_collected\comment_sample_videos190402_074146.csv

scoring science
   ./data_collected\comment_sample_science190330_225904.csv

scoring politics
   ./data_collected\comment_sample_politics190220_0