Loading the training data might take some time. (We can load a select number of rows for quick analysis)

In [1]:
import pandas as pd
# Remote CSV holding the training data.
TRAIN_CSV_URL = 'https://media.githubusercontent.com/media/jsakhnin/JigsawNLP_data/master/train.csv'
# Number of rows to read. None reads the whole file; set an int
# (e.g. 100_000) to load only the first rows for a quicker analysis.
N_ROWS = None

train = pd.read_csv(TRAIN_CSV_URL, nrows=N_ROWS)
# Preview the first rows to confirm the load worked.
train.head()

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
2,59852,0.0,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
3,59855,0.0,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
4,59856,0.893617,haha you guys are a bunch of losers.,0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,...,2006,rejected,0,0,0,1,0,0.0,4,47


I really like the way this notebook does the data analysis and data processing
https://www.kaggle.com/nz0722/simple-eda-text-preprocessing-jigsaw.

Importing libraries we need:

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

## New Features

We add new features to the dataset based on existing features (e.g. length of comment, number of capital letters, number of exclamation marks, etc). Some of this data could be correlated with toxicity.

In [None]:
# Engineered text-statistics features. Each one is derived from the raw
# `comment_text` string and stored as a new column on `train`.
comments = train['comment_text']

# Length of the comment in characters
train['total_length'] = comments.apply(len)
# Number of upper-case characters
train['capitals'] = comments.apply(lambda comment: sum(1 for c in comment if c.isupper()))
# Ratio of upper-case characters to comment length. Column-wise division
# avoids a slow row-by-row apply and gives NaN (instead of raising
# ZeroDivisionError) when a comment is empty.
train['caps_vs_length'] = train['capitals'] / train['total_length']

# Number of exclamation marks
train['num_exclamation_marks'] = comments.apply(lambda comment: comment.count('!'))
# Number of question marks
train['num_question_marks'] = comments.apply(lambda comment: comment.count('?'))
# Number of punctuation characters (. , ; :)
train['num_punctuation'] = comments.apply(lambda comment: sum(comment.count(w) for w in '.,;:'))
# Number of symbols (* & $ %)
train['num_symbols'] = comments.apply(lambda comment: sum(comment.count(w) for w in '*&$%'))

# Number of whitespace-separated words in each comment
train['num_words'] = comments.apply(lambda comment: len(comment.split()))
# Number of distinct words (case-sensitive, punctuation kept attached)
train['num_unique_words'] = comments.apply(lambda comment: len(set(comment.split())))
# Fraction of words that are unique (unique words / total words)
train['words_vs_unique'] = train['num_unique_words'] / train['num_words']

# Number of smiley faces in the comment
train['num_smilies'] = comments.apply(lambda comment: sum(comment.count(w) for w in (':-)', ':)', ';-)', ';)')))

We can measure the correlation of our newly created features with some of the original columns of the data.

In [None]:
# Engineered feature columns (table rows).
features = (
    'total_length', 'capitals', 'caps_vs_length', 'num_exclamation_marks',
    'num_question_marks', 'num_punctuation', 'num_words', 'num_unique_words',
    'words_vs_unique', 'num_smilies', 'num_symbols',
)

# Original dataset columns to correlate against (table columns).
columns = (
    'target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult',
    'threat', 'funny', 'wow', 'sad', 'likes', 'disagree', 'sexual_explicit',
    'identity_annotator_count', 'toxicity_annotator_count',
)

# One record per feature, mapping each original column to its correlation
# with that feature (pandas Series.corr with default settings).
rows = []
for feature_name in features:
    feature_values = train[feature_name]
    rows.append({label: feature_values.corr(train[label]) for label in columns})

train_correlations = pd.DataFrame(rows, index=features)

Here's what our correlation table looks like:

In [None]:
# Display the correlation table: one row per engineered feature,
# one column per original dataset column.
train_correlations

## Correlation with new Features

We can plot these correlations in a heatmap: (If we're not using the entire data, the heatmap may look incomplete)

In [None]:
import matplotlib.pyplot as plt
# Heatmap of the correlations between engineered features and original columns.
plt.figure(figsize=(10, 6))
sns.set(font_scale=1)
# The colour scale is limited to [-0.1, 0.1] and centred on zero.
ax = sns.heatmap(train_correlations, vmin=-0.1, vmax=0.1, center=0.0, cmap="YlGnBu")
# seaborn returns the Axes it drew on; take its parent Figure to write the image.
figure = ax.get_figure()
figure.savefig('HMAP_NewFeatures_Correlation.png', dpi=400)

## Correlation between Identities and Comment Label

In [None]:
# Column names at positions 8..31 of `train`.
# NOTE(review): assumes these 24 positions are exactly the identity columns - confirm against the CSV schema.
identities = tuple(train.iloc[:, 8:32])
# One record per identity column, mapping each column in `columns` to its
# correlation with that identity.
rows = [{c:train[f].corr(train[c]) for c in columns} for f in identities]
poptoxicity_correlations = pd.DataFrame(rows, index=identities)

plt.figure(figsize=(12, 8))
sns.set(font_scale=1)
# The colour scale is limited to [-0.1, 0.1] and centred on zero.
ax = sns.heatmap(poptoxicity_correlations, vmin=-0.1, vmax=0.1, center=0.0, cmap="YlGnBu")
# seaborn returns the Axes it drew on; take its parent Figure to write the image.
figure = ax.get_figure()
figure.savefig('HMAP_Identities_Correlation.png', dpi=400)

## Demographics Analysis

We can find a relationship between the average toxicity of comments and demographic groups. (This is not complete/accurate with a small sample of the data; we need to run it on the whole dataset.)

In [None]:
# Keep `target` plus the columns at positions 8..31, dropping every row
# that has a missing value in any of them.
# NOTE(review): assumes positions 8..31 are the identity columns - confirm against the CSV schema.
demographics = train.loc[:, ['target']+list(train)[slice(8,32)]].dropna()

identity_scores = demographics.iloc[:, 1:]
# Per identity: sum(identity score * target) divided by the number of
# comments whose score for that identity is positive.
weighted_toxic = (
    identity_scores.multiply(demographics.iloc[:, 0], axis="index").sum()
    / identity_scores[identity_scores > 0].count()
)
weighted_toxic = weighted_toxic.sort_values(ascending=False)

plt.figure(figsize=(30,20))
sns.set(font_scale=3)
ax = sns.barplot(x = weighted_toxic.values, y = weighted_toxic.index, alpha=0.8)
plt.ylabel('Demographics')
plt.xlabel('Weighted Toxic')
# Save before show(): once the figure has been shown it is released, and a
# later savefig() would write an empty image.
plt.savefig('Demographic_WeightedToxicity.png')
plt.show()

In [None]:
# Preview the first rows of the target + identity-column subset built above.
demographics.head()

Now we look at samples per category:

In [None]:
# Count, for every identity in `weighted_toxic.index`, how many comments with
# a positive score for that identity are toxic and how many are normal.
# Both lists are ordered like `weighted_toxic.index`.
TOXICITY_THRESHOLD = 0.5

# A comment is toxic at or above the threshold and normal strictly below it,
# so a target of exactly 0.5 is counted as toxic rather than dropped.
# Rows with a missing target match neither mask.
is_toxic = train['target'] >= TOXICITY_THRESHOLD
is_normal = train['target'] < TOXICITY_THRESHOLD

toxicCat = []
normalCat = []
for identity in weighted_toxic.index:
    mentions_identity = train[identity] > 0.0
    # Summing a boolean mask counts the matching rows without copying them.
    toxicCat.append(int((is_toxic & mentions_identity).sum()))
    normalCat.append(int((is_normal & mentions_identity).sum()))

Now "normalCat" and "toxicCat" contain the number of normal and toxic samples per category, respectively. They are ordered the same as "weighted_toxic.index".

In [None]:
# Echo the per-identity counts: normal first, then toxic.
for counts in (normalCat, toxicCat):
    print(counts)

# Convert both sequences to arrays and stack them into one 2-row matrix
# (row 0 = normal counts, row 1 = toxic counts).
normalCat, toxicCat = np.asarray(normalCat), np.asarray(toxicCat)
samplesPerCategory = np.vstack([normalCat, toxicCat])

# Report the shape of each array.
for arr in (normalCat, toxicCat, samplesPerCategory):
    print(arr.shape)

In [None]:

# Overlaid horizontal bars of sample counts per identity:
# normal counts in blue, then toxic counts drawn over them in red.
plt.figure(figsize=(30, 20))
sns.set(font_scale=3)
for sample_counts, bar_color in ((normalCat, "blue"), (toxicCat, "red")):
    ax = sns.barplot(x=sample_counts, y=weighted_toxic.index, alpha=0.8, color=bar_color)
plt.ylabel('Demographics')
plt.xlabel('Number of Samples')
# Write the image to disk before showing the figure.
plt.savefig('Demographic_Samples.png')
plt.show()

## Time-Series Plots

First we weight the data based on dates and labels

In [None]:
# Subset of `train`: creation date, target, and the 24 columns at positions
# 8..31; rows with a missing value in any of them are dropped.
identity_names = list(train.columns[8:32])
withdate = train.loc[:, ['created_date', 'target', *identity_names]].dropna()

# Scale each identity column by its own column total.
identity_block = withdate.iloc[:, 2:]
raceweighted = identity_block.div(identity_block.sum(), axis='columns')

# Weight every row by that row's target value.
race_target_weighted = raceweighted.mul(withdate.iloc[:, 1], axis="index")

# Truncate the creation timestamps to month precision for grouping.
month_stamps = pd.to_datetime(withdate['created_date']).values.astype('datetime64[M]')
race_target_weighted['created_date'] = month_stamps

# Sum the weighted values per month, in chronological order.
weighted_demo = race_target_weighted.groupby('created_date').sum().sort_index()

Editing Sizes of Plots:

In [None]:
# Font sizes shared by the plots that follow.
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 10, 16, 20

# (rc group, settings) pairs applied through plt.rc, in this order.
rc_overrides = (
    ('font', {'size': SMALL_SIZE}),                                     # default text size
    ('axes', {'titlesize': MEDIUM_SIZE, 'labelsize': MEDIUM_SIZE}),     # axes title and x/y labels
    ('xtick', {'labelsize': MEDIUM_SIZE}),                              # x tick labels
    ('ytick', {'labelsize': MEDIUM_SIZE}),                              # y tick labels
    ('legend', {'fontsize': MEDIUM_SIZE}),                              # legend text
    ('figure', {'titlesize': BIGGER_SIZE}),                             # figure title
)
for rc_group, rc_settings in rc_overrides:
    plt.rc(rc_group, **rc_settings)

# Title text properties; bottom vertical alignment leaves more space.
title_font = dict(
    fontname='Arial',
    size='18',
    color='black',
    weight='normal',
    verticalalignment='bottom',
)

In [None]:
# Time series of weighted toxicity for the race/ethnicity columns.
# The default figure size is read when a figure is created, so it has to be
# set before plotting to apply to this plot.
plt.rcParams["figure.figsize"] = [16,9]
weighted_demo[['white', 'asian', 'black', 'jewish', 'latino', 'other_race_or_ethnicity']].plot(
    title = 'Time Series Toxicity & Race' )
plt.xlabel('Date')
plt.ylabel('Weighted Toxicity')
plt.legend()
plt.savefig('Toxicity_vs_Race.png')

In [None]:
# Time series of weighted toxicity for the religion columns.
# The default figure size is read when a figure is created, so it has to be
# set before plotting to apply to this plot.
plt.rcParams["figure.figsize"] = [16,9]
weighted_demo[['atheist', 'buddhist', 'christian', 'hindu', 'muslim', 'other_religion']].plot(
    title = 'Time Series Toxicity & Religion' )
plt.xlabel('Date')
plt.ylabel('Weighted Toxicity')
plt.legend()
plt.savefig('Toxicity_vs_Religion.png')

In [None]:
# Time series of weighted toxicity for the sexual-orientation columns.
# The default figure size is read when a figure is created, so it has to be
# set before plotting to apply to this plot.
plt.rcParams["figure.figsize"] = [16,9]
weighted_demo[['heterosexual', 'homosexual_gay_or_lesbian', 'bisexual', 'other_sexual_orientation']].plot(
    title = 'Time Series Toxicity & Sexual Orientation' )
plt.xlabel('Date')
plt.ylabel('Weighted Toxicity')
plt.legend()
plt.savefig('Toxicity_vs_SexualOrientation.png')

In [None]:
# Time series of weighted toxicity for the gender columns.
# The default figure size is read when a figure is created, so it has to be
# set before plotting to apply to this plot.
plt.rcParams["figure.figsize"] = [16,9]
weighted_demo[['male', 'female', 'transgender', 'other_gender']].plot(
    title = 'Time Series Toxicity & Gender' )
plt.xlabel('Date')
plt.ylabel('Weighted Toxicity')
plt.legend()
plt.savefig('Toxicity_vs_Gender.png')

In [None]:
# Time series of weighted toxicity for the disability columns.
# The default figure size is read when a figure is created, so it has to be
# set before plotting to apply to this plot.
plt.rcParams["figure.figsize"] = [16,9]
weighted_demo[['physical_disability', 'intellectual_or_learning_disability', 'psychiatric_or_mental_illness', 'other_disability']].plot(
    title = 'Time Series Toxicity & Disability' )
plt.xlabel('Date')
plt.ylabel('Weighted Toxicity')
plt.legend()
plt.savefig('Toxicity_vs_Disability.png')