In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import datetime
import pandas_profiling
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the comment table and the annotation table. Both are tab-separated
# files read from the current working directory.
unan_comments = pd.read_csv('toxicity_annotated_comments_unanimous.tsv', sep='\t')
unan_scores = pd.read_csv('toxicity_annotations_unanimous.tsv', sep='\t')

# Replace the literal newline/tab placeholder tokens with a single space.
# The vectorized `.str` accessor performs a plain (non-regex) substitution and
# passes missing values through, whereas a per-row `x.replace(...)` lambda
# raises AttributeError as soon as it meets a NaN cell.
unan_comments['comment'] = (
    unan_comments['comment']
    .str.replace("NEWLINE_TOKEN", " ", regex=False)
    .str.replace("TAB_TOKEN", " ", regex=False)
)

In [None]:
# Join each annotation row to its comment text through the shared `rev_id`,
# then bucket the merged rows by their toxicity label.
comment_score = unan_comments.merge(unan_scores, on='rev_id')
split = comment_score.groupby('toxicity')

# Inside each label bucket, group again by comment text so that every
# distinct comment string appears exactly once as a group key.
toxic_group = split.get_group(1).groupby('comment')
nontoxic_group = split.get_group(0).groupby('comment')

# Only the group keys (the comment strings) are needed, not the group frames.
toxics = [comment for comment, _ in toxic_group]
nontoxics = [comment for comment, _ in nontoxic_group]

# NOTE(review): unlike `toxics` / `nontoxics`, this is the raw merged column,
# so a comment is repeated once per merged row that carries it. Confirm that
# the "all" corpus is meant to include those repeats.
all_comments = comment_score['comment']

In [None]:
# One TF-IDF model per corpus. `fit` tokenizes the documents, builds the
# vocabulary and IDF weights, and returns the vectorizer itself, so
# construction and fitting are chained into a single expression.
toxic_vectorizer = TfidfVectorizer().fit(toxics)
nontoxic_vectorizer = TfidfVectorizer().fit(nontoxics)
all_vectorizer = TfidfVectorizer().fit(all_comments)

In [None]:
# Show the fitted vocabulary mapping and the IDF weights of the all-comments
# model, one per line, with a dashed rule between them.
print(
    all_vectorizer.vocabulary_,
    "-------------------------",
    all_vectorizer.idf_,
    sep="\n",
)

# Keep a handle on the toxic model's vocabulary mapping.
toxic_vocab_words = toxic_vectorizer.vocabulary_

In [122]:
def _vocab_tables(vectorizer):
    """Build the lookup tables for one fitted vectorizer.

    Returns a 4-tuple:
      * a Series built from ``vocabulary_`` (term as index, feature index
        as value),
      * the inverse Series (feature index as index, term as value),
      * the ``idf_`` weights as a Series on a default integer index,
      * a two-column frame ('Words', 'Scores') that lines the last two up
        on the feature index.
    """
    term_to_idx = pd.Series(vectorizer.vocabulary_)
    idx_to_term = pd.Series(term_to_idx.index.values, index=term_to_idx)
    idf = pd.Series(vectorizer.idf_)
    table = pd.concat([idx_to_term, idf], keys=['Words', 'Scores'], axis=1)
    return term_to_idx, idx_to_term, idf, table


# Note: the `*_tfidf` names hold the vectorizer's `idf_` weights only.
(toxic_vocab, toxic_vocab1,
 toxic_vocab_tfidf, toxic_vocab_score) = _vocab_tables(toxic_vectorizer)

(nontoxic_vocab, nontoxic_vocab1,
 nontoxic_vocab_tfidf, nontoxic_vocab_score) = _vocab_tables(nontoxic_vectorizer)

(all_vocab, all_vocab1,
 all_vocab_tfidf, all_vocab_score) = _vocab_tables(all_vectorizer)

# Pair the tables up on the terms they share. Both sides carry a 'Scores'
# column, so the merged frames expose it as 'Scores_x' (left table) and
# 'Scores_y' (right table).
non_toxic = nontoxic_vocab_score.merge(toxic_vocab_score, on="Words")
toxic_all = toxic_vocab_score.merge(all_vocab_score, on="Words")
non_all = nontoxic_vocab_score.merge(all_vocab_score, on="Words")

In [123]:
# Per-term score gaps between corpora, computed column-wise instead of walking
# every row with iterrows() (which builds a Series per row and is far slower
# for the same result).
#
# In each merged frame 'Scores_x' comes from the left table and 'Scores_y'
# from the right one:
#   non_toxic: x = non-toxic, y = toxic
#   toxic_all: x = toxic,     y = all
#   non_all:   x = non-toxic, y = all
score_diff_non_toxic = (non_toxic['Scores_x'] - non_toxic['Scores_y']).tolist()
score_diff_toxic_all = (toxic_all['Scores_y'] - toxic_all['Scores_x']).tolist()
score_diff_non_all = (non_all['Scores_y'] - non_all['Scores_x']).tolist()

# NOTE(review): the first column is (non-toxic - toxic), whereas the other two
# are (all - subset). Its label reads "Toxic and Non-Toxic"; confirm the sign
# and label are the intended ones.
non_toxic['Score Diff, Toxic and Non-Toxic'] = score_diff_non_toxic
toxic_all['Score Diff, All and Toxic'] = score_diff_toxic_all
non_all['Score Diff, All and Non'] = score_diff_non_all

In [125]:
# Boolean mask over `toxic_all`: rows whose all-minus-toxic score gap is
# strictly above the cutoff.
score_diff_cutoff = 3.6
toxic_words = toxic_all['Score Diff, All and Toxic'].gt(score_diff_cutoff)

# Lift the row-display limit for this cell only, so every matching row shows.
with pd.option_context('display.max_rows', None):
    display(toxic_all.loc[toxic_words])

Unnamed: 0,Words,Scores_x,Scores_y,"Score Diff, All and Toxic"
0,087209,5.454347,9.06957,3.615223
5,1991,5.454347,9.06957,3.615223
6,1a,5.454347,9.06957,3.615223
14,375,5.454347,9.06957,3.615223
17,4000,5.454347,9.06957,3.615223
21,8_prime_spirals,5.454347,9.06957,3.615223
27,absorbed,5.454347,9.06957,3.615223
29,abusive,5.454347,9.06957,3.615223
30,abysmal,5.454347,9.06957,3.615223
36,accursed,5.454347,9.06957,3.615223
