In [1]:
import pandas as pd

# Text processing and visualization
import texthero as hero
from texthero import preprocessing
# Linear Discrimanation Analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [2]:
# Global vars
data_dir = "../../data/"
train_csv = data_dir + "train.csv"
test_csv = data_dir + "test.csv"
test_labels_csv = data_dir + "test_labels.csv"

In [42]:
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)
test_labels = pd.read_csv(test_labels_csv)
print(train.shape)
print(test.shape)
print(test_labels.shape)
# Sample of data
# train = train.head(2000)

(159571, 8)
(153164, 2)
(153164, 7)


In [43]:
INDEX=700
train.loc[[500, 502, 503, 509, 515, 610]]

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
500,0150b1ffd804dc7e,Protecting 'Causes of the 1948 Palestinian exo...,0,0,0,0,0,0
502,01513d378cb84f20,"""\n My thanks to you for this. I note that I a...",0,0,0,0,0,0
503,0152f55962c4f707,I don't care what you say here. I don't believ...,1,0,0,0,0,0
509,0154f8bd2a16193b,"""\nTo quote \n\n- If a Wikipedia article links...",0,0,0,0,0,0
515,01588a3bc3e2755d,"""\n\n =^_^= \n\nI just watched the entire seri...",0,0,0,0,0,0
610,01a04eaa73dc98ab,DISLIKE RACISM AND I CAN SEE THAT YOU HAS PROB...,1,0,0,0,0,0


In [44]:
# Cleaning the data
custom_pipeline = [preprocessing.fillna,
                   preprocessing.lowercase,
                   preprocessing.remove_digits,
                   preprocessing.remove_punctuation,
                   preprocessing.remove_diacritics,
                   preprocessing.remove_brackets,
                   preprocessing.remove_urls,
                   preprocessing.remove_stopwords,
                   preprocessing.remove_whitespace]

train['text'] = hero.clean(train['comment_text'], custom_pipeline)

In [46]:
# Vectorize it
train['tfidf'] = (
  hero.tfidf(train['text'], max_features=100)
)
train.shape

(159571, 10)

In [47]:
# Reduce dimensions and visualize vector space (PCA)
train['pca'] = hero.pca(train['tfidf'])
hero.scatterplot(
  train,
  col='pca',
  color='toxic',
  title='PCA'
)

In [16]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,text,tfidf,pca,lda,named_entities
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation edits made username hardcore metal...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.31032541007850123, 0.05430278705311392]",<class 'sklearn.discriminant_analysis.LinearDi...,"[(Hardcore Metallica Fan, PERSON, 49, 71), (Ne..."
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,aww matches background colour seemingly stuck ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.38548210660993315, -0.06913841356569021]",<class 'sklearn.discriminant_analysis.LinearDi...,"[(21:51,, TIME, 83, 89), (January 11, 2016, DA..."
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man really trying edit war guy constantly ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.3247262686035098, 0.04570241875951811]",<class 'sklearn.discriminant_analysis.LinearDi...,[]
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,make real suggestions improvement wondered sec...,"[0.0, 0.0, 0.0, 0.0, 0.18437963807328078, 0.25...","[-0.15655527994689156, 0.15687418123579683]",<class 'sklearn.discriminant_analysis.LinearDi...,"[(first, ORDINAL, 309, 314)]"
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,sir hero chance remember page,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.23294633491692102, 0.06691923004995692]",<class 'sklearn.discriminant_analysis.LinearDi...,[]


In [17]:
train['named_entities'] = (
    hero.named_entities(train['text'])
)

In [18]:
train[['named_entities', 'toxic']].head(10)

Unnamed: 0,named_entities,toxic
0,"[(new york, GPE, 93, 101)]",0
1,"[(january, DATE, 58, 65)]",0
2,[],0
3,"[(first, ORDINAL, 176, 181)]",0
4,[],0
5,[],0
6,[],1
7,"[(matt shirvington, PERSON, 10, 26)]",0
8,[],0
9,[],0


In [20]:
NUM_TOP_WORDS = 20
hero.top_words(train['text'])[:NUM_TOP_WORDS]

article      739
wikipedia    621
page         564
talk         471
one          412
would        363
please       357
like         341
see          285
also         265
edit         243
think        241
people       230
know         226
use          217
articles     211
may          205
time         205
user         183
thanks       182
Name: text, dtype: int64

In [21]:
# Visulalize the most used words for each topic
NUM_TOP_WORDS = 10
print(train.groupby('toxic')['text'].apply(lambda x: hero.top_words(x)[:NUM_TOP_WORDS]))
print(train.groupby('severe_toxic')['text'].apply(lambda x: hero.top_words(x)[:NUM_TOP_WORDS]))
print(train.groupby('obscene')['text'].apply(lambda x: hero.top_words(x)[:NUM_TOP_WORDS]))
print(train.groupby('threat')['text'].apply(lambda x: hero.top_words(x)[:NUM_TOP_WORDS]))
print(train.groupby('insult')['text'].apply(lambda x: hero.top_words(x)[:NUM_TOP_WORDS]))
print(train.groupby('identity_hate')['text'].apply(lambda x: hero.top_words(x)[:NUM_TOP_WORDS]))

toxic           
0      article      723
       wikipedia    585
       page         537
       talk         458
       one          388
       please       351
       would        351
       like         306
       see          269
       also         256
       edit         227
       think        224
       use          214
       articles     210
       know         207
       people       207
       may          202
       time         188
       thanks       181
       user         176
1      fuck          41
       wikipedia     36
       like          35
       fucking       32
       get           31
       go            27
       page          27
       one           24
       people        23
       shit          19
       know          19
       gay           18
       think         17
       time          17
       u             17
       ass           16
       stop          16
       article       16
       see           16
       edit          16
Name: text, dtype: int6