In [39]:
import pandas as pd, numpy as np

In [40]:
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
subm = pd.read_csv('input/sample_submission.csv')

COMMENT = 'comment_text'
train[COMMENT].fillna("unknown", inplace=True)
test[COMMENT].fillna("unknown", inplace=True)

In [42]:
train[COMMENT] = train[COMMENT].str.lower()
test[COMMENT] = test[COMMENT].str.lower()

In [43]:
train[COMMENT] = train[COMMENT].str.replace('https?:\/\/[^\s]*','')
test[COMMENT] = test[COMMENT].str.replace('https?:\/\/[^\s]*','')

In [25]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"nonsense? kiss off, geek. what i said is true...",1,0,0,0,0,0
1,27450690,"""\n\n please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"points of interest"""" \n\ni removed the...",0,0,0,0,0,0
3,77493077,asking some his nationality is a racial offenc...,0,0,0,0,0,0
4,79357270,the reader here is not going by my say so for ...,0,0,0,0,0,0


Prepare FastText training data

In [26]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train['clean'] = 1-train[label_cols].max(axis=1)
train.describe()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean
count,95851.0,95851.0,95851.0,95851.0,95851.0,95851.0,95851.0,95851.0
mean,499435900000.0,0.096368,0.010068,0.053301,0.003182,0.049713,0.008492,0.897862
std,289013600000.0,0.295097,0.099832,0.224635,0.05632,0.217352,0.091762,0.302831
min,22256640.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,247343700000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,500129700000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,750108800000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,999988200000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [27]:
labels = train.columns[2:9]
train['label'] = ""
for label in labels:    
    train.loc[(train[label]==1), 'label'] += "__label__" + label + " "

In [28]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean,label
0,22256635,"nonsense? kiss off, geek. what i said is true...",1,0,0,0,0,0,0,__label__toxic
1,27450690,"""\n\n please do not vandalize pages, as you di...",0,0,0,0,0,0,1,__label__clean
2,54037174,"""\n\n """"points of interest"""" \n\ni removed the...",0,0,0,0,0,0,1,__label__clean
3,77493077,asking some his nationality is a racial offenc...,0,0,0,0,0,0,1,__label__clean
4,79357270,the reader here is not going by my say so for ...,0,0,0,0,0,0,1,__label__clean


In [29]:
train.to_csv("fasttext_train.csv", columns=['label', 'comment_text'], index=False)

In [30]:
from pyfasttext import FastText

In [31]:
# https://github.com/facebookresearch/fastText/blob/master/docs/supervised-tutorial.md
model = FastText()
model.supervised(input='fasttext_train.csv', output='fasttext_model', epoch=500, lr=0.7)

In [32]:
model.labels

['clean',
 'toxic',
 'obscene',
 'insult',
 'severe_toxic',
 'identity_hate',
 'threat']

In [33]:
model.predict_proba(['first sentence\nsecond sentence\n'], k=7)

[[('clean', 1.0),
  ('insult', 1.9531265169053625e-08),
  ('severe_toxic', 1.9531265169053625e-08),
  ('threat', 1.9531265169053625e-08),
  ('identity_hate', 1.9531265169053625e-08),
  ('obscene', 1.9531265169053625e-08),
  ('toxic', 1.9531265169053625e-08)]]

predict

In [61]:
pred = test.copy()
del pred['comment_text']
for label in label_cols:
    pred[label] = 0
    
comments = []
indexes = []
for index, row in test[1:100].iterrows():
    print(index)
    comments.append(row['comment_text'])
    indexes.append(index)
    for label, prob in model.predict_proba([row['comment_text']], k=7)[0]:        
        if label != 'clean':
            pred.loc[index, label]=prob            

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [55]:
pred.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,6044863,1.953127e-08,1.953127e-08,1.953127e-08,1.953127e-08,1.953127e-08,1.953127e-08
1,6102620,1.953127e-08,1.953127e-08,1.953127e-08,1.953127e-08,1.953127e-08,1.953127e-08
2,14563293,1.953127e-08,1.953127e-08,1.953127e-08,1.953127e-08,1.953127e-08,1.953127e-08
3,21086297,1.953127e-08,1.953127e-08,1.953127e-08,1.953127e-08,1.953127e-08,1.953127e-08
4,22982444,1.953127e-08,0.007812519,0.01562502,1.953127e-08,0.01562502,0.01171877


In [53]:
pred.to_csv("submission.csv", columns=['id','toxic','severe_toxic','obscene','threat','insult','identity_hate'], index=False)