In [1]:
import pandas as pd, numpy as np
from pyfasttext import FastText
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

In [2]:
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')

COMMENT = 'comment_text'
train[COMMENT].fillna("unknown", inplace=True)
test[COMMENT].fillna("unknown", inplace=True)
train[COMMENT] = train[COMMENT].str.lower().str.replace('https?:\/\/[^\s]*','').str.replace("’","'").str.replace("''",' ').str.replace("'"," ' ").str.replace('“','"').str.replace('”','"').str.replace('"',' : ').str.replace('.',' . ').str.replace(',',' , ').str.replace('[',' [ ').str.replace(']',' ] ').str.replace('(',' ( ').str.replace(')',' ) ').str.replace('!',' ! ').str.replace('?',' ? ').str.replace(';',' ').str.replace(':',' ').str.replace('-',' - ').str.replace('=', ' ').str.replace('=', ' ').str.replace('*', ' ').str.replace('|', ' ').str.replace('«', ' ').str.replace('\d', ' ').str.replace('\n', ' ').str.replace('\s\s+',' ').str.strip()
test[COMMENT] = test[COMMENT].str.lower().str.replace('https?:\/\/[^\s]*','').str.replace("’","'").str.replace("''",' ').str.replace("'"," ' ").str.replace('“','"').str.replace('”','"').str.replace('"',' : ').str.replace('.',' . ').str.replace(',',' , ').str.replace('[',' [ ').str.replace(']',' ] ').str.replace('(',' ( ').str.replace(')',' ) ').str.replace('!',' ! ').str.replace('?',' ? ').str.replace(';',' ').str.replace(':',' ').str.replace('-',' - ').str.replace('=', ' ').str.replace('=', ' ').str.replace('*', ' ').str.replace('|', ' ').str.replace('«', ' ').str.replace('\d', ' ').str.replace('\n', ' ').str.replace('\s\s+',' ').str.strip()

In [3]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train['clean'] = 1-train[label_cols].max(axis=1)
train.describe()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean
count,95851.0,95851.0,95851.0,95851.0,95851.0,95851.0,95851.0,95851.0
mean,499435900000.0,0.096368,0.010068,0.053301,0.003182,0.049713,0.008492,0.897862
std,289013600000.0,0.295097,0.099832,0.224635,0.05632,0.217352,0.091762,0.302831
min,22256640.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,247343700000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,500129700000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,750108800000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,999988200000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# train vectors

In [24]:
train_comment = train.copy()
train_comment.drop(label_cols, 1, inplace=True)
train_comment.drop(['clean','label'],1, inplace=True)
all_comment = pd.concat([train_comment,test])
all_comment.to_csv('all_comments.csv', columns=['comment_text'], index = False)

In [25]:
model = FastText()
model.skipgram(input='all_comments.csv', output='comments_skipgram', minn='1', maxn='5', wordNgrams='2')

In [26]:
train_df = None
val_df = None
for label in label_cols:
    train['label'] = ""
    train.loc[(train[label]==1), 'label'] = "__label__" + label + " "
    train.loc[(train[label]==0), 'label'] = "__label__clean" + " "
    train_df, val_df = train_test_split(train, test_size=0.2, random_state=1)
    train_df.to_csv("fasttext_train_"+label+".csv", columns=['label', 'comment_text'], index=False)    
    train.to_csv("fasttext_train_all_"+label+".csv", columns=['label', 'comment_text'], index=False)    

In [27]:
def model_name(target, param):
    name = target
    for key in param.keys():
        name += "_"+key+str(param[key])
    return name

In [30]:
def train_model(param, target, all_sample = False):
    model = FastText()    
    if all_sample == False:
        model.supervised(input='fasttext_train_'+target+'.csv', output='parameter_search', epoch=param["epoch"], minn=param["minn"], maxn=param["maxn"], wordNgrams=param["wordNgrams"], dim=param["dim"])
    else:
        model.supervised(input='fasttext_train_all_'+target+'.csv', output='parameter_search', epoch=param["epoch"], minn=param["minn"], maxn=param["maxn"], wordNgrams=param["wordNgrams"], dim=param["dim"])        
    return model

In [None]:
def calculate_log_loss(model, target):
    predict_probas = model.predict_proba(val_df[COMMENT],k=2) 
    pred = []
    for predict in predict_probas:
        pred_prob = 0
        for label, prob in predict:
            if label == target:         
                pred_prob = prob
        pred.append(pred_prob) 
    return log_loss(val_df[target],pred)

In [None]:
import random
param_grid = {}
param_grid["minn"] = [1]
param_grid["maxn"] = [5]
param_grid["epoch"] = [1,2,3,5]
param_grid["lr"] = [0.01, 0.1]
#targets = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
targets = ['toxic']

log_loss_df_columns = ['target','log_loss']
for key in param_grid.keys():
    log_loss_df_columns.append(key)

log_loss_df = pd.DataFrame(columns=log_loss_df_columns)


for target in targets:
    print("parameter search",target)    
    max_test = 20
    tested_params = []    
    while len(tested_params) < max_test:        
        param = random.choice(list(ParameterGrid(param_grid)))                     
        test_model_name = model_name(target, param)
        if test_model_name in tested_parm:
            continue
        else:
            tested_params.append(test_model_name)
            
        model = train_model(param, target)
        score = calculate_log_loss(model, target)
        
        log_loss_df_row = {}
        log_loss_df_row['target'] = target
        log_loss_df_row['log_loss'] = score
        for key in param.keys():
            log_loss_df_row[key] = param[key]
        log_loss_df = log_loss_df.append(log_loss_df_row, ignore_index=True)        
        log_loss_df.to_csv("parameter_random_search.csv", index=False)    
        print(score, param)   

In [31]:
param_grid = {}
param_grid["minn"] = [1]
param_grid["maxn"] = [5]
param_grid["epoch"] = [8]
param_grid["lr"] = [0.1,0.5]
param_grid["wordNgrams"] =[2,3]
param_grid["dim"] =[100]
#targets = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
targets = ['toxic']

log_loss_df_columns = ['target','log_loss']
for key in param_grid.keys():
    log_loss_df_columns.append(key)

log_loss_df = pd.DataFrame(columns=log_loss_df_columns)

for target in targets:
    print("parameter search",target)    
    #TODO: change to randomize the list
    for param in list(ParameterGrid(param_grid)):                     
        model = train_model(param, target)
        score = calculate_log_loss(model, target)
        
        log_loss_df_row = {}
        log_loss_df_row['target'] = target
        log_loss_df_row['log_loss'] = score
        for key in param.keys():
            log_loss_df_row[key] = param[key]
        log_loss_df = log_loss_df.append(log_loss_df_row, ignore_index=True)        
        log_loss_df.to_csv("parameter_grid_search.csv", index=False)    
        print(score, param)    

parameter search toxic
0.142080725536 {'maxn': 5, 'wordNgrams': 2, 'epoch': 8, 'minn': 1, 'dim': 100, 'lr': 0.1}
0.144742003529 {'maxn': 5, 'wordNgrams': 3, 'epoch': 8, 'minn': 1, 'dim': 100, 'lr': 0.1}
0.146431145526 {'maxn': 5, 'wordNgrams': 2, 'epoch': 8, 'minn': 1, 'dim': 100, 'lr': 0.5}
0.144049925721 {'maxn': 5, 'wordNgrams': 3, 'epoch': 8, 'minn': 1, 'dim': 100, 'lr': 0.5}


In [228]:
log_loss_df.groupby('target')['log_loss'].min().sum()/6

0.1012490261837956

In [233]:
log_loss_df.groupby('target')['log_loss'].min()

target
identity_hate    0.040433
insult           0.142292
obscene          0.134799
severe_toxic     0.041574
threat           0.018249
toxic            0.230147
Name: log_loss, dtype: float64

In [187]:
# label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
log_loss_df[log_loss_df['target']=='identity_hate'].sort_values('log_loss',ascending=True)

Unnamed: 0,target,log_loss,lr,epoch,maxn,minn
256,identity_hate,0.040433,0.1,5,3,2
261,identity_hate,0.040692,0.1,5,5,1
241,identity_hate,0.041091,0.1,3,3,2
258,identity_hate,0.041208,0.1,5,4,1
229,identity_hate,0.041644,0.1,2,4,2
264,identity_hate,0.042095,0.1,5,6,1
246,identity_hate,0.042737,0.1,3,5,1
267,identity_hate,0.042974,0.1,5,7,1
243,identity_hate,0.042981,0.1,3,4,1
249,identity_hate,0.04332,0.1,3,6,1


# Predict

In [26]:
comment_ids = []
comments = []
for index, row in test.iterrows():
    comments.append(row['comment_text'])
    comment_ids.append(row['id'])

predict_probas = {}
param = {}
param['toxic'] = {'maxn': 5, 'wordNgrams': 3, 'lr': 0.1, 'minn': 1, 'epoch': 8}
param['severe_toxic'] = {'epoch':3, 'maxn':5, 'minn':1, 'wordNgrams': 3}
param['obscene'] = {'epoch':2, 'maxn':5, 'minn':1, 'wordNgrams': 3}
param['threat'] = {'epoch':2, 'maxn':4, 'minn':1, 'wordNgrams': 3}
param['insult'] = {'epoch':2, 'maxn':4, 'minn':1, 'wordNgrams': 3}
param['identity_hate'] = {'epoch':5, 'maxn':3, 'minn':2, 'wordNgrams': 3}

for target in label_cols:
    model = train_model(param[target], target, all_sample = True)
    print(model.labels)
    predict_probas[target] = model.predict_proba(comments,k=2)   
    del model

['clean', 'toxic']
['clean', 'severe_toxic']
['clean', 'obscene']
['clean', 'threat']
['clean', 'insult']
['clean', 'identity_hate']


In [27]:
import csv
with open('submit.csv', "w") as csv_file:
    writer = csv.writer(csv_file, delimiter=',')
    writer.writerow(['id','toxic','severe_toxic','obscene','threat','insult','identity_hate'])
    csv_rows = []
    for index, comment_id in enumerate(comment_ids):
        #print('processing',index)
        prob_dict = {}        
        #print(comments[index])
        for label in label_cols:       
            #print(predict_probas[label][index])            
            for label_predict, prob in predict_probas[label][index]:
                if label_predict != 'clean':    
                    prob_dict[label] = prob
        csv_row=[]
        csv_row.append(comment_id)
        
        for label in label_cols:
            if label in prob_dict:
                csv_row.append(prob_dict[label])
            else:
                csv_row.append(0)
                #print('no prediction:',index,'comment:',comments[index])
                #csv_row.append(prob_dict[label])
        csv_rows.append(csv_row)    
    writer.writerows(csv_rows)