In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
%load_ext autoreload
%autoreload 2

In [174]:
import classifier

In [173]:
import nltk

array(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'], dtype=object)

In [4]:
from pyfasttext import FastText

# preprocess data

In [5]:
# Fraction of the training rows to keep (1 = use the whole training CSV);
# it is applied right after the training CSV is read.
subset_train = 1
# NOTE(review): not referenced anywhere in the visible part of this notebook —
# presumably a vocabulary cap for one of the imported vectorizers; confirm or remove.
max_features = 50000

In [6]:
# Directory prefix for the input CSVs and for the fastText training files.
# Paths are built by plain string concatenation, so the trailing '/' is required.
datapath = '../data/'

In [7]:
# Training data: read it, then keep only the leading `subset_train` fraction
# of its rows.
train = pd.read_csv(datapath + 'train.csv')
n_train_rows = int(len(train) * subset_train)
train = train.iloc[:n_train_rows, :]

# Test data: missing values are replaced by the placeholder string 'unknown'.
test = pd.read_csv(datapath + 'test.csv').fillna('unknown')

# Submission template.
sub = pd.read_csv(datapath + 'sample_submission.csv')

# Every training column from the third one onward is treated as a label column.
comment_types = train.columns.values[2:]

In [157]:
# Inputs are the raw comment strings, targets are the label columns.
X = train['comment_text'].values
y = train.loc[:, comment_types].values

# Hold out 20% for validation. The split is stratified on the row sum of `y`
# (the number of positive labels per comment when the labels are 0/1).
labels_per_comment = y.sum(axis=1)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=labels_per_comment,
    random_state=42,
)

In [158]:
# Build a (n_train, n_labels) matrix of label *names* for fastText:
# column k holds '<type>' where y_train[:, k] is truthy and 'not<type>' elsewhere.
n_rows, n_labels = len(X_train), len(comment_types)
result = np.zeros((n_rows, n_labels)).astype(str)  # fixed-width string array
for col in range(n_labels):
    type_name = comment_types[col]
    is_flagged = y_train[:, col]
    result[:, col] = np.where(is_flagged, type_name, 'not' + type_name)

In [159]:
# One fastText training line per comment: the cleaned text (surrounding
# whitespace stripped, newlines and double quotes deleted) followed by one
# '__label__'-prefixed tag per comment type.
lines = []
for text, row in zip(X_train, result):
    cleaned = text.strip().replace("\n", '').replace('"', '')
    tags = '__label__' + ' __label__'.join(row)
    lines.append(cleaned + ' ' + tags + '\n')

In [11]:
# Write the combined fastText training file; every entry of `lines` already
# ends with '\n', so the entries are written back to back.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default encoding (which can fail on non-ASCII comments) and
# matches the UTF-8 input fastText expects.
with open(datapath + 'fasttext2.csv', 'w', encoding='utf-8') as file:
    file.writelines(lines)
    

In [12]:
# Fresh pyfasttext model object; it is trained by the supervised() call in the next cell.
model = FastText()

In [20]:
# Train a single supervised model on the combined file, in which every line
# carries one tag per comment type.
# NOTE(review): `output` receives the data directory prefix itself rather than
# a model file name — confirm which files pyfasttext derives from it.
combined_train_file = datapath + 'fasttext2.csv'
model.supervised(input=combined_train_file, output=datapath, lr=0.5, epoch=100)

In [21]:
# Clean the validation comments the same way as the training text (strip,
# delete newlines and double quotes) and terminate each with '\n'.
val_lines = []
for comment in X_val:
    val_lines.append(comment.strip().replace("\n", '').replace('"', '') + '\n')

# k=None is passed instead of a fixed top-k.
# NOTE(review): presumably this returns a probability for every label — confirm
# against the pyfasttext documentation.
probas = model.predict_proba(val_lines, k=None)

In [22]:
# For every validation line keep only the (label, probability) pairs whose
# label does not begin with 'not', i.e. the positive labels.
probas_positive = []
for sentence in probas:
    positives = [(label, proba) for label, proba in sentence
                 if not label.startswith('not')]
    probas_positive.append(positives)

In [23]:
# Collect the predicted positive-label probabilities per comment type.
# One empty list is created per entry of `comment_types` (instead of a
# hard-coded run of six lists), so this keeps working if the number of label
# columns changes.
result_dict = {comment_type: [] for comment_type in comment_types}
for tuple_list in probas_positive:
    if tuple_list:
        for comment_type, proba in tuple_list:
            result_dict[comment_type].append(proba)
    else:
        # No positive label came back for this line: fall back to 0.5 for
        # every comment type.
        for comment_type in comment_types:
            result_dict[comment_type].append(.5)
# NOTE(review): a line that returns only *some* positive labels would leave the
# per-type lists with different lengths; this relies on predict_proba returning
# either all positive labels or none.

In [24]:
# One column per comment type (the dict keys), one row per collected prediction.
result_df = pd.DataFrame(result_dict)

In [25]:
# Put the columns in `comment_types` order, the same column order used to build y / y_val.
result_df = result_df.loc[:, comment_types]

In [26]:
# Score the validation predictions with the project-local `classifier.lloss` helper.
# NOTE(review): presumably a (column-averaged) log loss — its implementation is not shown here.
classifier.lloss(result_df.values, y_val)

0.19765947560497513

## One by one: a separate binary fastText model per comment type

In [36]:
# result[:, i] contains the label names for comment type i, e.g. 'toxic' / 'nottoxic'

array([['nottoxic', 'notsevere_toxic', 'notobscene', 'notthreat',
        'notinsult', 'notidentity_hate'],
       ['nottoxic', 'notsevere_toxic', 'notobscene', 'notthreat',
        'notinsult', 'notidentity_hate'],
       ['nottoxic', 'notsevere_toxic', 'notobscene', 'notthreat',
        'notinsult', 'notidentity_hate'],
       ..., 
       ['nottoxic', 'notsevere_toxic', 'notobscene', 'notthreat',
        'notinsult', 'notidentity_hate'],
       ['nottoxic', 'notsevere_toxic', 'notobscene', 'notthreat',
        'notinsult', 'notidentity_hate'],
       ['nottoxic', 'notsevere_toxic', 'notobscene', 'notthreat',
        'notinsult', 'notidentity_hate']],
      dtype='<U32')

In [160]:
from nltk.tokenize import TweetTokenizer
# Tokenizer settings, as named by the TweetTokenizer keyword arguments:
# do not preserve case, reduce lengthened tokens, strip handles.
tokenizer_options = dict(
    preserve_case=False,
    reduce_len=True,
    strip_handles=True,
)
tknzer = TweetTokenizer(**tokenizer_options)


In [167]:
import string

# Built once instead of on every call: maps each character of
# string.punctuation to None so that str.translate deletes it.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def simplify(text, tokenizer):
    """Tokenize ``text`` and return it as one punctuation-free line.

    Args:
        text: the raw string to clean.
        tokenizer: any object with a ``tokenize(text)`` method returning an
            iterable of string tokens.

    Returns:
        The tokens joined by single spaces, with newline characters and every
        character of ``string.punctuation`` (which includes the double quote)
        deleted. Runs of whitespace left behind by removed punctuation are
        collapsed to one space, and the result has no leading or trailing
        whitespace. An input consisting only of punctuation yields ``''``.
    """
    joined = ' '.join(tokenizer.tokenize(text))
    # Newlines are deleted (not turned into spaces), as before.
    without_punctuation = joined.replace("\n", '').translate(_PUNCTUATION_TABLE)
    # Normalize whitespace *after* removing punctuation; stripping beforehand
    # left stray spaces wherever a punctuation-only token used to be.
    return ' '.join(without_punctuation.split())

In [168]:
# Clean every training comment once: the cleaned text does not depend on the
# comment type, so it is computed outside the per-type loop instead of being
# re-tokenized for every label column.
simplified_train = [simplify(text, tknzer) for text in X_train]

# lines[i] is the list of fastText training lines for comment_types[i]:
# cleaned text followed by that type's positive or negative tag.
lines = []
for i, comment_type in enumerate(comment_types):
    line_thiscomment = [text + ' ' + '__label__' + label + ' \n'
                        for text, label in zip(simplified_train, result[:, i])]
    lines.append(line_thiscomment)
 
    

In [169]:
# Write one fastText training file per comment type ('<type>.txt' under
# `datapath`); every entry of lines[i] already ends with '\n'.
# The encoding is pinned to UTF-8 so the files do not depend on the platform's
# default encoding (which can fail on non-ASCII comments) and match the UTF-8
# input fastText expects.
for i, comment_type in enumerate(comment_types):
    with open(datapath + comment_type + '.txt', 'w', encoding='utf-8') as file:
        file.writelines(lines[i])
    

In [170]:
# results[:, i] will hold the predicted probability of the positive label of
# comment_types[i] for every validation comment (np.zeros is already float).
results = np.zeros((len(X_val), len(comment_types)))

# The cleaned validation lines are identical for every comment type, so they
# are built once here rather than on every loop iteration.
val_lines = [simplify(row, tknzer) + '\n' for row in X_val]

for i, comment_type in enumerate(comment_types):
    print('computing {}'.format(comment_type))
    # Train a separate model on this comment type's own training file.
    # NOTE(review): every model is given the same `output` prefix (the data
    # directory) — confirm whether pyfasttext writes model files there and
    # whether overwriting them on each iteration is intended.
    model = FastText()
    model.supervised(input=datapath + comment_type + '.txt',
                     output=datapath,
                     lr=0.1,
                     epoch=8,
                     minn=3,
                     maxn=6,
                     minCount=3,
                     wordNgrams=2)
    probas = model.predict_proba(val_lines, k=None)
    # Each prediction is a list of (label, probability) pairs; pick the
    # probability attached to the positive label.
    positive_probas = [dict(sentence)[comment_type] for sentence in probas]
    results[:, i] = positive_probas

computing toxic
computing severe_toxic
computing obscene
computing threat
computing insult
computing identity_hate


In [171]:
# Wrap the prediction matrix in a frame whose columns are named after the comment types.
result_df = pd.DataFrame(results, columns=comment_types)

In [172]:
# Score the per-type model predictions with the same project-local helper used for the combined model.
# NOTE(review): presumably a (column-averaged) log loss — its implementation is not shown here.
classifier.lloss(result_df.values, y_val)

0.057749417211966407

In [67]:
# Inspect the raw prediction for the first validation line: a list of (label, probability) pairs.
# `probas` is whatever the most recently executed prediction cell left behind.
probas[0]

[('notidentity_hate', 0.9492187483403163),
 ('identity_hate', 0.04882813874112768)]

In [None]:
# Experiment notes (validation log loss):
# - don't preserve case (preserve_case=False in the tokenizer): 0.05858
# - epochs set to 8: 0.0577