In [1]:
import pandas as pd
import numpy as np

from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

from random import shuffle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score,StratifiedKFold,train_test_split
from scipy.sparse import hstack
from sklearn.metrics import f1_score,recall_score,precision_score,accuracy_score,auc,roc_curve

import string
import re
from unidecode import unidecode

"""
import tensorflow as tf

from itertools import cycle 

from __future__ import division, print_function
"""
from tqdm import tqdm
from tqdm import trange
import scipy
from copy import deepcopy

In [2]:
np.random.seed(1)

In [3]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [4]:
data_train = pd.read_csv("train.csv")
data_test = pd.read_csv("test.csv")

In [5]:
data_train = data_train.fillna(" ")
data_test =data_test.fillna(" ")

In [6]:
data_train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [7]:
data_test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [8]:
data_train.shape[0]

159571

In [9]:
train_C = data_train.comment_text
test_C =  data_test.comment_text

In [10]:
train_C.shape, test_C.shape

((159571,), (153164,))

In [11]:
train_C.values[0]

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [12]:
np.sum(train_C.duplicated()) #Дубликатов нет

0

In [13]:
np.sum(test_C.duplicated()) #Дубликатов нет

0

In [14]:
np.sum(train_C.isnull())

0

In [15]:
# PREPROCESSING PART
repl = {
    "&lt;3": " good ",
    ":d": " good ",
    ":dd": " good ",
    ":p": " good ",
    "8)": " good ",
    ":-)": " good ",
    ":)": " good ",
    ";)": " good ",
    "(-:": " good ",
    "(:": " good ",
    "yay!": " good ",
    "yay": " good ",
    "yaay": " good ",
    "yaaay": " good ",
    "yaaaay": " good ",
    "yaaaaay": " good ",
    ":/": " bad ",
    ":&gt;": " sad ",
    ":')": " sad ",
    ":-(": " bad ",
    ":(": " bad ",
    ":s": " bad ",
    ":-s": " bad ",
    "&lt;3": " heart ",
    ":d": " smile ",
    ":p": " smile ",
    ":dd": " smile ",
    "8)": " smile ",
    ":-)": " smile ",
    ":)": " smile ",
    ";)": " smile ",
    "(-:": " smile ",
    "(:": " smile ",
    ":/": " worry ",
    ":&gt;": " angry ",
    ":')": " sad ",
    ":-(": " sad ",
    ":(": " sad ",
    ":s": " sad ",
    ":-s": " sad ",
    r"\br\b": "are",
    r"\bu\b": "you",
    r"\bhaha\b": "ha",
    r"\bhahaha\b": "ha",
    r"\bdon't\b": "do not",
    r"\bdoesn't\b": "does not",
    r"\bdidn't\b": "did not",
    r"\bhasn't\b": "has not",
    r"\bhaven't\b": "have not",
    r"\bhadn't\b": "had not",
    r"\bwon't\b": "will not",
    r"\bwouldn't\b": "would not",
    r"\bcan't\b": "can not",
    r"\bcannot\b": "can not",
    r"\bi'm\b": "i am",
    "m": "am",
    "r": "are",
    "u": "you",
    "haha": "ha",
    "hahaha": "ha",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "hasn't": "has not",
    "haven't": "have not",
    "hadn't": "had not",
    "won't": "will not",
    "wouldn't": "would not",
    "can't": "can not",
    "cannot": "can not",
    "i'm": "i am",
    "m": "am",
    "i'll" : "i will",
    "its" : "it is",
    "it's" : "it is",
    "'s" : " is",
    "that's" : "that is",
    "weren't" : "were not",
}

keys = [i for i in repl.keys()]

In [16]:
##Data preprocessing##

# drop urls,punctuations,digits,special symbols in questions
prep_c_train = []
prep_c_test = []
for c_train in train_C.values:

    # "ＷＨＡＴＡ  ＦＵＣＫ  ＭＡＮ" --> "WHATA FUCK MAN"
    c_train = unidecode(c_train)
    #to lowercase
    c_train = c_train.lower()

    #drop urls
    c_train = re.sub(r'http(s)?:\/\/\S*? ', " ", c_train)
    #preprocessing with according to repl
    temp = []
    for word in c_train.split():
        if word in keys:
            temp += [repl[word]]
        else:
            temp += [word]

    c_train = deepcopy(" ".join(temp))
    #drop digits - try dont'change
    c_train = ''.join([i for i in c_train if not i.isdigit()])
    #drop punctuations except apostrophes
    p = re.compile(r"(\b[-']\b)|[\W_]")
        
    prep_c_train += [p.sub(lambda m: (m.group(1) if m.group(1) else " "), c_train)]

for c_test in test_C.values:
    # "ＷＨＡＴＡ  ＦＵＣＫ  ＭＡＮ" --> "WHATA FUCK MAN"
    c_test = unidecode(c_test)
    #to lowercase
    c_test = c_test.lower()

    #drop urls
    c_test = re.sub(r'http(s)?:\/\/\S*? ', " ", c_test) 
    #preprocessing with according to repl
    temp = []
    for word in c_test.split():
        if word in keys:
            temp += [repl[word]]
        else:
            temp += [word]

    c_test = deepcopy(" ".join(temp))
    #drop digits
    c_test = ''.join([i for i in c_test if not i.isdigit()])
    #drop punctuations except apostrophes
    p = re.compile(r"(\b[-']\b)|[\W_]")
        
    prep_c_test += [p.sub(lambda m: (m.group(1) if m.group(1) else " "), c_test)]
           

# Doc2veC

In [15]:
from nltk.stem import PorterStemmer

ps = PorterStemmer() #стемминг ухудшил качество

In [22]:
##DATA PREPARATION for Doc2vec##
prep_c_train_split = []
for i in range(len(prep_c_train)):
    temp = []
    for word in prep_c_train[i].split():
        temp += [word]

    prep_c_train_split += [temp]

prep_c_train_d2v = []
for i in range(len(prep_c_train)):
    prep_c_train_d2v.append(" ".join(prep_c_train_split[i]))

with open("TRAIN_without_lemma_stem_labeled_line_sentence.txt", "w") as f:
    f.writelines(c + "\n" for c in prep_c_train_d2v)
    
prep_c_test_split = []
for i in range(len(prep_c_test)):
    temp = []
    for word in prep_c_test[i].split():
        temp += [word]

    prep_c_test_split += [temp]

prep_c_test_d2v = []
for i in range(len(prep_c_test)):
    prep_c_test_d2v.append(" ".join(prep_c_test_split[i]))

with open("TEST_without_lemma_stem_labeled_line_sentence.txt", "w") as f:
    f.writelines(c + "\n" for c in prep_c_test_d2v)

In [23]:
len(prep_c_train_d2v)

159571

In [24]:
len(prep_c_test_d2v)

153164

In [25]:
data_test.shape, data_train.shape

((153164, 2), (159571, 8))

In [27]:
##DOC2VEC##

#Modernization of LabeledLineSentence
class LabeledLineSentence(object):
    def __init__(self, sources):
        self.sources = sources
        
        flipped = {}
        
        # make sure that keys are unique
        for key, value in sources.items():
            if value not in flipped:
                flipped[value] = [key]
            else:
                raise Exception('Non-unique prefix encountered')
    
    def __iter__(self):
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])
    
    def to_array(self):
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append(LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
        return self.sentences
    
    """The model is better trained if in each training epoch,
    the sequence of sentences fed to the model is randomized.
    This is important: missing out on this steps gives you really shitty results. This is the reason for the sentences_perm method in our LabeledLineSentences class."""
    def sentences_perm(self):
        shuffle(self.sentences)
        return self.sentences
#Also it was added opportunity to work with multiple documents

In [26]:
SIZE = 1000

In [29]:
sources = {'TRAIN_without_lemma_stem_labeled_line_sentence.txt':'TRAIN',\
          'TEST_without_lemma_stem_labeled_line_sentence.txt':'TEST'}
sentences = LabeledLineSentence(sources)

model = Doc2Vec(size=100, dbow_words= 1, dm=0, iter=1,  window=5, \
                seed=1337, min_count=1, workers=4,alpha=0.025, min_alpha=0.025)

model.build_vocab(sentences.to_array())
#model training
for epoch in range(10):
    print("epoch "+str(epoch))
    model.train(sentences.sentences_perm(),total_examples=model.corpus_count,epochs = 1)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay
    


epoch 0
epoch 1
epoch 2
epoch 3
epoch 4
epoch 5
epoch 6
epoch 7
epoch 8
epoch 9


In [None]:
model.save('toxic.d2v')

In [27]:
#Loading of Model
model = Doc2Vec.load('toxic.d2v')

In [28]:
# checking of Model
print(model.docvecs.most_similar(model.infer_vector("what a motherfucking piece".split()).reshape(1,-1),topn=5)) 

[('TRAIN_115566', 0.8503616452217102), ('TRAIN_35921', 0.8322666883468628), ('TEST_46047', 0.8309272527694702), ('TRAIN_8846', 0.829414427280426), ('TEST_20784', 0.8283772468566895)]


In [29]:
data_train.shape[0]

159571

In [30]:
train_arrays = np.zeros((train_C.shape[0], SIZE))
train_labels = np.zeros(train_C.shape[0])
for i in range(train_C.shape[0]):
    prefix_train = 'TRAIN_' + str(i)
    train_arrays[i] = model.docvecs[prefix_train]
    

In [31]:
test_arrays = np.zeros((test_C.shape[0], SIZE))
test_labels = np.zeros(test_C.shape[0])
for i in range(test_C.shape[0]):
    prefix_test = 'TEST_' + str(i)
    test_arrays[i] = model.docvecs[prefix_test]


In [32]:
skf = StratifiedKFold(5,shuffle=True,random_state=777)

In [33]:
losses = []
predictions = {'id': data_test['id']}

# LogisticRegression

In [34]:
import gc
gc.collect()

352

In [35]:
losses = []
predictions = {'id': data_test['id']}

In [36]:
for class_name in class_names:
    train_target = data_train[class_name]
    classifier = LogisticRegression(solver='sag')

    cv_loss = np.mean(cross_val_score(classifier, train_arrays, train_target, cv=skf, scoring='roc_auc'))
    losses.append(cv_loss)
    print('CV score for class {} is {}'.format(class_name, cv_loss))

    classifier.fit(train_arrays, train_target)
    predictions[class_name] = classifier.predict_proba(test_arrays)[:, 1]

print('Total CV score is {}'.format(np.mean(losses)))

CV score for class toxic is 0.9668031199108047
CV score for class severe_toxic is 0.9823977857778521
CV score for class obscene is 0.9810258985339757
CV score for class threat is 0.9803818370427708
CV score for class insult is 0.9762038866419653
CV score for class identity_hate is 0.9717900805427938
Total CV score is 0.9764337680750271


- Total CV score is 0.976895749698627, 0.9711(PB) ---"sag", feature size -- 1000

- Total CV score is 0.9784760833887033, 0.9723(PB) ---"sag", feature size -- 10000

- Total CV score is 0.9779829329839521, "sag", feature size -- 1000, "ＷＨＡＴＡ  ＦＵＣＫ  ＭＡＮ" --> "WHATA FUCK MAN"

- Total CV score is 0.9781456124579204, "sag", feature size -- 1000, "ＷＨＡＴＡ  ＦＵＣＫ  ＭＡＮ" --> "WHATA FUCK MAN", smiles and abbreviations transformation

In [None]:
submission = pd.DataFrame.from_dict(predictions)
submission.to_csv('not_tuned_log_reg_on_d2v_output.csv', index=False)

# XGBoost

In [22]:
from xgboost import XGBClassifier
from sklearn.decomposition import TruncatedSVD



In [31]:
import gc
gc.collect()

723

In [29]:
losses = []
predictions = {'id': data_test['id']}

In [24]:
#%%time
#svd = TruncatedSVD(n_components=200,n_iter=10,random_state=1)
#svd.fit(train_arrays)

TruncatedSVD(algorithm='randomized', n_components=200, n_iter=10,
       random_state=1, tol=0.0)

In [32]:
%%time
for class_name in class_names:
    train_target = data_train[class_name]
    classifier =XGBClassifier()

    cv_loss = np.mean(cross_val_score(classifier, svd.transform(train_arrays), train_target, cv=skf, scoring='roc_auc'))
    losses.append(cv_loss)
    print('CV score for class {} is {}'.format(class_name, cv_loss))

    classifier.fit(svd.transform(train_arrays), train_target)
    predictions[class_name] = classifier.predict_proba(svd.transform(test_arrays))[:, 1]

print('Total CV score is {}'.format(np.mean(losses)))

CV score for class toxic is 0.9557625085446407
CV score for class severe_toxic is 0.9806856990094197
CV score for class obscene is 0.9673842180298122
CV score for class threat is 0.9730923462536871
CV score for class insult is 0.965890424679704
CV score for class identity_hate is 0.9673515955082559
Total CV score is 0.9683611320042532
CPU times: user 2h 45min 16s, sys: 36.1 s, total: 2h 45min 52s
Wall time: 21min 33s


# Bayes NN

In [97]:
# функция, чтобы получить NN
def get_net(mode, ops, Xtr_shape, NUM_CLASSES=2, NEURON_NUMBER=800):
    with tf.variable_scope('net'):
        x = tf.placeholder(tf.float32, [None, Xtr_shape[1]])
        y = tf.placeholder(tf.int32, [None])
        
        ops['x'] = x
        ops['y'] = y
        
        if mode == 'implicit': # это соответствует статье "Implicit Weight Uncertainty in Neural Networks"
            g1 = ops['g1']
            g2 = ops['g2']
            g3 = ops['g3']

            
            # layer 1
            v1 = tf.get_variable('v1', [Xtr_shape[1], NEURON_NUMBER], tf.float32, #NEURON_NUMBER is число нейронов в слое
                                tf.random_normal_initializer(0, 0.05))
            
            x = tf.matmul(x, v1)
            
            b1 = tf.get_variable('b1', [NEURON_NUMBER], tf.float32, tf.constant_initializer()) #смещение
            
            scaler = g1 / tf.sqrt(tf.reduce_sum(tf.square(v1),[0]))
            x = tf.reshape(scaler,[1, NEURON_NUMBER])*x + tf.reshape(b1, [1, NEURON_NUMBER])
            x = tf.nn.relu(x)
            
            # layer 2
            v2 = tf.get_variable('v2', [NEURON_NUMBER, NEURON_NUMBER], tf.float32,
                                tf.random_normal_initializer(0, 0.05))
            
            x = tf.matmul(x, v2)
            
            b2 = tf.get_variable('b2', [NEURON_NUMBER], tf.float32, tf.constant_initializer())
            
            scaler = g2 / tf.sqrt(tf.reduce_sum(tf.square(v2),[0]))
            x = tf.reshape(scaler,[1, NEURON_NUMBER])*x + tf.reshape(b2, [1, NEURON_NUMBER])
            x = tf.nn.relu(x)
            
            # layer 3
            v3 = tf.get_variable('v3', [NEURON_NUMBER, NUM_CLASSES], tf.float32, 
                                tf.random_normal_initializer(0, 0.05))
            
            x = tf.matmul(x, v3)
            
            b3 = tf.get_variable('b3', [NUM_CLASSES], tf.float32, tf.constant_initializer()) 
            
            scaler = g3 / tf.sqrt(tf.reduce_sum(tf.square(v3),[0]))
            x = tf.reshape(scaler,[1, NUM_CLASSES])*x + tf.reshape(b3, [1, NUM_CLASSES])

        ops['logits'] = x
        
        return ops
        


In [98]:
# hypernetwork, чтобы генерить веса (генеративная модель)
def get_h_net(units=[64, 256], num_noise=29,NUM_CLASSES=2,NEURON_NUMBER=800):
    with tf.variable_scope('h_net'):
        # auxiliary conditioning
        w1_c = tf.constant([1., 0., 0.])
        w2_c = tf.constant([0., 1., 0.])
        w3_c = tf.constant([0., 0., 1.])

        #auxiliary noise
        noise = tf.random_normal((num_noise, ))

        w1_z = tf.reshape(tf.concat([w1_c, noise], 0), (1, num_noise + 3))
        w2_z = tf.reshape(tf.concat([w2_c, noise], 0), (1, num_noise + 3))
        w3_z = tf.reshape(tf.concat([w3_c, noise], 0), (1, num_noise + 3))
        
        w_z = tf.concat([w1_z, w2_z, w3_z], 0)
        
        z = w_z
        
        for unit in units:
            z = tf.layers.dense(inputs=z, units=unit)
            z = tf.nn.elu(z)
        
        z = tf.layers.dense(inputs=w_z, units=NEURON_NUMBER)

        w1 = z[0, :]
        w2 = z[1, :]

        w3 = z[2, :NUM_CLASSES]
        
        return [w1, w2, w3, tf.reshape(tf.concat([w1, w2, w3], 0), (2*NEURON_NUMBER+NUM_CLASSES, 1))]

In [99]:
#дискриминативная модель
def get_d_net(gens, units=[20, 20],NUM_CLASSES=2,NEURON_NUMBER=800):
    with tf.variable_scope('d_net'):
        
        ds = tf.contrib.distributions
        mix = 0.7
        bimix_gauss = ds.Mixture(
          cat=ds.Categorical(probs=[mix, 1.-mix]),
          components=[
            ds.Normal(loc=0., scale=0.01),
            ds.Normal(loc=0., scale=5.),
        ])
        
        noise = bimix_gauss.sample((NEURON_NUMBER + NEURON_NUMBER + NUM_CLASSES, 1))
        
        all_t = tf.concat((gens, noise), 0)
        
        d = all_t
        
        for unit in units:
            d = tf.layers.dense(inputs=d, units=unit )
            d = tf.nn.relu(d)
        
        d = tf.layers.dense(inputs=d, units=1)
        
        return d[:NEURON_NUMBER + NEURON_NUMBER + NUM_CLASSES], d[NEURON_NUMBER + NEURON_NUMBER +NUM_CLASSES:]

In [100]:
# адаптированная под специфику данной задачи метод tensorflow -- next_batch
def next_batch(data:"np array", labels:"np array", batch_size:int, shuffle=False):
    
    #если наблюдения независимы, что в нашем случае НЕ ТАК
    if shuffle:
        # Shuffle data
        shuffle_indices = np.random.permutation(np.arange(len(labels)))
        data = data[shuffle_indices]
        labels = labels[shuffle_indices]

    for batch_i in range(0, len(data)//batch_size):
        start_i = batch_i * batch_size
        data_batch = data[start_i:start_i + batch_size]
        labels_batch = labels[start_i:start_i + batch_size]

        yield np.array(data_batch), np.array(labels_batch)

In [101]:
def make_computations(X,y,Xtest,n_epochs=200):
    #Bayes NN
    num_noise = 125

    layers = [64, 256]

    mode = 'implicit'

    batch_size = 128

    tf.reset_default_graph()

    ops = {}

    if mode == 'implicit':
        w1, w2, w3, gens = get_h_net(num_noise=num_noise, units=layers)

        g_d, n_d = get_d_net(gens)

        ops = {'g1': w1, 'g2': w2, 'g3': w3}

        d_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'd_net')
        g_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'h_net')

    # get network ops
    ops = get_net(mode, ops, Xtr_shape=np.array(X.shape).astype(np.int32))

    net_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'net')

    # оптимизируемая функция потерь -- log-loss
    ce = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=ops['logits'], labels=ops['y']))

    # метод оптимизации
    opt = tf.train.AdamOptimizer(0.001,epsilon=1e-5)

    if mode == 'implicit':
        loss_d = (- tf.reduce_mean(tf.log(1 - tf.nn.sigmoid(n_d) + 1e-8, name='log_n_d'))
              - tf.reduce_mean(tf.log(tf.nn.sigmoid(g_d) + 1e-8, name='log_g_d')))

        gvs = opt.compute_gradients(loss_d, var_list=d_vars)
        capped_gvs = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gvs if grad is not None]
        d_optimiser = opt.apply_gradients(capped_gvs)

        g_logits_m = tf.reduce_mean(g_d)

        loss_g = g_logits_m + len(y) / float(batch_size) * ce

        g_optimiser = opt.minimize(loss_g, var_list=g_vars+net_vars)

        d_n_acc = tf.reduce_mean(tf.cast(tf.nn.sigmoid(n_d) < 0.5, tf.float32))
        d_g_acc = tf.reduce_mean(tf.cast(tf.nn.sigmoid(g_d) >= 0.5, tf.float32))


    # Функция, хранящая предсказания 
    pred = tf.argmax(ops['logits'], -1, output_type=tf.int32)

    # Тензор, чтобы вычислить точность предсказаний
    acc = tf.reduce_mean(tf.cast(tf.equal(pred, ops['y']), tf.float32))

    probs = tf.nn.softmax(ops['logits'])

    # для инициализации переменных
    init = tf.global_variables_initializer()

    numerics = tf.add_check_numerics_ops()

    # генератор для циклического прохода по батчам
    generator = cycle(next_batch(data=X,labels=y,batch_size=batch_size,shuffle=True))

    s = tf.Session()

    # Инициализация весов
    s.run(init)

    print("Xtrain shape: {}".format(X.shape))
    print("GPU: {}".format(tf.test.is_gpu_available()))

    # Запуск дискриминатора, чтобы давать лучшие значения градиентов
    if mode == 'implicit':
        for _ in range(300):
            s.run(d_optimiser)

    with trange(n_epochs * 400) as pbar: # проведём ~ n эпох (первый аргумент)
        for i in pbar:
            # получаем батч 
            b = batch_xs, batch_ys = next(generator) 
        
            if mode == 'implicit':
                #Запуск дискриминатора, чтобы давать лучшие значения градиентов
                for _ in range(20):
                    s.run(d_optimiser)

                np_acc, d_loss, g_loss, l_loss, np_d, np_g, _, _ = s.run([acc, loss_d, g_logits_m, ce,
                                                                  d_n_acc, d_g_acc, g_optimiser, numerics],
                                                                 feed_dict={ops['x']: batch_xs,
                                                                            ops['y']: batch_ys})

                pbar.set_postfix(acc=np_acc, d_loss=d_loss, g_loss=g_loss,
                             l_loss=l_loss, d_n_acc=np_d, d_g_acc=np_g)
            elif mode == 'bbb':
                np_acc, l_loss, kl_loss, _ = s.run([acc, ce, ops['kl_loss'], optimiser],
                                      feed_dict={ops['x']: batch_xs, ops['y']: batch_ys})
                pbar.set_postfix(acc=np_acc, ce=l_loss, kl_loss=kl_loss)
            else:
                np_acc, l_loss, _ = s.run([acc, ce, optimiser],
                                      feed_dict={ops['x']: batch_xs, ops['y']: batch_ys})

                pbar.set_postfix(acc=np_acc, ce=l_loss)
                
    NUM_CLASSES = 2
            
    bnn_probs = np.zeros((Xtest.shape[0], NUM_CLASSES))
    mc_steps = 300
    for _ in trange(mc_steps):
        bnn_probs += s.run(probs, feed_dict={ops['x']: Xtest})

    bnn_probs /= mc_steps
    bnn_preds = np.argmax(bnn_probs, -1)
    
    return bnn_probs,bnn_preds

In [102]:
import gc
gc.collect()

0

In [103]:
losses = []
predictions = {'id': data_test['id']}

In [104]:
for class_name in class_names:
    
    train_target = data_train[class_name].values
    
    train_arrays_train,train_arrays_valid,train_target_tr, valid_target = \
                    train_test_split(train_arrays,train_target,test_size=0.25,shuffle=True,random_state=7)
    
    bnn_probs, bnn_preds = make_computations(X=train_arrays_train,y=train_target_tr,Xtest=train_arrays_valid,n_epochs=200)
    
    #площадь под roc-кривой на валидационной выборке
    fpr,tpr,_ = roc_curve(valid_target, bnn_probs[:, 1])
    roc_auc = auc(fpr, tpr)
    losses.append(roc_auc)
    
    print('Hold-Out score for class {} is {}'.format(class_name,roc_auc))
    predictions[class_name] = bnn_preds 

    
print('Total CV score is {}'.format(np.mean(losses)))

Xtrain shape: (119678, 100)
GPU: True


100%|██████████| 80000/80000 [52:07<00:00, 25.58it/s, acc=0.992, d_g_acc=0.979, d_loss=0.664, d_n_acc=0.758, g_loss=1.29, l_loss=0.0111] 
100%|██████████| 300/300 [00:10<00:00, 28.49it/s]


Hold-Out score for class toxic is 0.9543874007257592
Xtrain shape: (119678, 100)
GPU: True


 16%|█▌        | 12760/80000 [08:20<47:27, 23.61it/s, acc=1, d_g_acc=0.96, d_loss=0.484, d_n_acc=0.878, g_loss=2.13, l_loss=0.0102]      


KeyboardInterrupt: 

Идеи для дальнейшего улучшения:
- oversampling or undersampling
- тюнинг гиперпараметров
- стемминг
- фильтры слов
- блендинг
- стэкинг
- см hotness кернелы и discussion
- см канал в ods по этому соревнованию
- увеличить размерность векторного представления документа
- balanced classes,data augmentation
- try to increase num of epochs for doc2vec

# Lasso Classifier

In [129]:
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsClassifier
import gc
gc.collect()

78

In [127]:
losses = []
predictions = {'id': data_test['id']}

In [None]:
for class_name in class_names:
    train_target = data_train[class_name]
    classifier = KNeighborsClassifier()

    cv_loss = np.mean(cross_val_score(classifier, train_arrays, train_target, cv=skf, scoring='roc_auc'))
    losses.append(cv_loss)
    print('CV score for class {} is {}'.format(class_name, cv_loss))

    classifier.fit(train_arrays, train_target)
    predictions[class_name] = classifier.predict_proba(test_arrays)[:, 1]

print('Total CV score is {}'.format(np.mean(losses))) #MEMORY ERROR


# [CNN GLOVE300 3-OOF 4 epochs](https://www.kaggle.com/tunguz/cnn-glove300-3-oof-4-epochs/code)


In [23]:
import gc
gc.collect()

0

In [24]:
# Fork of Sergei Fironov's script CNN GLOVE300 3-OOF 3 epochs

import os
os.environ['OMP_NUM_THREADS'] = '4'

import tensorflow as tf
import numpy as np
import pandas as pd
from keras.models import Model
from keras.layers import Dense, Embedding, Input, Concatenate, Conv1D, Activation, TimeDistributed, Flatten, RepeatVector, Permute,multiply
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout, GRU, GlobalAveragePooling1D, MaxPooling1D, SpatialDropout1D, BatchNormalization
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re 
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

print('loading embeddings vectors')
def get_coefs(word,*arr):
    return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.split(' ')) for o in open('glove.840B.300d.txt'))

min_count = 10 #the minimum required word frequency in the text
max_features = 27403 #it's from previous run with min_count=10
maxlen = 100 #padding length
num_folds = 3 #number of folds
batch_size = 512 
epochs = 4
embed_size = 300 #embeddings dimension

sia = SentimentIntensityAnalyzer()

#train = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
#test = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test.csv")

#list_sentences_train = train["comment_text"].fillna("").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = data_train[list_classes].values
#list_sentences_test = test["comment_text"].fillna("").values

prep_c_train = np.array(prep_c_train)
prep_c_test = np.array(prep_c_test)
#print('mean text len:',prep_c_train.str.count('\S+').mean())
#print('max text len:',prep_c_test.str.count('\S+').max())

#tokenizer = Tokenizer()
#tokenizer.fit_on_texts(list(list_sentences_train)) #  + list(list_sentences_test)
#num_words = sum([1 for _, v in tokenizer.word_counts.items() if v >= min_count])
#print('num_words',num_words)
#max_features = num_words
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(prep_c_train)) # + list(list_sentences_test)

list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
print('padding sequences')
X_train = {}
X_test = {}
X_train['text'] = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen, padding='post', truncating='post')
X_test['text'] = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen, padding='post', truncating='post')

print('numerical variables')
data_train['num_words'] = data_train.comment_text.str.count('\S+')
data_test['num_words'] = data_test.comment_text.str.count('\S+')
data_train['num_comas'] = data_train.comment_text.str.count('\.')
data_test['num_comas'] = data_test.comment_text.str.count('\.')
data_train['num_bangs'] = data_train.comment_text.str.count('\!')
data_test['num_bangs'] = data_test.comment_text.str.count('\!')
data_train['num_quotas'] = data_train.comment_text.str.count('\"')
data_test['num_quotas'] = data_test.comment_text.str.count('\"')
data_train['avg_word'] = data_train.comment_text.str.len() / (1 + data_train.num_words)
data_test['avg_word'] = data_test.comment_text.str.len() / (1 + data_test.num_words)
#print('sentiment')
#train['sentiment'] = train.comment_text.apply(lambda s : sia.polarity_scores(s)['compound'])
#test['sentiment'] = test.comment_text.apply(lambda s : sia.polarity_scores(s)['compound'])
scaler = MinMaxScaler()
X_train['num_vars'] = scaler.fit_transform(data_train[['num_words','num_comas','num_bangs','num_quotas','avg_word']])
X_test['num_vars'] = scaler.transform(data_test[['num_words','num_comas','num_bangs','num_quotas','avg_word']])

all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()

print('create embedding matrix')
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

def get_model_cnn(X_train):
    global embed_size
    inp = Input(shape=(maxlen, ), name="text")
    num_vars = Input(shape=[X_train["num_vars"].shape[1]], name="num_vars")
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.2)(x)
    z = GlobalMaxPool1D()(x)
    x = GlobalMaxPool1D()(Conv1D(embed_size, 4, activation="relu")(x))
    x = Concatenate()([x,z,num_vars])
    x = Dropout(0.3)(x)
    x = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=[inp,num_vars], outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model        

print('start modeling')
scores = []
predict = np.zeros((test.shape[0],6))
oof_predict = np.zeros((train.shape[0],6))

kf = KFold(n_splits=num_folds, shuffle=True, random_state=239)
for train_index, test_index in kf.split(X_train['num_vars']):
    kfold_X_train = {}
    kfold_X_valid = {}
    y_train,y_test = y[train_index], y[test_index]
    for c in ['text','num_vars']:
        kfold_X_train[c] = X_train[c][train_index]
        kfold_X_valid[c] = X_train[c][test_index]

    model = get_model_cnn(X_train)
    model.fit(kfold_X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1)
    predict += model.predict(X_test, batch_size=1000) / num_folds
    oof_predict[test_index] = model.predict(kfold_X_valid, batch_size=1000)
    cv_score = roc_auc_score(y_test, oof_predict[test_index])
    scores.append(cv_score)
    print('score: ',cv_score)

print('Total CV score is {}'.format(np.mean(scores)))    


sample_submission = pd.DataFrame.from_dict({'id': test['id']})
oof = pd.DataFrame.from_dict({'id': train['id']})
for c in list_classes:
    oof[c] = np.zeros(len(train))
    sample_submission[c] = np.zeros(len(test))
    
sample_submission[list_classes] = predict
sample_submission.to_csv('submit_cnn_avg_' + str(num_folds) + '_folds.csv', index=False)

oof[list_classes] = oof_predict
oof.to_csv('cnn_'+str(num_folds)+'_oof.csv', index=False)

#Memory error

loading embeddings vectors


NameError: name 'list_sentences_train' is not defined

In [79]:
np.array(prep_c_train).str.count('\S+').mean()

MemoryError: 

# GRU like [here](https://github.com/PavelOstyakov/toxic)

# Mixing

In [4]:
bojan = pd.read_csv('logistic_regression_with_words_and_char_n_grams.csv')
gru_ostyakov_with_my_preprocessing = pd.read_csv("submit_with_kost_prep.csv") 

In [9]:
myd2v = pd.read_csv('not_tuned_log_reg_on_d2v_output.csv')

In [21]:
#https://www.kaggle.com/jhoward/minimal-lstm-nb-svm-baseline-ensemble
pub2 = pd.read_csv("Minimal_LSTM + NB-SVM.csv")

In [7]:
gru_ostyakov_with_my_preprocessing = gru_ostyakov_with_my_preprocessing[["id","identity_hate","insult","obscene","severe_toxic","threat","toxic"]]

In [44]:
probas=pd.DataFrame(0.85*gru_ostyakov_with_my_preprocessing.values[:,1:]+0.15*myd2v.values[:,1:],columns=["identity_hate","insult","obscene","severe_toxic","threat","toxic"])

In [45]:
probas=pd.DataFrame(0.85*gru_ostyakov_with_my_preprocessing.values[:,1:]+0.15*myd2v.values[:,1:],columns=["identity_hate","insult","obscene","severe_toxic","threat","toxic"])
pd.concat([myd2v.id,probas],1).to_csv("w_mean_myd2v_gru_ostyakov_with_my_preprocessing.csv",index=False)

In [37]:
MAX_of_gru_ostyakov_with_my_preprocessing_and_myd2v = []

In [38]:
for ost, my in zip(gru_ostyakov_with_my_preprocessing.values[:,1:],myd2v.values[:,1:]):
    MAX_of_gru_ostyakov_with_my_preprocessing_and_myd2v.append(np.max(np.vstack((ost,my)),axis=0))

In [40]:
MAX_of_gru_ostyakov_with_my_preprocessing_and_myd2v = np.array(MAX_of_gru_ostyakov_with_my_preprocessing_and_myd2v)

In [42]:
probas=pd.DataFrame(MAX_of_gru_ostyakov_with_my_preprocessing_and_myd2v,columns=["identity_hate","insult","obscene","severe_toxic","threat","toxic"])

In [43]:
pd.concat([myd2v.id,probas],1).to_csv("max_of_myd2v_gru_ostyakov_with_my_preprocessing.csv",index=False)

### [url](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/kernels?sortBy=score-desc&group=everyone&pageSize=20&competitionId=8076)

In [5]:
hight_of_blend = pd.read_csv("hight_of_blend_v2.csv")

In [14]:
probas=pd.DataFrame(0.55*hight_of_blend.values[:,1:]+0.45*gru_ostyakov_with_my_preprocessing.values[:,1:],columns=["identity_hate","insult","obscene","severe_toxic","threat","toxic"])
pd.concat([myd2v.id,probas],1).to_csv("w_mean_hight_of_blend_ostyakov_gru_with_preprocessing.csv",index=False)

In [16]:
 best = pd.read_csv("w_mean_hight_of_blend_myd2v_ostyakov_lstm.csv")

In [17]:
probas=pd.DataFrame(0.5*best.values[:,1:]+0.5*gru_ostyakov_with_my_preprocessing.values[:,1:],columns=["identity_hate","insult","obscene","severe_toxic","threat","toxic"])
pd.concat([myd2v.id,probas],1).to_csv("w_mean_best_gru_with_preprocessing.csv",index=False)