In [129]:
import re
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K
from nltk import word_tokenize, sent_tokenize
from collections import defaultdict, Counter
from string import punctuation
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.cluster import MeanShift, DBSCAN
from sklearn.decomposition import PCA, TruncatedSVD

### Описательный анализ

Для этой части выбрала пункт за 1.75 балла (Описательный анализ тестовой выборки в сравнение с обучающей - какие языки представлены, как они по статистикам отличаются от трейна).

In [2]:
toxic_full = pd.read_csv('jigsaw-toxic-comment-train.csv')

In [3]:
toxic = toxic_full.sample(frac=.3)

In [13]:
toxic.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
202401,aac04d8feb3e4b39,""" \n :Left more detail under the review at Tal...",0,0,0,0,0,0
84162,e12a43cefe51d6fe,"""\n\nSpeedy deletion of """"Shrini""""\n A page yo...",0,0,0,0,0,0
62253,a6956e341824c2b4,")\na cowards site, that must stop changing thi...",1,0,1,0,0,0
113431,5e98a73ecce25fcd,Arg. \n\nTHis shit is not CREDIBLE!,1,0,1,0,0,0
56104,95ebdc00350058cb,dhdhhdfh \n\ndeleting an account isnt going to...,0,0,0,0,0,0
75437,c9cf6a055dfac6ee,"""\n Comment redacted. We'll all be reduced to ...",0,0,0,0,0,0
137334,dece133d021adb75,"""\n\n The borders of the new state were not sp...",0,0,0,0,0,0
59050,9e2545dd971e7ed0,}}\n{{Old AfD multi|page=Wee Shu Min elitism c...,0,0,0,0,0,0
60829,a2d9eab4c472e2cb,"Of course, it's an OR review of mine, just to ...",0,0,0,0,0,0
107437,3e571977469fe055,"I don't give a damn about religion, but it's o...",1,0,0,0,0,0


In [4]:
toxic_test = pd.read_csv('test.csv')

In [5]:
toxic_test.head(10)

Unnamed: 0,id,content,lang
0,0,Doctor Who adlı viki başlığına 12. doctor olar...,tr
1,1,"Вполне возможно, но я пока не вижу необходимо...",ru
2,2,"Quindi tu sei uno di quelli conservativi , ...",it
3,3,Malesef gerçekleştirilmedi ancak şöyle bir şey...,tr
4,4,:Resim:Seldabagcan.jpg resminde kaynak sorunu ...,tr
5,5,Le truc le plus important dans ta tirade c est...,fr
6,6,"20px Caro editor, encontramos problemas na edi...",pt
7,7,el skate es unos de los deportes favoritos de ...,es
8,8,Me doy la bienvenida. A este usuari le gusta c...,es
9,9,"ES NOTABLEMENTE TENDENCIOSO, NO SE HABLA DE CU...",es


Посчитаем статистики.

In [15]:
def words_count(text):
    text = word_tokenize(text)
    text = [word.strip(punctuation) for word in text]
    text = [word for word in text if word != '']
    return len(text)
        
def sents_count(text):
    text = sent_tokenize(text)
    return len(text)
    
def chars_count(text):
    chars = [x for x in text if x not in punctuation]
    return len(chars)
    
def mean_word_length(text):
    text = word_tokenize(text)
    text = [word.strip(punctuation) for word in text]
    text = [word for word in text if word != '']
    lens = [len(x) for x in text]
    return round(np.mean(lens), 2)
    
def mean_sent_length(text):
    text = sent_tokenize(text)
    lens = [len(word_tokenize(s)) for s in text]
    return round(np.mean(lens), 2)

def caps_percentage(text):
    text_len = chars_count(text)
    caps = 0
    for l in text:
        if l != ' ' and l not in punctuation:
            if l != l.lower():
                caps += 1
    if text_len > 0:
        return round(caps / text_len * 100, 2)
    else:
        return 0
    
def punctuation_percentage(text):
    text_len = chars_count(text)
    punc = 0
    for l in text:
        if l in punctuation:
            punc += 1
    if text_len > 0:    
        return round(punc / text_len * 100, 2)
    else:
        return 0
    
def word_repeat_ratio(text):
    text = word_tokenize(text)
    text = [word.strip(punctuation) for word in text]
    words = []
    repetitions = []
    for word in text:
        if word not in words:
            words.append(word)
        else:
            if word not in repetitions:
                repetitions.append(word)
    if len(words) > 0:
        return round(len(repetitions) / len(words) * 100, 2)
    else:
        return 0
    
def character_repeat(text):
    text = word_tokenize(text)
    text = [word.strip(punctuation) for word in text]
    text = [word for word in text if word != '']
    words_with_repetitions = 0
    for word in text:
        res = re.findall(r'(\w)\1+', word)
        if res != []:
            words_with_repetitions += 1
    if len(text) > 0:
        return round(words_with_repetitions / len(text) * 100, 2)
    else:
        return 0
    
def punctuation_repeat(text):
    res = re.findall("[?!.]+", text)
    length = 0
    for r in res:
        if len(r) > length:
            length = len(r)
    return length

Обучающая выборка:

In [None]:
toxic['words_count'] = toxic['comment_text'].apply(words_count)
toxic['sents_count'] = toxic['comment_text'].apply(sents_count) 
toxic['chars_count'] = toxic['comment_text'].apply(chars_count)   
toxic['mean_word_length'] = toxic['comment_text'].apply(mean_word_length) 
toxic['mean_sent_length'] = toxic['comment_text'].apply(mean_sent_length) 
toxic['caps_percentage'] = toxic['comment_text'].apply(caps_percentage)   
toxic['punctuation_percentage'] = toxic['comment_text'].apply(punctuation_percentage) 
toxic['word_repeat_ratio'] = toxic['comment_text'].apply(word_repeat_ratio)  
toxic['character_repeat'] = toxic['comment_text'].apply(character_repeat)   
toxic['punctuation_repeat'] = toxic['comment_text'].apply(punctuation_repeat)    

In [17]:
toxic.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,words_count,sents_count,chars_count,mean_word_length,mean_sent_length,caps_percentage,punctuation_percentage,word_repeat_ratio,character_repeat,punctuation_repeat
202401,aac04d8feb3e4b39,""" \n :Left more detail under the review at Tal...",0,0,0,0,0,0,13,1,86,5.38,19.0,9.3,9.3,7.14,7.69,1
84162,e12a43cefe51d6fe,"""\n\nSpeedy deletion of """"Shrini""""\n A page yo...",0,0,0,0,0,0,123,7,730,4.88,20.71,1.23,3.15,26.44,8.13,1
62253,a6956e341824c2b4,")\na cowards site, that must stop changing thi...",1,0,1,0,0,0,81,6,410,4.02,17.17,28.78,5.85,14.29,6.17,3
113431,5e98a73ecce25fcd,Arg. \n\nTHis shit is not CREDIBLE!,1,0,1,0,0,0,6,2,31,4.0,4.0,35.48,6.45,14.29,0.0,1
56104,95ebdc00350058cb,dhdhhdfh \n\ndeleting an account isnt going to...,0,0,0,0,0,0,9,1,56,5.11,10.0,0.0,1.79,0.0,22.22,1
75437,c9cf6a055dfac6ee,"""\n Comment redacted. We'll all be reduced to ...",0,0,0,0,0,0,49,3,249,4.08,19.33,4.82,5.22,19.05,12.24,1
137334,dece133d021adb75,"""\n\n The borders of the new state were not sp...",0,0,0,0,0,0,87,7,472,4.34,15.29,3.81,4.45,43.24,2.3,1
59050,9e2545dd971e7ed0,}}\n{{Old AfD multi|page=Wee Shu Min elitism c...,0,0,0,0,0,0,9,1,79,8.44,13.0,8.86,12.66,10.0,22.22,0
60829,a2d9eab4c472e2cb,"Of course, it's an OR review of mine, just to ...",0,0,0,0,0,0,109,7,601,4.55,18.0,2.5,3.16,24.42,8.26,1
107437,3e571977469fe055,"I don't give a damn about religion, but it's o...",1,0,0,0,0,0,60,2,327,4.55,33.0,2.75,3.06,22.92,1.67,1


Тестовая выборка:

In [None]:
toxic_test['words_count'] = toxic_test['content'].apply(words_count)
toxic_test['sents_count'] = toxic_test['content'].apply(sents_count) 
toxic_test['chars_count'] = toxic_test['content'].apply(chars_count)
toxic_test['mean_word_length'] = toxic_test['content'].apply(mean_word_length) 
toxic_test['mean_sent_length'] = toxic_test['content'].apply(mean_sent_length) 
toxic_test['caps_percentage'] = toxic_test['content'].apply(caps_percentage)   
toxic_test['punctuation_percentage'] = toxic_test['content'].apply(punctuation_percentage) 
toxic_test['word_repeat_ratio'] = toxic_test['content'].apply(word_repeat_ratio)  
toxic_test['character_repeat'] = toxic_test['content'].apply(character_repeat)  
toxic_test['punctuation_repeat'] = toxic_test['content'].apply(punctuation_repeat)    

In [20]:
toxic_test.head(10)

Unnamed: 0,id,content,lang,words_count,sents_count,chars_count,mean_word_length,mean_sent_length,caps_percentage,punctuation_percentage,word_repeat_ratio,character_repeat,punctuation_repeat
0,0,Doctor Who adlı viki başlığına 12. doctor olar...,tr,19,4,137,6.21,5.5,3.65,2.92,10.53,10.53,1
1,1,"Вполне возможно, но я пока не вижу необходимо...",ru,76,6,456,5.01,14.5,3.73,2.63,15.87,3.95,1
2,2,"Quindi tu sei uno di quelli conservativi , ...",it,49,7,279,4.49,9.29,3.58,7.89,19.05,20.41,3
3,3,Malesef gerçekleştirilmedi ancak şöyle bir şey...,tr,81,13,629,6.43,7.62,2.38,3.02,20.9,9.88,1
4,4,:Resim:Seldabagcan.jpg resminde kaynak sorunu ...,tr,175,17,1513,7.35,12.0,1.85,2.18,35.71,9.71,1
5,5,Le truc le plus important dans ta tirade c est...,fr,32,3,161,3.91,11.33,1.86,1.24,13.33,0.0,1
6,6,"20px Caro editor, encontramos problemas na edi...",pt,70,6,434,5.06,12.83,3.92,2.07,21.57,2.86,1
7,7,el skate es unos de los deportes favoritos de ...,es,27,1,142,4.3,27.0,0.0,0.0,20.0,3.7,0
8,8,Me doy la bienvenida. A este usuari le gusta c...,es,29,4,145,4.07,9.0,4.14,5.52,3.33,0.0,1
9,9,"ES NOTABLEMENTE TENDENCIOSO, NO SE HABLA DE CU...",es,48,1,267,4.58,53.0,82.02,1.87,23.68,2.08,1


Сравним средние статистик по двум выборкам.

In [27]:
comparison = pd.DataFrame({'dataset':['train', 'test'],
                          'words_count':[toxic['words_count'].mean(), toxic_test['words_count'].mean()],
                          'sents_count':[toxic['sents_count'].mean(), toxic_test['sents_count'].mean()],
                          'chars_count':[toxic['chars_count'].mean(), toxic_test['chars_count'].mean()],
                          'mean_word_length':[toxic['mean_word_length'].mean(), toxic_test['mean_word_length'].mean()],
                          'mean_sent_length':[toxic['mean_sent_length'].mean(), toxic_test['mean_sent_length'].mean()],
                          'caps_percentage':[toxic['caps_percentage'].mean(), toxic_test['caps_percentage'].mean()],
                          'punctuation_percentage':[toxic['punctuation_percentage'].mean(), toxic_test['punctuation_percentage'].mean()],
                          'word_repeat_ratio':[toxic['word_repeat_ratio'].mean(), toxic_test['word_repeat_ratio'].mean()],
                          'character_repeat':[toxic['character_repeat'].mean(), toxic_test['character_repeat'].mean()],
                          'punctuation_repeat':[toxic['punctuation_repeat'].mean(), toxic_test['punctuation_repeat'].mean()]
                          })

In [28]:
comparison

Unnamed: 0,dataset,words_count,sents_count,chars_count,mean_word_length,mean_sent_length,caps_percentage,punctuation_percentage,word_repeat_ratio,character_repeat,punctuation_repeat
0,train,66.570148,4.301648,372.244524,4.656768,18.462214,5.585575,6.070397,17.015188,9.833525,1.476568
1,test,58.186172,4.472638,365.822557,5.271352,18.248322,4.521486,3.72261,13.515397,7.366358,1.635586


Заметно, что на тестовой выборке несколько меньше среднее количество слов и символов в комментарии, процент капса, пунктуации, повторяющихся слов и символов. Незначительно больше у неё значение только в средней длине комментария в предложениях.

Теперь посмотрим на языки, представленные в тестовой выборке.

In [30]:
toxic_test['lang'].value_counts()

tr    14000
pt    11012
ru    10948
fr    10920
it     8494
es     8438
Name: lang, dtype: int64

Больше всего комментариев на турецком, меньше всего — на испанском. 
Посмотрим, различаются ли как-то метрики между языками.

In [31]:
toxic_test.groupby('lang').agg(['mean'])

Unnamed: 0_level_0,id,words_count,sents_count,chars_count,mean_word_length,mean_sent_length,caps_percentage,punctuation_percentage,word_repeat_ratio,character_repeat,punctuation_repeat
Unnamed: 0_level_1,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
lang,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
es,32405.344039,52.685945,3.1799,298.016947,4.663715,23.455702,6.975967,3.348764,15.611833,3.68733,1.655487
fr,31676.724176,67.026282,4.105495,376.191758,4.567502,21.315888,3.796771,3.591158,16.221549,8.545807,1.645513
it,31916.568048,57.35943,3.755121,340.656463,4.876001,20.535257,3.87312,3.973826,13.380492,15.59778,1.956911
pt,31795.557937,63.962768,4.818289,381.856974,4.910379,17.569461,4.819306,4.205549,15.268585,5.302015,1.750363
ru,31952.666423,53.267811,4.678754,349.104859,5.503438,15.687345,3.929187,4.281545,11.442157,6.042558,1.454238
tr,31825.56,54.41,5.540429,414.331643,6.52889,13.8662,4.229705,3.081095,10.465159,6.328634,1.472429


Из любопытных наблюдений можно отметить, что самые длинные комментарии оставляли на турецком, но при этом средняя длина предложения у них самая низкая, а у испанского языка ситуация диаметрально противоположная. Кроме того, в итальянском значительно чаще, чем где-либо еще, встречались повторения символов, а также капс.

In [52]:
toxic.to_csv(r'toxic_sample.csv', index = False)

### Baseline модель

Для этой части выбрала пункт за 1.5 балла (бейзлайн модель из sklearn (векторайзер + модель) c подбором параметров в grid_search (как минимум 10 параметров)).

In [119]:
X = toxic.comment_text.values

In [120]:
y = toxic.toxic.values

In [121]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=5)

In [26]:
count_vect = CountVectorizer(max_features=10000, min_df=0.01, max_df=0.4)

In [130]:
X_train_cv = count_vect.fit_transform(X_train)
X_test_cv = count_vect.transform(X_test)

In [123]:
def bestparams(model, grid, folds, data, classes):
    grid_search = GridSearchCV(model, param_grid=grid, cv=folds, scoring='f1_macro')
    grid_search.fit(data, classes) 
    return grid_search.best_score_, grid_search.best_params_, grid_search.best_estimator_

In [124]:
models = [LogisticRegression(), LinearSVC(), SGDClassifier()]

In [125]:
grids = [{'class_weight' : ['balanced', None], 'C': [0.0001, 0.001, 0.001, 0.01, 1, 10,
                                                    100, 1000], 'max_iter': [500, 700]},
        {'loss' : ['hinge', 'squared_hinge'], 'C': [0.0001, 0.001, 0.001, 0.01, 1, 10,
                                                    100, 1000], 'intercept_scaling' : [1, 2]},
        {'alpha': [0.0001, 0.05, 0.1], 'max_iter': [200, 300]}]

In [126]:
n_fold = 6
folds = KFold(n_splits=n_fold, shuffle=True, random_state=0)

In [127]:
chosenmodels = []
trainscores = []

In [131]:
for i in range(3):
    best_score, best_params, best_estimator = bestparams(models[i], grids[i], folds, X_train_cv, y_train)
    print('Best score is {}'.format(best_score))
    print('Best parameters are {}'.format(best_params))
    trainscores.append(best_score)
    chosenmodels.append(best_estimator)

Best score is 0.7214350733537795
Best parameters are {'C': 100, 'class_weight': None, 'max_iter': 500}




Best score is 0.7219616766814392
Best parameters are {'C': 10, 'intercept_scaling': 2, 'loss': 'squared_hinge'}
Best score is 0.7132063761331536
Best parameters are {'alpha': 0.0001, 'max_iter': 200}


Самый лучший результат показала модель с LinearSVC и параметрами C=10, intercept_scaling=2, loss=squared_hinge.

### Ансамбли

Здесь выбрала часть за 2 балла (ансамбль из моделей в sklearn).

In [151]:
clf1 = MultinomialNB()
clf2 = LogisticRegression(C=100, class_weight=None, max_iter=500)
clf3 = GaussianNB()
clf4 = SGDClassifier(loss='log', alpha=0.0001, max_iter=200)
clf5 = DecisionTreeClassifier()

In [152]:
eclf = VotingClassifier(estimators=[('clf1', clf1), ('clf2', clf2), ('clf3', clf3), ('clf4', clf4),
                                    ('clf5', clf5)], voting='soft')

In [153]:
%%time
voting = Pipeline([
    ('vect', CountVectorizer(analyzer='word', max_features=500)),
    ('tfidf', TfidfTransformer(sublinear_tf=True)),
    ('to_dense', FunctionTransformer(lambda x: x.todense(), accept_sparse=True)), 
    ('clf', eclf),
    ])
voting = voting.fit(X_train[:20000], y_train[:20000])

CPU times: user 17.4 s, sys: 205 ms, total: 17.6 s
Wall time: 16 s


In [154]:
predict = voting.predict(X_test)

In [156]:
print("Precision: {0:6.2f}".format(precision_score(y_test, predict, average='macro')))
print("Recall: {0:6.2f}".format(recall_score(y_test, predict, average='macro')))
print("F1-measure: {0:6.2f}".format(f1_score(y_test, predict, average='macro')))
print("Accuracy: {0:6.2f}".format(accuracy_score(y_test, predict)))

Precision:   0.82
Recall:   0.72
F1-measure:   0.76
Accuracy:   0.93


### NN

Любая нейронная модель (минимум 5 слоев) с Dropout, Pooling и колбеками - 2 балла.

In [5]:
def f1(y_true, y_pred):
    def recall(y_true, y_pred):

        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):

        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [6]:
def preprocess(text):
    tokens = text.lower().split()
    tokens = [token.strip(punctuation) for token in tokens]
    return tokens

In [7]:
vocab = Counter()
for text in toxic['comment_text']:
    vocab.update(preprocess(text))

In [8]:
filtered_vocab = set()
for word in vocab:
    if vocab[word] > 2:
        filtered_vocab.add(word)

In [9]:
word2id = {'UNK':1, 'PAD':0}

for word in filtered_vocab:
    word2id[word] = len(word2id)

In [10]:
id2word = {i:word for word, i in word2id.items()}

In [11]:
X = []

for text in toxic['comment_text']:
    tokens = preprocess(text)
    ids = [word2id.get(token, 1) for token in tokens]
    X.append(ids)

In [12]:
MAX_LEN = max(len(x) for x in X)

In [13]:
MEAN_LEN = np.median([len(x) for x in X])

In [14]:
X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen=MAX_LEN)

In [15]:
X.shape

(67065, 1990)

In [16]:
y = toxic.toxic.values

In [17]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.05, random_state=41)

In [18]:
inputs = tf.keras.layers.Input(shape=(MAX_LEN,))
embeddings = tf.keras.layers.Embedding(input_dim=len(word2id), output_dim=40)(inputs)

conv_1 = tf.keras.layers.Conv1D(kernel_size=5, filters=10, strides=2)(embeddings)
conv_2 = tf.keras.layers.Conv1D(kernel_size=5, filters=20, strides=2)(conv_1)
pool_1 = tf.keras.layers.AveragePooling1D()(conv_2)
drop_1 = tf.keras.layers.Dropout(0.1)(pool_1)
conv_3 = tf.keras.layers.Conv1D(kernel_size=5, filters=30, strides=2)(drop_1)
pool_2 = tf.keras.layers.AveragePooling1D()(conv_3)
drop_2 = tf.keras.layers.Dropout(0.1)(pool_2)

concat = tf.keras.layers.Flatten()(drop_1)
dense = tf.keras.layers.Dense(64, activation='relu')(concat)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(dense)

model = tf.keras.Model(inputs=inputs, outputs=outputs)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=[f1])

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [19]:
checkpoint = tf.keras.callbacks.ModelCheckpoint('model.weights', 
                                                monitor='val_f1', 
                                                verbose=1, 
                                                save_weights_only=True, 
                                                save_best_only=True,
                                                mode='max', 
                                                save_freq='epoch' 
                                               )

early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_f1', 
                                              min_delta=0.01, 
                                              patience=3, 
                                              verbose=1, 
                                              mode='max',
                                              )

In [20]:
model.fit(X_train, y_train, 
          validation_data=(X_valid, y_valid),
          batch_size=3000,
          epochs=10,
          callbacks=[checkpoint, early_stop])

Train on 63711 samples, validate on 3354 samples
Epoch 1/10
Epoch 00001: val_f1 improved from -inf to 0.00000, saving model to model.weights
Epoch 2/10
Epoch 00002: val_f1 did not improve from 0.00000
Epoch 3/10
Epoch 00003: val_f1 improved from 0.00000 to 0.02703, saving model to model.weights
Epoch 4/10
Epoch 00004: val_f1 improved from 0.02703 to 0.16525, saving model to model.weights
Epoch 5/10
Epoch 00005: val_f1 improved from 0.16525 to 0.34358, saving model to model.weights
Epoch 6/10
Epoch 00006: val_f1 improved from 0.34358 to 0.43039, saving model to model.weights
Epoch 7/10
Epoch 00007: val_f1 improved from 0.43039 to 0.45147, saving model to model.weights
Epoch 8/10
Epoch 00008: val_f1 improved from 0.45147 to 0.49812, saving model to model.weights
Epoch 9/10
Epoch 00009: val_f1 did not improve from 0.49812
Epoch 10/10
Epoch 00010: val_f1 did not improve from 0.49812


<tensorflow.python.keras.callbacks.History at 0x7fa9627fd190>

In [21]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 1990)]            0         
_________________________________________________________________
embedding (Embedding)        (None, 1990, 40)          1658920   
_________________________________________________________________
conv1d (Conv1D)              (None, 993, 10)           2010      
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 495, 20)           1020      
_________________________________________________________________
average_pooling1d (AveragePo (None, 247, 20)           0         
_________________________________________________________________
dropout (Dropout)            (None, 247, 20)           0         
_________________________________________________________________
flatten (Flatten)            (None, 4940)              0     

### Поиск аутлаеров

In [138]:
sample = toxic.sample(frac=.1)

In [139]:
sample.shape

(6706, 8)

In [140]:
tfidf = TfidfVectorizer(max_features=10000, min_df=5, ngram_range=(1,2), max_df=0.4)

In [136]:
cv = CountVectorizer(min_df=0.02, max_df=0.5, max_features=1000)

In [141]:
X = tfidf.fit_transform(sample['comment_text'])
y = sample['toxic']

In [151]:
pca = PCA(n_components=2)
sample = X
X_pca = pca.fit_transform(sample.toarray())

In [152]:
cluster = DBSCAN(min_samples = 10, eps = 0.02)
cluster.fit(X_pca)

DBSCAN(algorithm='auto', eps=0.02, leaf_size=30, metric='euclidean',
       metric_params=None, min_samples=10, n_jobs=None, p=None)

In [153]:
set(cluster.labels_)

{-1, 0, 1, 2, 3, 4, 5}

In [154]:
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X[:8000], labels[:8000]))
print("Homogeneity: %0.3f" % metrics.homogeneity_score(y, labels)) 
print("Completeness: %0.3f" % metrics.completeness_score(y, labels)) 
print("V-measure: %0.3f" % metrics.v_measure_score(y, labels))
print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(y, labels))
print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(y, labels))

Silhouette Coefficient: -0.255
Homogeneity: 0.183
Completeness: 0.018
V-measure: 0.033
Adjusted Rand Index: 0.002
Adjusted Mutual Information: -0.001


In [155]:
indices = [i for i, label in enumerate(cluster.labels_) if label == -1]

Найти что-то необычное не удалось, но приведенные ниже примеры отличаются либо лаконичностью, либо избыточной длиной, либо использованием необычных ругательств.

In [166]:
toxic.loc[542, ['comment_text', 'toxic']]

comment_text    Hi Fracophonie,\n\nThanks for taking the time to write such a long message - I appreciate this.\n\nIn terms of what I said about the age - I didn't mean to cause offense. I thought I read somewhere that you had to be over 18 to make these decisions but I think I was getting confused with the check user privileges. \n\nI have already read some of those policies but will read the rest later today. I am not a troll - I just need time to get used to all these acronyms, policies etc.\n\nFrom what started out with me writing an article about my favourite website has turned into a massive thing which was not what I was expecting.\n\nI really don't have any more time to argue and debate with the usrs since its clear they just want it deleted despite what I have said. For example one user just wrote there is another site called Amirite.net. This is Amirite.com!! This is precisely why I had to keep responding on the AFC delete page. \n\nIn terms of the rest I agree to do them (of

In [173]:
toxic.loc[688, ['comment_text']]

comment_text    "\n\n Past Masters vs. Rarities \n\nI'm creating a chart for my own use that includes U.S. albums, since there are so many differences in the early albums. As a U.S. fan, I'm faced with the issue of whether to have U.S. albums, U.K. albums or both on my iPod? I will post my chart on my talk page so that others can judge whether it's worthy of including here. I realize there will be problems, including the lack of Canadian albums. My purpose is simply to record the first ALBUM appearance in both the U.K. and U.S. of each song.\n\nI notice that a number of U.K. songs are listed as Past Masters when they actually first appeared on Rarities (1978 The Beatles album). I don't know the reason for this if the purpose is to list the first appearance on an album. Can someone explain? Or should they be changed to Rarities?  "
Name: 688, dtype: object

In [177]:
toxic.loc[947, ['comment_text']]

comment_text    Our Talk Archives: 1 2 3 4 5 6 7
Name: 947, dtype: object

In [178]:
toxic.loc[951, ['comment_text']]

comment_text    "\nI understand your position and think that your point is perfectly valid. However, I don't deem ""Iaşi-Chişinău"" a creation of Wikipedia, even with the lack of historical sources for that effect in the English language. I'll think more about this tomorrow. Thank you for explaining your position to me with such accurate detail. Best regards, nd "
Name: 951, dtype: object

In [194]:
toxic.loc[1379, ['comment_text']]

comment_text    "I'm back. I already created  ID (occasionally forget to log in but you can see the same class C when I do) I collected a massive number of diff edits on Balkanfever who pretends to be neutral but is  on record] getting blocked for saying this about Greeks under a previous handle. (not to mention his personal talk page is one big anti-Greek rant fest pointing to questionable FYROM news sources with highly exaggerated sensationalist titles)\n\n""No, Assfuckers (has a nice ring to it D) use it as a pejorative term. \n\nI was ready to pull the trigger on both him, admin Futper, and a few others (e.g. Macedoniaboy who says he is a ""proud fighter for united Macedonia on his talk page) with a detailed complaint about anti-Greek propaganda but since Futper managed a civil discussion with me and now seems to be sticking to debating articles (rather than threats of blocking)  I'm going to avoid escalating this further for now. ns. \n\nAs for the current naming disputing article

In [195]:
toxic.loc[1383, ['comment_text']]

comment_text    Can you, or anyone, please tell me what's going on? Email me or something, whatever it takes. This is ridiculous. I don't care for games. ++: t/c
Name: 1383, dtype: object

In [200]:
toxic.loc[1529, ['comment_text']]

comment_text    It was not been written as a commercial work, so is not copyrighted, and in the Public domain.
Name: 1529, dtype: object

In [204]:
toxic.loc[1636, ['comment_text']]

comment_text    "\n\n Summary of the AfD Debate \n\nNegative Votes\n\n delete - The original editor was User:Ivygohnair so it was marked as a vanity article for violating WP:AUTO.  Mapetite526 \n Strong Delete - Ignorance of the rules is not a valid argument for keeping an article. Vyse \n\nNeutral Positive Votes\n\n Procedural nomination. Speedy A7 was applied, but was contested and this warrants a further look. I'm neutral for now (even though this article doesn't seem to be). ColourBurst \n\n Comment/Question:CSD A7 is ""Unremarkable people, groups, companies and websites."". Is that the reason for the AfD too? According to Wikipedia:Vanity_guidelines ""As explained below, an author's conflict of interest by itself is not a basis for deletion, but lack of assertion of notability is."" Edward Wakelin \n \n It is true that I have come into the fray to defend Ivy Goh Nair from speedy deletion and that her last page was actually uploaded by me. I think if you want to apply the ""vanity"

In [218]:
toxic.loc[1976, ['comment_text']]

comment_text    Fartsalot56 says f**k you motherclucker!!
Name: 1976, dtype: object

In [227]:
toxic.loc[2229, ['comment_text']]

comment_text    "\n\nYou have been blocked for 24 hours.  (Talk)  (Contribs) "
Name: 2229, dtype: object

In [237]:
toxic.loc[2447, ['comment_text']]

comment_text    Attention Wikipedia Administrators \n\nUnder mentioned data is for your observations.\n\nThis talk page have been misused by the so called intelligent editors, which can be seen by the Archive of this page. \nArchive-1, have 18 printed pages of A4 size.\nArchive-2, have 21 printed pages of A4 size.\nArchive-3, have 28 printed pages of A4 size.\nArchive-4, have 40 printed pages of A4 size.\nArchive-5, have 44 printed pages of A4 size.\nArchive-6, have 57 printed pages of A4 size.\nArchive-7, have 31 printed pages of A4 size.\nArchive-8, have 35 printed pages of A4 size.\nArchive-9, have 29 printed pages of A4 size.\nArchive-10, have 32 printed pages of A4 size.\nArchive-11, have 40 printed pages of A4 size.\nArchive-12, have 15 printed pages of A4 size.\nArchive-13, have 47 printed pages of A4 size.\n\nThe current talk pages have 25 printed pages material of A4 size paper. Total 464 [ Four Hundred Sixty Four] pages approximately have been used in the discussion. What is 

In [246]:
toxic.loc[2748, ['comment_text']]

comment_text    [name of possible attack site]
Name: 2748, dtype: object

In [251]:
toxic.loc[2980, ['comment_text']]

comment_text    You do not know that we have two different categories, 7th octave and whistle register? YOU CREATED THEM. See Minnie Ripperton for an artist with links to both cats.
Name: 2980, dtype: object

In [257]:
toxic.loc[3257, ['comment_text']]

comment_text    Wow... \n\nSomeone sure is aggressive...XP
Name: 3257, dtype: object

In [275]:
toxic.loc[3616, ['comment_text']]

comment_text    NB:  has received a 24h block for WP:3RR violation.
Name: 3616, dtype: object

In [291]:
toxic.loc[4128, ['comment_text']]

comment_text    POV\nWhy is Wikipedia deciding how many times he should have shot intruders??   Why don't you just stop being what you're kNOWN for, Wiki, BIAS.  Just tell the facts.
Name: 4128, dtype: object

In [306]:
toxic.loc[4678, ['comment_text']]

Name: 4678, dtype: object

In [315]:
toxic.loc[5046, ['comment_text']]

comment_text    "\n\n Thanks! \n\nThanks for the barnstar! That was unexpected and certainly not necessary, but very much appreciated! Yeah, I guess the Wikicup has got my GA and DYK juices really flowing this month, lol, I appreciate your reviews, your support and your kind words! — ter Ka "
Name: 5046, dtype: object

In [321]:
toxic.loc[5327, ['comment_text']]

comment_text    So, is it totally dead or we have a chance to see it sometime?
Name: 5327, dtype: object

In [332]:
toxic.loc[5805, ['comment_text']]

comment_text    Now all you need to do is press unblock. Please let me edit again. I won't do any of this ever again.
Name: 5805, dtype: object