In [16]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import *

Sample dummy submission:

In [108]:
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

File with answers:

In [128]:
y_test = pd.read_csv('test_data_labels.csv')

In [117]:
y_test.sample()

Unnamed: 0,comment_id,toxic
2930,2930,0


In [129]:
y_test = list(y_test['toxic'])

In [109]:
sample_submission = [[i, randint(0,1)] for i in range(len(test))]
sample_submission = pd.DataFrame(sample_submission, columns=['comment_id', 'toxic'])

sample_submission.head()

Unnamed: 0,comment_id,toxic
0,0,0
1,1,1
2,2,0
3,3,1
4,4,1


In [84]:
sample_submission.to_csv('sample_submission.csv', index=False)

In [112]:
train['toxic'] = train.toxic.astype(int)

In [118]:
y_train = train['toxic']

##  baseline 1: no preprocessing + bow -> 4

In [119]:
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_extraction.text import CountVectorizer

In [120]:
vec = CountVectorizer(ngram_range=(1, 1)) # строим BoW для слов
bow = vec.fit_transform(train['comment'])

In [121]:
list(vec.vocabulary_.items())[:10]

[('преступление', 37547),
 ('наказание', 25277),
 ('именно', 16896),
 ('эти', 56140),
 ('неработающие', 27026),
 ('весы', 6226),
 ('показывают', 34999),
 ('что', 54839),
 ('работающих', 40401),
 ('нет', 27146)]

In [122]:
clf = LogisticRegression(random_state=42, max_iter=500)
clf.fit(bow, y_train)

LogisticRegression(max_iter=500, random_state=42)

In [130]:
pred = clf.predict(vec.transform(test['comment']))
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

           0       0.94      0.85      0.89      2628
           1       0.68      0.85      0.75       975

    accuracy                           0.85      3603
   macro avg       0.81      0.85      0.82      3603
weighted avg       0.87      0.85      0.85      3603



In [131]:
accuracy_score(pred, y_test)

0.848737163474882

In [90]:
baseline_4 = [[i, pred[i]] for i in range(len(test))]
baseline_4 = pd.DataFrame(baseline_4, columns=['comment_id', 'toxic'])

baseline_4.head()

Unnamed: 0,comment_id,toxic
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [92]:
baseline_4.to_csv('baseline_4.csv', index=False)

## baseline 2: preprocessing + bow -> 5

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
import re
from pymorphy2 import MorphAnalyzer
from functools import lru_cache
from nltk.corpus import stopwords

m = MorphAnalyzer()
regex = re.compile("[А-Яа-яA-z]+")

def words_only(text, regex=regex):
    try:
        return regex.findall(text.lower())
    except:
        return []

In [23]:
@lru_cache(maxsize=128)
def lemmatize_word(token, pymorphy=m):
    return pymorphy.parse(token)[0].normal_form

def lemmatize_text(text):
    return [lemmatize_word(w) for w in text]


mystopwords = stopwords.words('russian') 
def remove_stopwords(lemmas, stopwords = mystopwords):
    return [w for w in lemmas if not w in stopwords and len(w) > 3]

def clean_text(text):
    tokens = words_only(text)
    lemmas = lemmatize_text(tokens)
    
    return ' '.join(remove_stopwords(lemmas))

In [26]:
from tqdm import tqdm

lemmas = list(tqdm(map(clean_text, train['comment']), total=len(train)))
    
train['lemmas'] = lemmas
train.sample(5)

100%|██████████| 10809/10809 [00:34<00:00, 314.29it/s]


Unnamed: 0,comment,toxic,lemmas
8848,Видимо за очень дешево и черти где. У нас в Тю...,0,видимо очень дёшево чёрт тюмень вообще продава...
2605,"Ебать, безумие какое. Может чувак набухался и ...",1,ебать безумие мочь чувак набухаться решить суи...
3879,Спасибо за информацию. Я недавно делала немецк...,0,спасибо информация недавно делать немецкий заг...
8426,Это вам повезло похоже. Город с конца декабря ...,0,повезти похоже город конец декабрь февраль сто...
3068,Бутылированный хохол 100\n,1,бутылировать хохол


In [27]:
lemmas_test = list(tqdm(map(clean_text, test['comment']), total=len(test)))
    
test['lemmas'] = lemmas_test

100%|██████████| 3603/3603 [00:11<00:00, 310.68it/s]


In [93]:
vec = CountVectorizer(ngram_range=(1, 2)) # строим BoW для слов
bow = vec.fit_transform(train['lemmas'])

clf = LogisticRegression(random_state=42, max_iter=500)
clf.fit(bow, y_train)

pred = clf.predict(vec.transform(test['lemmas']))
accuracy_score(pred, y_test)

0.8573411046350263

In [95]:
baseline_5 = [[i, pred[i]] for i in range(len(test))]
baseline_5 = pd.DataFrame(baseline_5, columns=['comment_id', 'toxic'])

baseline_5.to_csv('baseline_5.csv', index=False)

## baseline 3: preproc + fasttext -> 6

In [34]:
import fasttext

In [39]:
with open('train_ft.txt', 'w') as f:
    for pair in list(zip(train['lemmas'], train['toxic'])):
        text, label = pair
        f.write(f'__label__{label} {text.lower()}\n')

In [41]:
with open('test_ft.txt', 'w') as f:
    for pair in list(zip(test['lemmas'], test['toxic'])):
        text, label = pair
        f.write(f'__label__{label} {text.lower()}\n')

In [99]:
classifier = fasttext.train_supervised('train_ft.txt')#, 'model')
result = classifier.test('test_ft.txt')
print('P@1:', result[1])#.precision)
print('R@1:', result[2])#.recall)
print('Number of examples:', result[0])#.nexamples)

P@1: 0.8739938939772411
R@1: 0.8739938939772411
Number of examples: 3603


In [100]:
pred = classifier.predict(list(test['lemmas']))[0]
pred = [int(label[0][-1]) for label in pred]

accuracy_score(list(y_test), pred)

0.8739938939772411

In [101]:
baseline_6 = [[i, pred[i]] for i in range(len(test))]
baseline_6 = pd.DataFrame(baseline_6, columns=['comment_id', 'toxic'])

baseline_6.to_csv('baseline_6.csv', index=False)