In [1]:
import numpy as np
import pandas as pd

In [2]:
import re
import spacy
import string
from string import digits
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from catboost import CatBoostClassifier

In [30]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, make_scorer
from sklearn.utils import shuffle

In [5]:
import torch
import transformers
from tqdm import notebook

In [20]:
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\OB\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\OB\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\OB\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device == torch.device('cpu'):
    print('Using cpu')
else:
    n_gpu = torch.cuda.device_count()
    print('Using {} GPUs'.format(torch.cuda.get_device_name(0)))

Using GeForce RTX 3080 GPUs


# Cleaning the data

In [21]:
df_open = pd.read_csv('C:/Users/OB/Desktop/projects_to_do/project_7/toxic_comments.csv')
display(df_open)

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0
...,...,...
159566,""":::::And for the second time of asking, when ...",0
159567,You should be ashamed of yourself \n\nThat is ...,0
159568,"Spitzer \n\nUmm, theres no actual article for ...",0
159569,And it looks like it was actually you who put ...,0


In [18]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }

In [22]:
PUNCT_TO_REMOVE = string.punctuation
STOPWORDS = set(stopwords.words('english'))

stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])
def remove_numbers(text):
    return text.translate(str.maketrans('', '', digits))
def known_contractions(text):
    for word in text.split():
        if word.lower() in contraction_mapping:
            text = text.replace(word, contraction_mapping[word.lower()])
    return text
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])
def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

In [23]:
%%time
df = df_open.copy()
df['text']=df['text'].astype(str) #fix the format
df['text']=df['text'].str.lower() #lower the strings
df['text']=df['text'].apply(lambda text: remove_numbers(text)) #delete numbers
df['text']=df['text'].apply(lambda text: known_contractions(text)) #check the grammar
df['text']=df['text'].replace('https?:\/\/.*\/\w*', '', regex=True) #delete hyperlinks
df['text']=df['text'].replace('#', ' ', regex=True) #delete hashtags
df['text']=df['text'].replace('\@\w*', '', regex=True) #delete quotes
df['text']=df['text'].replace('\$\w*', '', regex=True) #delete tickers
df['text']=df['text'].apply(lambda text: remove_punctuation(text)) #delete punktuation
df['text']=df['text'].apply(lambda text: remove_stopwords(text)) #delete stopwords
df['text']=df['text'].replace('\&*[amp]*\;|gt+', '', regex=True) #delete quotes
df['text']=df['text'].replace('\s+rt\s+', '', regex=True) #delete RT
df['text']=df['text'].replace('[\n\t\r]+', ' ', regex=True) #delete linebreak, tab, return
df['text']=df['text'].replace('via+\s', '', regex=True) #delete via
df['text']=df['text'].replace('\s+\s+', ' ', regex=True) #убираем hyperspaces
display(df)

Unnamed: 0,text,toxic
0,explanation edits made username hardcore metal...,0
1,daww matches background colour seemingly stuck...,0
2,hey man really trying edit war guy constantly ...,0
3,cannot make real suggestions improvement wonde...,0
4,sir hero chance remember page,0
...,...,...
159566,second time asking view completely contradicts...,0
159567,ashamed horrible thing put talk page,0
159568,spitzer umm theres actual article prostitution...,0
159569,looks like actually put speedy first version d...,0


Wall time: 8.22 s


In [24]:
%%time
df['text_stems'] = df['text'].apply(stem_words)
df['text_lemmas'] = df['text'].apply(lemmatize_words)
display(df)

Unnamed: 0,text,toxic,text_stems,text_lemmas
0,explanation edits made username hardcore metal...,0,explan edit made usernam hardcor metallica fan...,explanation edits make username hardcore metal...
1,daww matches background colour seemingly stuck...,0,daww match background colour seem stuck thank ...,daww match background colour seemingly stuck t...
2,hey man really trying edit war guy constantly ...,0,hey man realli tri edit war guy constant remov...,hey man really try edit war guy constantly rem...
3,cannot make real suggestions improvement wonde...,0,cannot make real suggest improv wonder section...,cannot make real suggestion improvement wonder...
4,sir hero chance remember page,0,sir hero chanc rememb page,sir hero chance remember page
...,...,...,...,...
159566,second time asking view completely contradicts...,0,second time ask view complet contradict covera...,second time ask view completely contradict cov...
159567,ashamed horrible thing put talk page,0,asham horribl thing put talk page,ashamed horrible thing put talk page
159568,spitzer umm theres actual article prostitution...,0,spitzer umm there actual articl prostitut ring...,spitzer umm there actual article prostitution ...
159569,looks like actually put speedy first version d...,0,look like actual put speedi first version dele...,look like actually put speedy first version de...


Wall time: 4min 52s


In [25]:
features = df['text_lemmas']
target = df['toxic']
features_train, features_test, target_train, target_test = train_test_split(
    features, target, test_size=0.2, random_state=123455)

In [26]:
count_tf_idf = TfidfVectorizer()
tf_idf_train_feautures = count_tf_idf.fit_transform(features_train)
tf_idf_test_feautures = count_tf_idf.transform(features_test)
#print(tf_idf_train_feautures)
#print(tf_idf_test_feautures)
print("Matrix shape:", tf_idf_train_feautures.shape)
print("Matrix shape:", tf_idf_test_feautures.shape)
print("Matrix shape:", target_train.shape)
print("Matrix shape:", target_test.shape)

Размер матрицы: (127656, 181158)
Размер матрицы: (31915, 181158)
Размер матрицы: (127656,)
Размер матрицы: (31915,)


# General TF-IDF + ML approach

In [27]:
model = LogisticRegression(random_state=12345,solver='liblinear',class_weight='balanced')
model.fit(tf_idf_train_feautures, target_train)
predictions = model.predict(tf_idf_test_feautures)
print("Matrix shape:", predictions.shape)
print("Matrix shape:", target_test.shape)
f1 = f1_score(target_test, predictions)
print("f1 log:",f1)

Matrix shape: (31915,)
Matrix shape: (31915,)
f1 log: 0.7485461091110496


In [28]:
params = {
    "loss" : ["hinge", "log", "squared_hinge", "modified_huber"],
    "alpha" : [0.0001, 0.001, 0.01, 0.1],
    "penalty" : ["l2", "l1", "none"],
}

modelsgd = SGDClassifier(max_iter=1000,shuffle=True,random_state=12345)
grid_search_sgd = GridSearchCV(modelsgd, param_grid=params)
grid_search_sgd.fit(tf_idf_train_feautures, target_train)
print(grid_search_sgd.best_estimator_)

SGDClassifier(penalty='none', random_state=12345)


In [29]:
sgd = SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='none',
              power_t=0.5, random_state=12345, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)
sgd.fit(tf_idf_train_feautures, target_train)
predictions_sgd = sgd.predict(tf_idf_test_feautures)
print("Matrix shape:", predictions_sgd.shape)
print("Matrix shape:", target_test.shape)
f1_sgd = f1_score(target_test, predictions_sgd)
print("f1 sgd:",f1_sgd)

Размер матрицы: (31915,)
Размер матрицы: (31915,)
f1 sgd: 0.7732629727352682


In [26]:
model_cb = CatBoostClassifier()
model_cb_index = [0,1,2,3,4,5,6]
params_dist_cb = {'iterations': [500],
          'learning_rate':[0.01,0.05,0.1],
          'loss_function':['Logloss', 'CrossEntropy'],
          'l2_leaf_reg': np.logspace(-20, -19, 3),
          'leaf_estimation_iterations': [10],
           'eval_metric': ['F1'],
           'use_best_model': ['True'],
          'logging_level':['Silent'],
          'random_seed': [123455]
         }
grid_search_cb = GridSearchCV(model_cb, params_dist_cb, scoring=make_scorer(f1_score))
grid_search_cb.fit(tf_idf_train_feautures, target_train)
print(grid_search_cb.best_estimator_)

In [27]:
cb = CatBoostClassifier(iterations=500, learning_rate = 0.01,loss_function='CrossEntropy',random_seed=42)
cb.fit(tf_idf_train_feautures, target_train)
predictions_cb = cb.predict(tf_idf_test_feautures)
print("Matrix shape:", predictions_cb.shape)
print("Matrix shape:", target_test.shape)
f1_cb = f1_score(target_test, predictions_cb)
print("f1 cb:",f1_cb)

0:	learn: 0.6824394	total: 5.7s	remaining: 47m 23s
1:	learn: 0.6714687	total: 10.7s	remaining: 44m 23s
2:	learn: 0.6611103	total: 15.6s	remaining: 43m 3s
3:	learn: 0.6514688	total: 20.5s	remaining: 42m 20s
4:	learn: 0.6419060	total: 25.4s	remaining: 41m 53s
5:	learn: 0.6324299	total: 30.3s	remaining: 41m 33s
6:	learn: 0.6231531	total: 35.2s	remaining: 41m 17s
7:	learn: 0.6137622	total: 40.3s	remaining: 41m 17s
8:	learn: 0.6045535	total: 45.2s	remaining: 41m 5s
9:	learn: 0.5955023	total: 50.1s	remaining: 40m 54s
10:	learn: 0.5870111	total: 55.1s	remaining: 40m 48s
11:	learn: 0.5785621	total: 60s	remaining: 40m 39s
12:	learn: 0.5700064	total: 1m 4s	remaining: 40m 34s
13:	learn: 0.5620340	total: 1m 9s	remaining: 40m 25s
14:	learn: 0.5542361	total: 1m 14s	remaining: 40m 20s
15:	learn: 0.5467100	total: 1m 19s	remaining: 40m 16s
16:	learn: 0.5391806	total: 1m 24s	remaining: 40m 11s
17:	learn: 0.5319983	total: 1m 29s	remaining: 40m 4s
18:	learn: 0.5246560	total: 1m 34s	remaining: 40m 2s
19:	l

In [28]:
clf = SVC(kernel='linear',C=0.05,random_state=12345)
clf.fit(tf_idf_train_feautures, target_train)
predictions_clf = clf.predict(tf_idf_test_feautures)
print("Matrix shape:", predictions_clf.shape)
print("Matrix shape:", target_test.shape)
f1_clf = f1_score(target_test, predictions_clf)
print("f1 clf:",f1_clf)

Размер матрицы: (31915,)
Размер матрицы: (31915,)
f1 clf: 0.5835356433458398


In [29]:
knn = KNeighborsClassifier(n_neighbors=100)
knn.fit(tf_idf_train_feautures, target_train)
predictions_knn = knn.predict(tf_idf_test_feautures)
print("Matrix shape:", predictions_knn.shape)
print("Matrix shape:", target_test.shape)
f1_knn = f1_score(target_test, predictions_knn)
print("f1 knn:",f1_knn)

Размер матрицы: (31915,)
Размер матрицы: (31915,)
f1 knn: 0.056303549571603426


# BERT

In [41]:
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
model = transformers.BertModel.from_pretrained('bert-base-uncased')
device = "cuda:0"
model.to(device)
model.train()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [76]:
df_ones = df[df['toxic'] == 1].sample(10000, random_state=123)
df_zeros = df[df['toxic'] == 0].sample(10000, random_state=123)
new_df = shuffle(pd.concat([df_ones] + [df_zeros]))

tokenized = new_df['text'].apply(lambda x: tokenizer.encode(x[:512], add_special_tokens=True))
max_len = max(map(len, tokenized))
padded = np.array([i + [0]*(max_len - len(i)) for i in tokenized.values])
attention_mask = np.where(padded != 0, 1, 0)

In [77]:
batch_size = 20 
embeddings = []
for i in notebook.tqdm(range(padded.shape[0] // batch_size)):
    batch = torch.cuda.LongTensor(padded[batch_size*i:batch_size*(i+1)])
    attention_mask_batch = torch.cuda.LongTensor(attention_mask[batch_size*i:batch_size*(i+1)]).to(device)
        
    with torch.no_grad():
        batch_embeddings = model(batch, attention_mask=attention_mask_batch)
        
    embeddings.append(batch_embeddings[0][:,0,:].cpu().detach().numpy())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [79]:
features_bert = np.concatenate(embeddings)
target_bert = new_df['toxic']
features_train_bert, features_test_bert, target_train_bert, target_test_bert = train_test_split(features_bert, target_bert, test_size=0.2, random_state=123)


F1 train: 0.8733
F1 test: 0.8404


In [83]:
model_lr = LogisticRegression(solver='liblinear', random_state=123455)
model_lr.fit(features_train_bert, target_train_bert)

pred_train = model_lr.predict(features_train_bert)
pred_test = model_lr.predict(features_test_bert)

print('F1 train: {:.4f}'.format(f1_score(target_train_bert, pred_train)))
print('F1 test: {:.4f}'.format(f1_score(target_test_bert, pred_test)))

F1 train: 0.8733
F1 test: 0.8404


In [82]:
params = {
    "loss" : ["hinge", "log", "squared_hinge", "modified_huber"],
    "alpha" : [0.0001, 0.001, 0.01, 0.1],
    "penalty" : ["l2", "l1", "none"],
}

modelsgd = SGDClassifier(max_iter=100,shuffle=True,random_state=12345)
grid_search_sgd = GridSearchCV(modelsgd, param_grid=params)
grid_search_sgd.fit(features_train_bert, target_train_bert)
print(grid_search_sgd.best_estimator_)

SGDClassifier(alpha=0.01, max_iter=100, random_state=12345)


In [86]:
sgd_bert = SGDClassifier(alpha=0.01, max_iter=100, random_state=12345)
sgd.fit(features_train_bert, target_train_bert)
predictions_sgd = sgd.predict(features_test_bert)
print("Matrix shape:", predictions_sgd.shape)
print("Matrix shape:", target_test.shape)
f1_sgd = f1_score(target_test_bert, predictions_sgd)
print("f1 BERT SGD:",f1_sgd)

Matrix shape: (4000,)
Matrix shape: (400,)
f1 sgd: 0.8484401866863179
