# 1. Data processing, Naive Bayes

In [310]:
import pandas as pd

In [320]:
df = pd.read_csv("dataset/train.tsv", sep="\t")

Импорт файла в датафрейм

In [321]:
df.head()

Unnamed: 0,title,is_fake
0,Москвичу Владимиру Клутину пришёл счёт за вмеш...,1
1,Агент Кокорина назвал езду по встречке житейск...,0
2,Госдума рассмотрит возможность введения секрет...,1
3,ФАС заблокировала поставку скоростных трамваев...,0
4,Против Навального завели дело о недоносительст...,1


In [322]:
df['title'].str.lower().str.split(expand=True).stack().value_counts().head(10)

в         2486
на        1244
с          500
и          494
за         427
по         381
россии     378
о          367
для        314
из         261
dtype: int64

In [323]:
banned = ['в', 'с', 'и', 'о', 'к', 'у', 'за', 'по', 'из', 'для',
          'от', 'не', 'из-за', 'на', 'до', 'об', 'во']

f = lambda x: ' '.join([item for item in x.lower().split() if item not in banned])

df['title'] = df['title'].apply(f)

In [324]:
df['title'].str.split(expand=True).stack().value_counts().head()

россии    378
сша       153
после      94
рублей     94
года       91
dtype: int64

In [325]:
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(df, test_size=0.2, random_state=42)

In [326]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import numpy as np

clf = Pipeline([('cnv', CountVectorizer()),
      ('tfidf', TfidfTransformer()),
      ('nb', MultinomialNB())])

clf.fit(X_train['title'], X_train['is_fake'])
predicted = clf.predict(X_test['title'])
print(classification_report(X_test['is_fake'], predicted))

print(np.mean(X_test['is_fake'] == predicted))

              precision    recall  f1-score   support

           0       0.85      0.80      0.82       562
           1       0.82      0.87      0.84       590

    accuracy                           0.83      1152
   macro avg       0.84      0.83      0.83      1152
weighted avg       0.84      0.83      0.83      1152

0.8342013888888888


In [327]:
from sklearn.model_selection import GridSearchCV

parameters_nb = {'cnv__ngram_range': [(1, 1), (1, 2)],
                 'tfidf__use_idf': (True, False),
                 'nb__alpha': (1.0, 1e-1, 1e-2, 1e-3, 0),
                 }

gs = GridSearchCV(clf, parameters_nb, scoring="accuracy", n_jobs=-1, cv=5, verbose=10,
                     return_train_score=True)

gs = gs.fit(df["title"], df["is_fake"])

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [328]:
gs.best_score_

0.8417862788396564

In [329]:
pd.DataFrame(gs.cv_results_)[["mean_test_score",
                                 "param_cnv__ngram_range",
                                 "param_tfidf__use_idf",
                                 "param_nb__alpha"]].sort_values(by=["mean_test_score"], ascending=False).head()

Unnamed: 0,mean_test_score,param_cnv__ngram_range,param_tfidf__use_idf,param_nb__alpha
13,0.841786,"(1, 2)",False,0.1
12,0.840225,"(1, 2)",True,0.1
3,0.839702,"(1, 1)",False,0.1
2,0.838661,"(1, 1)",True,0.1
0,0.837271,"(1, 1)",True,1.0


# 2. SVM

In [347]:
from sklearn.linear_model import SGDClassifier

clf_svm = Pipeline([('cnv', CountVectorizer()),
      ('tfidf', TfidfTransformer()),
      ('svm', SGDClassifier())])

parameters_svm = {'cnv__ngram_range': [(1, 1),(1, 2)],
                  'tfidf__use_idf': (True, False),
                  'svm__alpha': (1e-4, 1e-5, 1e-6),
                  'svm__class_weight': (None, 'balanced')
                  }

gs_svm = GridSearchCV(clf_svm, parameters_svm, scoring="accuracy", n_jobs=-1, cv=5, verbose=10,
                      return_train_score=True)
gs_svm = gs_svm.fit(df["title"], df["is_fake"])

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [348]:
gs_svm.best_score_

0.8350137863210735

In [349]:
pd.DataFrame(gs_svm.cv_results_)[["mean_test_score",
                                 "param_cnv__ngram_range",
                                 "param_tfidf__use_idf",
                                 "param_svm__alpha",
                                 "param_svm__class_weight"]].sort_values(by=["mean_test_score"], ascending=False).head()

Unnamed: 0,mean_test_score,param_cnv__ngram_range,param_tfidf__use_idf,param_svm__alpha,param_svm__class_weight
12,0.835014,"(1, 2)",True,0.0001,
14,0.834841,"(1, 2)",True,0.0001,balanced
18,0.830844,"(1, 2)",True,1e-05,balanced
16,0.830149,"(1, 2)",True,1e-05,
2,0.829109,"(1, 1)",True,0.0001,balanced


# 3. Stemming

In [350]:
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("russian")

df['stemmed'] = df['title'].apply(lambda x: [stemmer.stem(y) for y in x.split()])
df['stemmed'] = df['stemmed'].apply(lambda x: " ".join([i for i in x]))

parameters_nb = {'cnv__ngram_range': [(1,1), (1, 2), (2,2)],
                 'tfidf__use_idf': ([True, False]),
                 'nb__alpha': (2, 1, 0.1, 0.01,0.001),
                 }

gs_stemmed = GridSearchCV(clf, parameters_nb, scoring="accuracy", n_jobs=-1, cv=5, verbose=10,
                     return_train_score=True)

gs_stemmed = gs_stemmed.fit(df["stemmed"], df["is_fake"])

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [351]:
gs_stemmed.best_score_

0.8511645972101555

In [352]:
pd.DataFrame(gs_stemmed.cv_results_)[["mean_test_score",
                                 "param_cnv__ngram_range",
                                 "param_tfidf__use_idf",
                                 "param_nb__alpha"]].sort_values(by=["mean_test_score"], ascending=False).head()

Unnamed: 0,mean_test_score,param_cnv__ngram_range,param_tfidf__use_idf,param_nb__alpha
15,0.851165,"(1, 2)",False,0.1
2,0.851164,"(1, 1)",True,1.0
5,0.85099,"(1, 1)",False,0.1
12,0.85099,"(1, 2)",True,1.0
14,0.848907,"(1, 2)",True,0.1


In [353]:
parameters_nb = {'cnv__ngram_range': [(1,1), (1, 2), (2,2)],
                 'tfidf__use_idf': ([True, False]),
                 'nb__alpha': (1, 0.8, 0.6, 0.4, 0.2, 0.1),
                 }

gs_stemmed = GridSearchCV(clf, parameters_nb, scoring="accuracy", n_jobs=-1, cv=5, verbose=10,
                     return_train_score=True)

gs_stemmed = gs_stemmed.fit(df["stemmed"], df["is_fake"])

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [354]:
gs_stemmed.best_score_

0.8532479305434887

In [355]:
pd.DataFrame(gs_stemmed.cv_results_)[["mean_test_score",
                                 "param_cnv__ngram_range",
                                 "param_tfidf__use_idf",
                                 "param_nb__alpha"]].sort_values(by=["mean_test_score"], ascending=False).head()

Unnamed: 0,mean_test_score,param_cnv__ngram_range,param_tfidf__use_idf,param_nb__alpha
16,0.853248,"(1, 2)",True,0.6
19,0.853247,"(1, 2)",False,0.4
21,0.852207,"(1, 2)",False,0.2
14,0.852205,"(1, 2)",True,0.8
20,0.852033,"(1, 2)",True,0.2


# 4. Финальная модель

1. Удалить предлоги и союзы.
2. Стемминг. Для эффективного стемминга, необходимо убрать некоторые знаки пунктуации.
3. CountVectorizer(ngram_range=(1,2))
Tfidftransformer(use_idf=True)
MultinomialNB(alpha=0.6)

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from nltk.stem.snowball import SnowballStemmer


# removing unnecessary characters and stemming
def stemming(string: str):
    stop_words = ['в', 'с', 'и', 'о', 'к', 'у', 'за', 'по', 'из', 'для',
                  'от', 'не', 'из-за', 'на', 'до', 'об', 'во']
    stemmer = SnowballStemmer("russian")
    text = re.sub('[^A-Za-z0-9 а-яА-я+:]+', '', string.lower())
    text = [item for item in text.split() if item not in stop_words]
    text = [stemmer.stem(y) for y in text]
    text = ' '.join([i for i in text])
    return text

df = pd.read_csv("dataset/train.tsv", sep="\t")

# creating train and test datasets
X_train, X_test = train_test_split(df, test_size=0.2, random_state=42)

# training classifier
clf = Pipeline([('cnv', CountVectorizer(preprocessor=stemming, ngram_range=(1,2))),
      ('tfidf', TfidfTransformer(use_idf=False)),
      ('nb', MultinomialNB(alpha=0.4))])
clf.fit(X_train['title'], X_train['is_fake'])

Pipeline(steps=[('cnv',
                 CountVectorizer(ngram_range=(1, 2),
                                 preprocessor=<function stemming at 0x0000019A43E20DC0>)),
                ('tfidf', TfidfTransformer(use_idf=False)),
                ('nb', MultinomialNB(alpha=0.4))])

In [3]:
predicted = clf.predict(X_test['title'])
print(classification_report(X_test['is_fake'], predicted))
print(np.mean(X_test['is_fake'] == predicted))

              precision    recall  f1-score   support

           0       0.86      0.85      0.85       562
           1       0.86      0.87      0.86       590

    accuracy                           0.86      1152
   macro avg       0.86      0.86      0.86      1152
weighted avg       0.86      0.86      0.86      1152

0.859375


# 5. Обработка test.tsv

In [4]:
clf.fit(df['title'],df['is_fake'])

test = pd.read_csv("dataset/test.tsv", sep='\t')
predicted = clf.predict(test['title'])
test['is_fake'] = predicted

test.to_csv("predictions.tsv", index=False, sep="\t")