In [263]:
import re
import nltk
nltk.download('stopwords')
import pymorphy2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gensim
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing  import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vald_\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [248]:
# Shared random seed; passed as random_state to scikit-learn calls (e.g. the train/validation split).
SEED = 42

In [249]:
# Labelled training data, read from the current working directory.
Xi_train = pd.read_csv(filepath_or_buffer="train.csv")
Xi_train.head(n=5)

Unnamed: 0,id,url,title,target
0,0,m.kp.md,"Экс-министр экономики Молдовы - главе МИДЭИ, ц...",False
1,1,www.kp.by,Эта песня стала известна многим телезрителям б...,False
2,2,fanserials.tv,Банши 4 сезон 2 серия Бремя красоты смотреть о...,False
3,3,colorbox.spb.ru,Не Беси Меня Картинки,False
4,4,tula-sport.ru,В Новомосковске сыграют следж-хоккеисты алекси...,False


In [250]:
# Keep the labels in their own Series for fitting and scoring.
yi_train = Xi_train.loc[:, "target"]
yi_train.head(n=5)

0    False
1    False
2    False
3    False
4    False
Name: target, dtype: bool

In [251]:
# Unlabelled test data (no "target" column), read from the current working directory.
Xi_test = pd.read_csv(filepath_or_buffer="test.csv")
Xi_test.head(n=5)

Unnamed: 0,id,url,title
0,135309,www.kommersant.ru,Шестой кассационный суд в Самаре начнет работу...
1,135310,urexpert.online,"Что такое индексация алиментов, кем и в каких ..."
2,135311,imperimeha.ru,Женщинам | Империя Меха - Part 12
3,135312,national-porn.com,"Небритые, волосатые киски: Порно всех стран и ..."
4,135313,2gis.ru,67


In [253]:
# Single shared morphological analyzer; preprocessor() uses it to lemmatize each token.
lemmatizer = pymorphy2.MorphAnalyzer()

In [254]:
# NOTE(review): words() is called without a language, and the size printed by
# this cell (9380) suggests the stop lists of every bundled language are merged --
# confirm this is intended rather than just Russian/English.
stop_words = {*nltk.corpus.stopwords.words()}
len(stop_words)

9380

In [255]:
def preprocessor(text):
    """Normalize a raw title or URL into a space-separated string of lemmas.

    The text is lower-cased, every character outside ``а-я``, ``ё`` and ``a-z``
    is treated as a separator, each remaining token is lemmatized with the
    module-level ``lemmatizer`` (after folding "ё" to "е"), and lemmas found in
    the module-level ``stop_words`` are dropped.

    Args:
        text: Raw string. Non-string values (e.g. the float NaN pandas uses for
            an empty CSV cell, or None) are treated as empty text.

    Returns:
        The surviving lemmas joined by single spaces; "" when nothing is left.
    """
    # Missing cells arrive as NaN/None; calling .lower() on them would raise.
    if not isinstance(text, str):
        return ''

    # split() already collapses runs of whitespace, so no separate squeeze pass.
    tokens = re.sub("[^а-яёa-z]", ' ', text.lower()).split()

    lemmas = (
        lemmatizer.parse(token.replace("ё", "е"))[0].normal_form
        for token in tokens
    )
    return ' '.join(lemma for lemma in lemmas if lemma not in stop_words)

In [256]:
# Element-wise cleaning of the raw training titles.
Xi_train["title_clear"] = Xi_train["title"].map(preprocessor)

In [257]:
# Same title cleaning for the test set.
Xi_test["title_clear"] = Xi_test["title"].map(preprocessor)

In [258]:
# URLs go through the same cleaner, so dots and digits become token separators.
Xi_train["url_clear"] = Xi_train["url"].map(preprocessor)

In [259]:
# Same URL cleaning for the test set.
Xi_test["url_clear"] = Xi_test["url"].map(preprocessor)

In [260]:
def add_url(row):
    """Return the cleaned title followed by the cleaned URL, separated by one space.

    ``row`` is any mapping-like object (e.g. a DataFrame row) that exposes the
    ``"title_clear"`` and ``"url_clear"`` keys.
    """
    title_part = row["title_clear"]
    url_part = row["url_clear"]
    return title_part + " " + url_part

In [261]:
# Column-wise string concatenation (cleaned title, one space, cleaned URL);
# avoids calling a Python function once per row.
Xi_train["url+title"] = Xi_train["title_clear"] + " " + Xi_train["url_clear"]

In [262]:
# Column-wise string concatenation (cleaned title, one space, cleaned URL);
# avoids calling a Python function once per row.
Xi_test["url+title"] = Xi_test["title_clear"] + " " + Xi_test["url_clear"]

In [264]:
# Character n-gram counts over the cleaned "title + url" text, fed to a linear SVM.
model = Pipeline([
    (
        'vectorizer',
        CountVectorizer(
            lowercase=False,  # the text is already lower-cased by preprocessor()
            analyzer='char',
            ngram_range=(2, 6)
        )
    ),
    # Pin the classifier's RNG to the notebook seed so refits are repeatable.
    ('clf', LinearSVC(random_state=SEED))
])

In [276]:
# Hold out the trailing 25% of rows for validation (shuffle=False keeps file order).
# NOTE(review): random_state is documented as controlling the shuffling, so it
# presumably has no effect while shuffle=False -- confirm.
X_train, X_val, y_train, y_val = train_test_split(
    Xi_train,
    yi_train,
    train_size=0.75,
    shuffle=False,
    random_state=SEED,
)

In [277]:
# Fit on the training part of the split only; the held-out rows are scored separately.
model.fit(X=X_train["url+title"], y=y_train)



In [278]:
# F1 on the rows the model was fitted on.
f1_score(y_true=y_train, y_pred=model.predict(X_train["url+title"]))

0.9995995514976775

In [279]:
# F1 on the held-out validation rows.
f1_score(y_true=y_val, y_pred=model.predict(X_val["url+title"]))

0.986758916855541

In [265]:
# Refit the same pipeline on the complete labelled set before predicting the test set.
model.fit(X=Xi_train["url+title"], y=yi_train)



In [266]:
# Predicted labels for the test rows, in the row order of Xi_test.
test_features = Xi_test["url+title"]
yi_pred = model.predict(test_features)

In [267]:
# Take the ids from the test frame itself instead of assuming they start at a
# hard-coded value and are contiguous; to_numpy() avoids any index alignment.
res = pd.DataFrame({'id': Xi_test["id"].to_numpy(), 'target': yi_pred})

In [268]:
# Write the submission without the DataFrame index column.
res.to_csv(path_or_buf='result.csv', index=False)