In [2]:
import warnings
warnings.filterwarnings("ignore")

import re
import razdel
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gensim
import plotly
import pymorphy2
import nltk
nltk.download('stopwords')
from tqdm import tqdm_notebook
from sklearn.utils.class_weight import compute_class_weight
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize
from catboost import CatBoostClassifier
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score, classification_report

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nikitakalmackiy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
TOKEN_PATTERN = "[а-яё]+"

SEED = 42
np.random.seed(SEED)

In [4]:
train_df = pd.read_csv("Train data.csv")
test_df = pd.read_csv("All Cups IntroML 2024 Spring.csv")

In [5]:
train_df.sample(10)

Unnamed: 0,ID,url,title,label
54589,54589,www.kinotrast.com,Сериал Качели 2018 Беларусь Все серии смотреть...,0
41071,41071,pc01.ru,Шкаф от спального гарнитура Энрике - 37 000 ру...,0
19161,19161,forum.zoneofgames.ru,SWAT 3: Close Quarters Battle - Русификаторы -...,0
75429,75429,librebook.me,"Читать онлайн электронную книгу Прометей, или ...",0
121091,121091,meduniver.com,Как научить щедрости собственного мужчину и от...,0
35498,35498,sutochno.ru,"2-комнатная квартира посуточно, Ашкелон, Ха-На...",0
51325,51325,finalniiiepra.xyz,Заявка принята,0
82954,82954,nabugre.com,Виза в Китай в Гонконге: получение для россиян...,0
87724,87724,stavropol.hh.ru,"Вакансии компании Карт Бланш, ООО ПКФ",0
45185,45185,spb.hh.ru,"Вакансия Менеджер по туризму в Архангельске, р...",0


In [6]:
train_df.shape

(135309, 4)

In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135309 entries, 0 to 135308
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   ID      135309 non-null  int64 
 1   url     135309 non-null  object
 2   title   135309 non-null  object
 3   label   135309 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 4.1+ MB


In [8]:
train_df['label'].value_counts()

0    118594
1     16715
Name: label, dtype: int64

In [9]:
train_df.drop(columns=["ID"], inplace=True)

In [10]:
train_df.head()

Unnamed: 0,url,title,label
0,m.kp.md,"Экс-министр экономики Молдовы - главе МИДЭИ, ц...",0
1,www.kp.by,Эта песня стала известна многим телезрителям б...,0
2,fanserials.tv,Банши 4 сезон 2 серия Бремя красоты смотреть о...,0
3,colorbox.spb.ru,Не Беси Меня Картинки,0
4,tula-sport.ru,В Новомосковске сыграют следж-хоккеисты алекси...,0


In [11]:
model_pipeline = Pipeline([
    ("vectorizer", CountVectorizer()),
    ("model", LogisticRegression())
]
)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(train_df["title"], train_df["label"], train_size=0.75)

model_pipeline.fit(X_train, y_train)

In [13]:
f1_score(y_true=y_test, y_pred=model_pipeline.predict(X_test))

0.9426260112009957

### Попробуем учесть дизбаланс классов

In [14]:
class_weights = compute_class_weight('balanced', classes=np.unique(train_df["label"]), y=train_df["label"])

In [15]:
class_weights

array([0.57047152, 4.04753216])

In [16]:
new_pipeline = Pipeline([
    ("vectorizer", CountVectorizer()),
    ("model", LogisticRegression(class_weight=dict(enumerate(class_weights))))
]
)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(train_df["title"], train_df["label"], train_size=0.75)

new_pipeline.fit(X_train, y_train)

In [18]:
f1_score(y_true=y_test, y_pred=new_pipeline.predict(X_test))

0.9546279491833031

In [19]:
X_test = test_df["title"].values

test_df["label"] = new_pipeline.predict(X_test)

test_df[["ID", "label"]].to_csv("ml_baseline.csv", index=False)

!cat ml_baseline.csv | head

ID,label
135309,0
135310,0
135311,0
135312,1
135313,0
135314,0
135315,0
135316,0
135317,0
cat: stdout: Broken pipe


### Word2vec

Попытка сделать кластаризацию k-минсом на word2vec векторах. Не увенчалась успехом:(

In [32]:
y_true = train_df["label"]

In [33]:
corpus = train_df.title.values

In [34]:
lemmatizer = pymorphy2.MorphAnalyzer()

In [35]:
lemmatizer_cache = {}

def lemmatize(token):
    if lemmatizer.word_is_known(token):
        if token not in lemmatizer_cache:
            lemmatizer_cache[token] = lemmatizer.parse(token)[0].normal_form
        return lemmatizer_cache[token]
    return token

In [36]:
def tokenize(text):
    return re.findall(TOKEN_PATTERN, text.lower())

In [37]:
stopword_set = set(nltk.corpus.stopwords.words('russian'))

In [38]:
stopword_set

{'а',
 'без',
 'более',
 'больше',
 'будет',
 'будто',
 'бы',
 'был',
 'была',
 'были',
 'было',
 'быть',
 'в',
 'вам',
 'вас',
 'вдруг',
 'ведь',
 'во',
 'вот',
 'впрочем',
 'все',
 'всегда',
 'всего',
 'всех',
 'всю',
 'вы',
 'где',
 'да',
 'даже',
 'два',
 'для',
 'до',
 'другой',
 'его',
 'ее',
 'ей',
 'ему',
 'если',
 'есть',
 'еще',
 'ж',
 'же',
 'за',
 'зачем',
 'здесь',
 'и',
 'из',
 'или',
 'им',
 'иногда',
 'их',
 'к',
 'как',
 'какая',
 'какой',
 'когда',
 'конечно',
 'кто',
 'куда',
 'ли',
 'лучше',
 'между',
 'меня',
 'мне',
 'много',
 'может',
 'можно',
 'мой',
 'моя',
 'мы',
 'на',
 'над',
 'надо',
 'наконец',
 'нас',
 'не',
 'него',
 'нее',
 'ней',
 'нельзя',
 'нет',
 'ни',
 'нибудь',
 'никогда',
 'ним',
 'них',
 'ничего',
 'но',
 'ну',
 'о',
 'об',
 'один',
 'он',
 'она',
 'они',
 'опять',
 'от',
 'перед',
 'по',
 'под',
 'после',
 'потом',
 'потому',
 'почти',
 'при',
 'про',
 'раз',
 'разве',
 'с',
 'сам',
 'свою',
 'себе',
 'себя',
 'сейчас',
 'со',
 'совсем',
 'так

In [31]:
def prepare_sentence_dataset(documents):
    tokenized_sentences = []
    for document in tqdm_notebook(documents):
        for sentence in razdel.sentenize(document):
            lemmatized_tokens = [lemmatize(token) for token in tokenize(sentence.text)]
            tokenized_sentences.append(
                [token for token in lemmatized_tokens if token not in stopword_set]
            )
    return tokenized_sentences

sentence_dataset = prepare_sentence_dataset(corpus)

  0%|          | 0/135309 [00:00<?, ?it/s]

In [39]:
sentence_dataset[0]

['экс',
 'министр',
 'экономика',
 'молдова',
 'глава',
 'мидэи',
 'цель',
 'который',
 'сделать',
 'республика',
 'проситель',
 'донор',
 'избегать',
 'долгий',
 'нахождение',
 'н']

In [40]:
word2vec = gensim.models.Word2Vec(
    vector_size=100, sg=0, window=5, min_count=5, negative=20
)

In [41]:
word2vec.build_vocab(sentence_dataset)

In [42]:
word2vec.train(sentence_dataset, total_examples=word2vec.corpus_count, epochs=30)

(19824770, 24054810)

In [43]:
word2vec.wv.most_similar('порно')

[('оргия', 0.7556273341178894),
 ('анал', 0.7350456118583679),
 ('трах', 0.7266899943351746),
 ('азиатка', 0.7214621901512146),
 ('рогоносец', 0.7190147638320923),
 ('групповуха', 0.7180072665214539),
 ('лесбиянка', 0.7156546711921692),
 ('лесби', 0.7151614427566528),
 ('порнуха', 0.7144322991371155),
 ('пизда', 0.7116526961326599)]

In [44]:
def text_to_vector(text):
    tokens = nltk.tokenize.word_tokenize(text.lower())
    vectors = []
    for token in tokens:
        if token in word2vec.wv:
            vectors.append(word2vec.wv[token])
    if vectors:
        return sum(vectors) / len(vectors)
    else:
        return [0] * word2vec.vector_size

In [45]:
train_df['title'].iloc[0]

'Экс-министр экономики Молдовы - главе МИДЭИ, цель которого сделать из республики не просителя, а донора: Надо избегать долгого нахождения н�'

In [47]:
train_df

Unnamed: 0,url,title,label
0,m.kp.md,"Экс-министр экономики Молдовы - главе МИДЭИ, ц...",0
1,www.kp.by,Эта песня стала известна многим телезрителям б...,0
2,fanserials.tv,Банши 4 сезон 2 серия Бремя красоты смотреть о...,0
3,colorbox.spb.ru,Не Беси Меня Картинки,0
4,tula-sport.ru,В Новомосковске сыграют следж-хоккеисты алекси...,0
...,...,...,...
135304,mail.ru,пора тюльпанов турецкий сериал на русском язык...,0
135305,www.ntv.ru,Остросюжетный сериал «Шеф. Игра на повышение»....,0
135306,topclassiccarsforsale.com,"1941 Plymouth Special Deluxe Hot Rod, Automati...",0
135307,wowcream.ru,Купить It's Skin Сыворотка питательная Power 1...,0


In [48]:
from sklearn.cluster import KMeans

# Функция для преобразования заголовка в вектор
def title_to_vector(title, model):
    words = title.split()
    vector = np.zeros((100,))
    count = 0
    for word in words:
        if word in model.wv:
            vector += model.wv[word]
            count += 1
    if count != 0:
        vector /= count
    return vector

# Преобразование каждого заголовка в вектор и добавление его в датасет
vectors = []
for title in train_df['title']:
    vector = title_to_vector(title, word2vec)
    vectors.append(vector)

# Удаление столбца "title" из датасета
train_df.drop(columns=['title'], inplace=True)

# Создание новых столбцов в датасете для значений векторов
vector_columns = ['vector_' + str(i) for i in range(100)]
vectors_df = pd.DataFrame(vectors, columns=vector_columns)
train_df = pd.concat([train_df, vectors_df], axis=1)

In [49]:
train_df

Unnamed: 0,url,label,vector_0,vector_1,vector_2,vector_3,vector_4,vector_5,vector_6,vector_7,...,vector_90,vector_91,vector_92,vector_93,vector_94,vector_95,vector_96,vector_97,vector_98,vector_99
0,m.kp.md,0,0.676041,0.065785,-0.191820,-0.433014,-0.393672,0.020739,0.927764,0.663100,...,-1.078901,-0.502867,-0.817467,-0.037662,0.599401,-0.370866,0.057723,-0.624762,0.088301,0.689797
1,www.kp.by,0,-0.581977,0.543200,1.510469,0.695105,-0.726763,-0.373752,1.814184,0.662512,...,0.106814,0.048622,-0.339605,-1.216942,1.045872,0.571090,-0.260090,-1.063345,-0.281846,-1.297636
2,fanserials.tv,0,2.229805,-0.413835,-1.127161,0.115124,1.205634,0.614312,1.236800,-0.812255,...,-0.233098,0.288134,-0.561879,1.021920,-0.328090,-0.133717,-1.109038,-0.977852,3.499163,0.901050
3,colorbox.spb.ru,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,tula-sport.ru,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135304,mail.ru,0,1.272296,-0.825290,-0.470489,0.128453,0.002588,0.629224,0.143605,-0.599786,...,-0.366811,-0.832376,-0.716082,0.668852,0.313148,1.754275,-0.543300,-2.722169,1.422642,0.395784
135305,www.ntv.ru,0,1.566752,-0.224173,-2.152102,-0.159712,1.193683,1.459856,-0.161596,-0.345939,...,-0.661160,-0.063762,-0.876410,1.069943,0.537569,1.370544,0.255585,-1.475005,3.187026,-0.852214
135306,topclassiccarsforsale.com,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
135307,wowcream.ru,0,-2.809181,-0.327173,0.932539,-0.124244,-1.552274,-1.682815,-1.856069,2.610056,...,-0.199786,0.759500,-0.067844,0.513919,0.847635,-0.998376,-0.586462,-1.002704,-1.186508,1.992705


In [50]:
train_df.drop(columns=['url'], inplace=True)
train_df.drop(columns=['label'], inplace=True)

In [51]:
train_df

Unnamed: 0,vector_0,vector_1,vector_2,vector_3,vector_4,vector_5,vector_6,vector_7,vector_8,vector_9,...,vector_90,vector_91,vector_92,vector_93,vector_94,vector_95,vector_96,vector_97,vector_98,vector_99
0,0.676041,0.065785,-0.191820,-0.433014,-0.393672,0.020739,0.927764,0.663100,1.164986,-0.681825,...,-1.078901,-0.502867,-0.817467,-0.037662,0.599401,-0.370866,0.057723,-0.624762,0.088301,0.689797
1,-0.581977,0.543200,1.510469,0.695105,-0.726763,-0.373752,1.814184,0.662512,-3.549309,-2.136065,...,0.106814,0.048622,-0.339605,-1.216942,1.045872,0.571090,-0.260090,-1.063345,-0.281846,-1.297636
2,2.229805,-0.413835,-1.127161,0.115124,1.205634,0.614312,1.236800,-0.812255,-3.833474,-0.251556,...,-0.233098,0.288134,-0.561879,1.021920,-0.328090,-0.133717,-1.109038,-0.977852,3.499163,0.901050
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135304,1.272296,-0.825290,-0.470489,0.128453,0.002588,0.629224,0.143605,-0.599786,-1.868735,-0.552414,...,-0.366811,-0.832376,-0.716082,0.668852,0.313148,1.754275,-0.543300,-2.722169,1.422642,0.395784
135305,1.566752,-0.224173,-2.152102,-0.159712,1.193683,1.459856,-0.161596,-0.345939,-2.653684,-0.064280,...,-0.661160,-0.063762,-0.876410,1.069943,0.537569,1.370544,0.255585,-1.475005,3.187026,-0.852214
135306,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
135307,-2.809181,-0.327173,0.932539,-0.124244,-1.552274,-1.682815,-1.856069,2.610056,0.914322,-1.271094,...,-0.199786,0.759500,-0.067844,0.513919,0.847635,-0.998376,-0.586462,-1.002704,-1.186508,1.992705


In [53]:
kmeans = KMeans(n_clusters=2)

kmeans.fit(train_df)

# Предсказание и сохранение меток
predictions = kmeans.predict(train_df)
train_df['predicted_label'] = predictions

# Результат
print(train_df.head())

   vector_0  vector_1  vector_2  vector_3  vector_4  vector_5  vector_6  \
0  0.676041  0.065785 -0.191820 -0.433014 -0.393672  0.020739  0.927764   
1 -0.581977  0.543200  1.510469  0.695105 -0.726763 -0.373752  1.814184   
2  2.229805 -0.413835 -1.127161  0.115124  1.205634  0.614312  1.236800   
3  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
4  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   

   vector_7  vector_8  vector_9  ...  vector_91  vector_92  vector_93  \
0  0.663100  1.164986 -0.681825  ...  -0.502867  -0.817467  -0.037662   
1  0.662512 -3.549309 -2.136065  ...   0.048622  -0.339605  -1.216942   
2 -0.812255 -3.833474 -0.251556  ...   0.288134  -0.561879   1.021920   
3  0.000000  0.000000  0.000000  ...   0.000000   0.000000   0.000000   
4  0.000000  0.000000  0.000000  ...   0.000000   0.000000   0.000000   

   vector_94  vector_95  vector_96  vector_97  vector_98  vector_99  \
0   0.599401  -0.370866   0.057723  -0.

In [54]:
train_df

Unnamed: 0,vector_0,vector_1,vector_2,vector_3,vector_4,vector_5,vector_6,vector_7,vector_8,vector_9,...,vector_91,vector_92,vector_93,vector_94,vector_95,vector_96,vector_97,vector_98,vector_99,predicted_label
0,0.676041,0.065785,-0.191820,-0.433014,-0.393672,0.020739,0.927764,0.663100,1.164986,-0.681825,...,-0.502867,-0.817467,-0.037662,0.599401,-0.370866,0.057723,-0.624762,0.088301,0.689797,0
1,-0.581977,0.543200,1.510469,0.695105,-0.726763,-0.373752,1.814184,0.662512,-3.549309,-2.136065,...,0.048622,-0.339605,-1.216942,1.045872,0.571090,-0.260090,-1.063345,-0.281846,-1.297636,1
2,2.229805,-0.413835,-1.127161,0.115124,1.205634,0.614312,1.236800,-0.812255,-3.833474,-0.251556,...,0.288134,-0.561879,1.021920,-0.328090,-0.133717,-1.109038,-0.977852,3.499163,0.901050,1
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135304,1.272296,-0.825290,-0.470489,0.128453,0.002588,0.629224,0.143605,-0.599786,-1.868735,-0.552414,...,-0.832376,-0.716082,0.668852,0.313148,1.754275,-0.543300,-2.722169,1.422642,0.395784,1
135305,1.566752,-0.224173,-2.152102,-0.159712,1.193683,1.459856,-0.161596,-0.345939,-2.653684,-0.064280,...,-0.063762,-0.876410,1.069943,0.537569,1.370544,0.255585,-1.475005,3.187026,-0.852214,1
135306,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
135307,-2.809181,-0.327173,0.932539,-0.124244,-1.552274,-1.682815,-1.856069,2.610056,0.914322,-1.271094,...,0.759500,-0.067844,0.513919,0.847635,-0.998376,-0.586462,-1.002704,-1.186508,1.992705,0


In [55]:
f1_score(y_true=y_true, y_pred=predictions)

0.25634935629267763