In [1]:
import scipy.sparse
import json
import string
import pymorphy2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

from tqdm import tqdm_notebook
from multiprocessing import Pool, cpu_count

In [2]:
# Read the preprocessed item table, using the `itemId` column as the row index,
# and preview its first rows.
items = pd.read_csv(
    filepath_or_buffer='../data/processed/processed_items.csv',
    index_col='itemId',
)
items.head()

Unnamed: 0_level_0,content,image,title
itemId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,согласиться дорогой любитель собака до что же ...,[-0.169 0.129 0.067 0.019 0.281 -0.245 0....,пять забавный морщинистый порода собака
1,контур три поперечный улица состоять до недавн...,[-0.158 -0.112 -0.325 0.05 -0.114 0.002 -0....,история улица ирининский в гомель
2,источник http infodays ru вообще он как то сам...,[ 0.084 -0.181 0.008 0.34 -0.03 -0.197 -0....,зачем дудь весь время спрашивать гость програм...
3,41 летний светлана зейналов решить окрестить 5...,[ 0.034 -0.119 -0.062 0.025 0.128 -0.041 0....,светлана зейналов крестить младший дочь
4,организовать преступный группировка гбао делат...,[-0.061 -0.015 -0.198 -0.047 0.054 0.029 -0....,гкнб бандит в гбао делать вид что расстаться с...


In [3]:
import gensim.models.keyedvectors as word2vec
import gc

In [4]:
# Path to the pretrained word2vec model stored in binary format.
path = '../data/external/ruwikiruscorpora_upos_skipgram_300_2_2019/model.bin'
# Load the vectors as gensim KeyedVectors.
w2v_model = word2vec.KeyedVectors.load_word2vec_format(fname=path, binary=True)

In [5]:
# Extend the module search path with the project's feature directory so that
# the local `w2v_stemmer` module (which provides `tag`) can be imported.
import sys
sys.path.append('../src/features')
from w2v_stemmer import tag

In [6]:
# Copy every vocabulary vector of the word2vec model into a plain dict
# keyed by vocabulary token.
embeddings_index = {
    token: w2v_model[token] for token in tqdm_notebook(w2v_model.vocab)
}
print('Loaded {} word vectors.'.format(len(embeddings_index)))

gc.collect()
# Per-dimension mean and standard deviation over all vectors; they parameterise
# the normal distribution used for fallback embeddings further down.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean = all_embs.mean(axis=0)
emb_std = all_embs.std(axis=0)

HBox(children=(IntProgress(value=0, max=248978), HTML(value='')))


Loaded 248978 word vectors.


In [7]:
import nltk
nltk.download("stopwords")
# Fetch the NLTK stop-word package first (the call reports when it is already
# up to date), then import the corpus reader used to build the stop-word list.

from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/valeriy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
russian_stopwords = stopwords.words("russian")
# Frozen-set view of the list above so each membership test is O(1).
_russian_stopwords_set = frozenset(russian_stopwords)


def _random_embedding() -> np.ndarray:
    """Draw a fallback vector from the per-dimension normal fit (emb_mean, emb_std)."""
    return np.random.normal(emb_mean, emb_std, w2v_model.vector_size)


def get_embedding(label: str) -> np.ndarray:
    """Return one embedding vector for a whitespace-separated label.

    The label is split on whitespace, Russian stop words are dropped, and the
    remaining words are mapped to word2vec vectors that are averaged per
    dimension.

    Parameters
    ----------
    label : str or float
        Text to embed. A float (e.g. NaN from a missing pandas value) is
        treated as "no text".

    Returns
    -------
    np.ndarray
        Vector of length ``w2v_model.vector_size``. When there is no usable
        text, a random vector drawn from ``N(emb_mean, emb_std)`` is returned;
        individual words that cannot be looked up contribute such a random
        vector to the average.
    """
    if isinstance(label, float):
        return _random_embedding()

    words = [word for word in label.split() if word not in _russian_stopwords_set]
    if not words:
        return _random_embedding()

    label_embeddings = np.zeros((len(words), w2v_model.vector_size), dtype=np.float32)

    for i, word in enumerate(words):
        # The model is indexed with the *tagged* token, so membership has to be
        # checked on that token rather than on the raw word; otherwise the
        # lookup can raise KeyError or a known word is needlessly randomised.
        # NOTE(review): assumes `tag` returns the vocabulary key (or a falsy
        # value when it cannot tag the word) - confirm against w2v_stemmer.
        tagged = tag(word)
        if tagged and tagged in w2v_model:
            label_embeddings[i, :] = w2v_model[tagged]
        else:
            label_embeddings[i, :] = _random_embedding()

    # Average over words (axis 0) to keep one value per embedding dimension;
    # a bare .mean() would collapse the whole matrix into a single scalar.
    return label_embeddings.mean(axis=0)
            

def get_embeddings(series: pd.Series):
    """Embed every label of a Series into a dense matrix.

    Parameters
    ----------
    series : pd.Series
        Labels to embed; each value is passed to ``get_embedding``.

    Returns
    -------
    np.ndarray
        float32 array of shape ``(len(series), w2v_model.vector_size)`` whose
        row ``k`` is the embedding of the ``k``-th value of ``series``.
    """
    output = np.zeros((len(series), w2v_model.vector_size), dtype=np.float32)
    # Rows are addressed by position, not by index label: `series.items()`
    # yields labels, which are only valid row numbers for a 0..n-1 index.
    for row, label in enumerate(tqdm_notebook(series, total=len(series))):
        output[row, :] = get_embedding(label)
    return output

In [9]:
# List the notebook directory; the next cell loads a file from `hse_iad_competition/`.
!ls

hse_iad_competition	      lin_reg_embedding_text_sif.ipynb
lightfm_embedding_text.ipynb  neural_collaborative_filtering.ipynb
linear_recommender.ipynb      perfect_solution_sort.ipynb
lin_reg_embedding_text.ipynb


In [13]:
# Load the precomputed item feature matrix from its SciPy sparse .npz file.
# NOTE(review): the variable is called `title_embeddings` while the file is
# `item_features_embedding.npz` - confirm it holds title embeddings.
title_embeddings = scipy.sparse.load_npz(
    file='hse_iad_competition/item_features_embedding.npz'
)

In [17]:
# Load the benchmark file and report how many distinct users it contains.
sample = pd.read_csv(filepath_or_buffer='../data/external/random_benchmark.csv')
sample['userId'].unique().size

4349

In [18]:
# Distinct user ids that appear in the benchmark file, and how many there are.
users_to_pred = sample['userId'].unique()
len(users_to_pred)

4349

In [19]:
# Build per-user training sets: train_data[user] = {'X': item ids, 'y': labels in {-1, +1}}.
train_data = {}

train_path = '../data/interim/train.json'

# Count lines for the progress bar inside a context manager so the handle is closed.
with open(train_path, 'r') as train_file:
    train_lines = sum(1 for _ in train_file)

# Set of plain ints: O(1) membership per line instead of scanning the numpy array.
wanted_users = {int(user) for user in users_to_pred}

with open(train_path) as train_file:
    for i, line in enumerate(tqdm_notebook(train_file, total=train_lines)):
        # NOTE(review): the 0-based line number is used as the user id - confirm
        # that train.json is ordered that way.
        if i not in wanted_users:
            continue
        ratings = json.loads(line)['trainRatings']
        # Local names deliberately avoid `items`, which would overwrite the
        # item DataFrame loaded at the top of the notebook.
        train_data[i] = {
            'X': [int(item_id) for item_id in ratings],
            # Map the rating r (parsed as int) to 2*r - 1, i.e. 0 -> -1 and 1 -> +1.
            'y': [2 * int(rating) - 1 for rating in ratings.values()],
        }

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=42977.0), HTML(value='')))




In [20]:
def learn_logistic_regression(userId, sample):
    """Fit a per-user logistic regression and score that user's candidate items.

    The model is trained on the rows of ``title_embeddings`` selected by the
    user's rated items in ``train_data[userId]`` and then applied to the items
    listed for the user in ``sample``.

    Parameters
    ----------
    userId
        Key into ``train_data``; also matched against ``sample.userId``.
    sample : pd.DataFrame
        Frame with ``userId`` and ``itemId`` columns. It is modified in place:
        the ``pred`` values of this user's rows are overwritten.

    Returns
    -------
    None
    """
    user_train = train_data[userId]
    X_train = title_embeddings[user_train['X']]
    y_train = np.array(user_train['y'])

    model = LogisticRegression(random_state=42, C=0.01)
    model.fit(X_train, y_train)

    # Build the boolean row mask once; it spans the whole frame and was
    # previously recomputed for the read and again for the write.
    user_rows = sample.userId == userId
    items_to_predict = sample.loc[user_rows, 'itemId']
    # Column 1 of predict_proba belongs to model.classes_[1], the larger label.
    scores = model.predict_proba(title_embeddings[items_to_predict])[:, 1]
    sample.loc[user_rows, 'pred'] = scores

In [21]:
# Start every row with a float score of zero; the per-user models fill it in later.
sample['pred'] = 0.0
sample.head()

Unnamed: 0,userId,itemId,pred
0,1,242249,0.0
1,1,117266,0.0
2,1,123441,0.0
3,1,148575,0.0
4,1,155695,0.0


In [22]:
import warnings
warnings.filterwarnings("ignore")

In [23]:
# Train one logistic regression per user and write its scores into `sample`.
for user_id in tqdm_notebook(users_to_pred):
    learn_logistic_regression(userId=user_id, sample=sample)

HBox(children=(FloatProgress(value=0.0, max=4349.0), HTML(value='')))




In [24]:
# Summary statistics of the frame, including the `pred` scores filled in by the loop above.
sample.describe()

Unnamed: 0,userId,itemId,pred
count,3018186.0,3018186.0,3018186.0
mean,21511.73,163282.3,0.09016364
std,12126.6,94540.55,0.07101892
min,1.0,3.0,0.009543281
25%,10987.0,80790.0,0.03985582
50%,21584.0,163792.0,0.06875905
75%,31738.0,244955.0,0.1167542
max,42975.0,328049.0,0.5056871


In [25]:
# Order each user's rows by descending score and write the result without the
# score column. The ranked frame is a new object: `sample` keeps its `pred`
# column, so this cell can be re-run without a KeyError on a dropped column.
submission = (
    sample
    .sort_values(['userId', 'pred'], ascending=[True, False])
    .drop(columns=['pred'])
)
submission.to_csv('../predictions/log_reg_embedding_title_sif.csv', index=False)

In [26]:
# Submit the logistic-regression prediction file written above via the Kaggle CLI.
!kaggle competitions submit -c 2018-hse-ml-competition-04 -f ../predictions/log_reg_embedding_title_sif.csv -m "Log reg title embeddings (w2v) submission №1"

100%|██████████████████████████████████████| 35.7M/35.7M [00:07<00:00, 5.12MB/s]
Successfully submitted to Рекомендательная система для статей

In [91]:
from sklearn.svm import SVC

def learn_svm(userId, sample):
    """Fit a per-user SVM classifier and score that user's candidate items.

    The model is trained on the rows of ``title_embeddings`` selected by the
    user's rated items in ``train_data[userId]`` and then applied to the items
    listed for the user in ``sample``.

    Parameters
    ----------
    userId
        Key into ``train_data``; also matched against ``sample.userId``.
    sample : pd.DataFrame
        Frame with ``userId`` and ``itemId`` columns. It is modified in place:
        the ``pred`` values of this user's rows are written.

    Returns
    -------
    None
    """
    user_train = train_data[userId]
    X_train = title_embeddings[user_train['X']]
    y_train = np.array(user_train['y'])

    # probability=True is required for predict_proba to be available on SVC.
    model = SVC(random_state=42, probability=True)
    model.fit(X_train, y_train)

    # Build the boolean row mask once; it spans the whole frame and was
    # previously recomputed for the read and again for the write.
    user_rows = sample.userId == userId
    items_to_predict = sample.loc[user_rows, 'itemId']
    # Column 1 of predict_proba belongs to model.classes_[1], the larger label.
    scores = model.predict_proba(title_embeddings[items_to_predict])[:, 1]
    sample.loc[user_rows, 'pred'] = scores

In [92]:
# Train one SVM per user and write its scores into `sample`.
for user_id in tqdm_notebook(users_to_pred):
    learn_svm(userId=user_id, sample=sample)

HBox(children=(IntProgress(value=0, max=4349), HTML(value='')))

In [93]:
# Order each user's rows by descending score and write the result without the
# score column. The ranked frame is a new object: `sample` keeps its `pred`
# column, so this cell can be re-run without a KeyError on a dropped column.
submission = (
    sample
    .sort_values(['userId', 'pred'], ascending=[True, False])
    .drop(columns=['pred'])
)
submission.to_csv('../predictions/svm_embedding_text_submission.csv', index=False)

In [96]:
# Submit the SVM prediction file written above via the Kaggle CLI.
# NOTE(review): the message says "title embeddings" while the file name says
# "embedding_text" - confirm which features this run actually used.
!kaggle competitions submit -c 2018-hse-ml-competition-04 -f ../predictions/svm_embedding_text_submission.csv -m "SVM title embeddings (w2v) submission"

100%|██████████████████████████████████████| 35.7M/35.7M [00:06<00:00, 5.58MB/s]
Successfully submitted to Рекомендательная система для статей