In [1]:
import scipy.sparse
import json
import string
import pymorphy2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer

from tqdm import tqdm_notebook
from multiprocessing import Pool, cpu_count

In [2]:
# Load the preprocessed item table, indexed by itemId.
# The preview below shows three text/feature columns: content, image, title.
items = pd.read_csv('../data/processed/processed_items.csv', index_col='itemId')
items.head()

Unnamed: 0_level_0,content,image,title
itemId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,согласиться дорогой любитель собака до что же ...,[-0.169 0.129 0.067 0.019 0.281 -0.245 0....,пять забавный морщинистый порода собака
1,контур три поперечный улица состоять до недавн...,[-0.158 -0.112 -0.325 0.05 -0.114 0.002 -0....,история улица ирининский в гомель
2,источник http infodays ru вообще он как то сам...,[ 0.084 -0.181 0.008 0.34 -0.03 -0.197 -0....,зачем дудь весь время спрашивать гость програм...
3,41 летний светлана зейналов решить окрестить 5...,[ 0.034 -0.119 -0.062 0.025 0.128 -0.041 0....,светлана зейналов крестить младший дочь
4,организовать преступный группировка гбао делат...,[-0.061 -0.015 -0.198 -0.047 0.054 0.029 -0....,гкнб бандит в гбао делать вид что расстаться с...


In [3]:
import gensim.models.keyedvectors as word2vec
import gc

In [4]:
# Pre-trained word vectors stored in binary word2vec format.
# NOTE(review): the directory name suggests a skip-gram model with UPOS-tagged
# tokens and 300 dimensions -- confirm against the model's own documentation.
path = '../data/external/ruwikiruscorpora_upos_skipgram_300_2_2019/model.bin'
w2v_model = word2vec.KeyedVectors.load_word2vec_format(path, binary=True)

In [5]:
import sys
# Put the project's feature-engineering directory on the import path so the
# local `w2v_stemmer` module (which provides `tag`) can be imported.
sys.path.append('../src/features')
from w2v_stemmer import tag

In [7]:
# Build a word -> vector lookup and per-dimension statistics of the whole
# vocabulary. The statistics parameterise the random vectors used for
# out-of-vocabulary words.
#
# gensim >= 4 exposes the vocabulary as `key_to_index`; earlier releases only
# have `vocab` (removed in 4.0), so try the new attribute first.
vocab_keys = getattr(w2v_model, 'key_to_index', None)
if vocab_keys is None:
    vocab_keys = w2v_model.vocab

embeddings_index = {word: w2v_model[word] for word in tqdm_notebook(vocab_keys)}
print('Loaded {} word vectors.'.format(len(embeddings_index)))

gc.collect()
# Stack into a (n_words, vector_size) matrix to take column-wise mean / std.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(axis=0), all_embs.std(axis=0)

HBox(children=(IntProgress(value=0, max=248978), HTML(value='')))


Loaded 248978 word vectors.


In [10]:
import nltk
# Make sure the NLTK stop-word corpus is available locally before importing it.
nltk.download("stopwords")
#--------#

from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/valeriy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Russian stop words from NLTK; get_embedding() drops these before averaging word vectors.
russian_stopwords = stopwords.words("russian")

def get_embedding(label: str) -> np.ndarray:
    """Embed a title as the mean of the word2vec vectors of its words.

    Stop words (``russian_stopwords``) are dropped first. Every remaining word
    is passed through ``tag`` and the result is looked up in ``w2v_model``.
    Words without a usable vector are replaced by a random vector drawn from
    ``N(emb_mean, emb_std)``.

    Parameters
    ----------
    label : str
        Whitespace-separated title text. Any non-string value (for example
        the float NaN that pandas uses for a missing title) is treated as a
        missing title.

    Returns
    -------
    np.ndarray
        1-D vector of length ``w2v_model.vector_size``. A missing title, or a
        title made only of stop words, yields a single random vector.
    """
    dim = w2v_model.vector_size

    def random_vector() -> np.ndarray:
        # Unseeded draw from NumPy's global RNG: results differ between runs.
        return np.random.normal(emb_mean, emb_std, dim)

    if not isinstance(label, str):
        return random_vector()

    words = [word for word in label.split() if word not in russian_stopwords]
    if not words:
        return random_vector()

    label_embeddings = np.zeros((len(words), dim), dtype=np.float32)
    for i, word in enumerate(words):
        tagged = tag(word)
        # Test membership on the token that is actually looked up. Checking
        # the raw word while indexing with the tagged one either rejects
        # known words or raises KeyError.
        if tagged and tagged in w2v_model:
            label_embeddings[i, :] = w2v_model[tagged]
        else:
            label_embeddings[i, :] = random_vector()

    # Average over the word axis; a bare .mean() would collapse the whole
    # matrix into one scalar.
    return label_embeddings.mean(axis=0)
            

def get_embeddings(series: pd.Series):
    """Embed every label of ``series`` with ``get_embedding``.

    Parameters
    ----------
    series : pd.Series
        Labels (titles) to embed; missing values are allowed.

    Returns
    -------
    np.ndarray
        float32 array of shape ``(len(series), w2v_model.vector_size)`` whose
        k-th row is the embedding of the k-th element of ``series``.
    """
    output = np.zeros((len(series), w2v_model.vector_size), dtype=np.float32)
    # Address rows by position, not by index label: the index is not
    # guaranteed to be 0..n-1 (it may be filtered, shuffled or non-integer).
    labelled = tqdm_notebook(series.items(), total=len(series))
    for position, (_, label) in enumerate(labelled):
        output[position, :] = get_embedding(label)
    return output

In [12]:
# One embedding row per item title; the shape is displayed as a sanity check.
title_embeddings = get_embeddings(items['title'])
title_embeddings.shape

HBox(children=(IntProgress(value=0, max=328050), HTML(value='')))




(328050, 300)

In [13]:
# Item feature matrix: a per-item indicator (identity) block followed by the
# dense title-embedding block. Everything is built in float32: the embeddings
# already are float32, and this avoids materialising a float64 copy of the
# large dense block.
item_features = scipy.sparse.hstack([
    scipy.sparse.eye(len(items), dtype=np.float32), title_embeddings
], format='csr', dtype=np.float32)

In [14]:
# Build the user x item interaction matrix from the line-delimited JSON file:
# one JSON object per line (one row per line), with a 'trainRatings' mapping
# of item id -> rating.
TRAIN_PATH = '../data/interim/train.json'

data = []
row = []
col = []

# First pass only counts lines so the progress bar has a total. The context
# manager guarantees the handle is closed.
with open(TRAIN_PATH, 'r') as train_file:
    train_lines = sum(1 for _ in train_file)

with open(TRAIN_PATH) as train_file:
    for i, line in enumerate(tqdm_notebook(train_file, total=train_lines)):
        json_line = json.loads(line)
        for item, rating in json_line['trainRatings'].items():
            # Maps rating 1 -> +1 and rating 0 -> -1.
            data.append((-1) ** (int(rating) + 1))
            row.append(i)
            col.append(int(item))
# Shape is inferred from the largest row / column index seen.
train_int = scipy.sparse.coo_matrix((data, (row, col)))
del data, row, col

HBox(children=(IntProgress(value=0, max=42977), HTML(value='')))




In [15]:
import lightfm

In [16]:
# LightFM model with 32 latent components and logistic loss, seeded for
# repeatability. It is trained for 10 epochs on the interaction matrix using
# the item feature matrix built above and all available CPU cores.
model = lightfm.LightFM(no_components=32, loss="logistic", random_state=42)
model.fit(train_int, epochs=10, num_threads=cpu_count(), item_features=item_features, verbose=True)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9


<lightfm.lightfm.LightFM at 0x7f3c4c6e0780>

In [17]:
# Score every (userId, itemId) pair of the benchmark file, order each user's
# items from highest to lowest score, and write the ranking without the
# helper score column.
sample = pd.read_csv('../data/external/random_benchmark.csv')
sample = (
    sample
    .assign(pred=model.predict(
        sample.userId.values,
        sample.itemId.values,
        item_features=item_features,
        num_threads=cpu_count(),
    ))
    .sort_values(by=['userId', 'pred'], ascending=[True, False])
    .drop(columns=['pred'])
)
sample.to_csv('../predictions/lightfm_title_embeddings3.csv', index=False)

In [18]:
import os
from getpass import getpass

# SECURITY: never store an API key in the notebook -- it ends up in version
# control and in every shared copy. Any key that was ever committed here must
# be treated as compromised and revoked in the Kaggle account settings.
#
# Credentials are taken from the environment when already present; otherwise
# the key is requested interactively and is never echoed or saved.
os.environ.setdefault('KAGGLE_USERNAME', "klyukinv")
if not os.environ.get('KAGGLE_KEY'):
    os.environ['KAGGLE_KEY'] = getpass('Kaggle API key: ')

In [19]:
# Upload the prediction file written above to the competition via the Kaggle CLI.
!kaggle competitions submit -c 2018-hse-ml-competition-04 -f ../predictions/lightfm_title_embeddings3.csv -m "Title embeddings (w2v) submission №3"

100%|██████████████████████████████████████| 35.7M/35.7M [00:08<00:00, 4.38MB/s]
Successfully submitted to Рекомендательная система для статей