In [1]:
import scipy.sparse
import json
import string
import pymorphy2
import gc
import gensim.models.keyedvectors as word2vec

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer

from fse.models import SIF

from tqdm import tqdm_notebook
from multiprocessing import Pool, cpu_count

In [2]:
# Load pretrained word vectors stored in the binary word2vec format.
# NOTE(review): judging by the directory name this is a 300-dimensional skip-gram
# model whose tokens carry UPOS suffixes — confirm against the model's documentation.
path = '../../data/external/ruwikiruscorpora_upos_skipgram_300_2_2019/model.bin'
w2v_model = word2vec.KeyedVectors.load_word2vec_format(path, binary=True)

In [3]:
# Characters that are replaced by a space before tokenisation: ASCII punctuation
# plus guillemets, the newline character and a few hyphen/dash characters.
punctuation = string.punctuation + '«»\n--––'
# Translation table sending each of the characters above to a single space.
mapping = str.maketrans(punctuation, ' ' * len(punctuation))
# Morphological analyzer shared by every normalize_text call.
ma = pymorphy2.MorphAnalyzer()

def normalize_text(s):
    """Return ``s`` with punctuation removed, lower-cased and lemmatized.

    Characters listed in the module-level ``mapping`` table are turned into
    spaces, the result is lower-cased and split on whitespace, and every token
    is replaced by the first normal form reported by the ``ma`` analyzer.
    The lemmas are joined back with single spaces.
    """
    tokens = s.translate(mapping).lower().split()
    lemmas = []
    for token in tokens:
        lemmas.append(ma.normal_forms(token)[0])
    return " ".join(lemmas)

def normalize_line(line, image_dim=96):
    """Parse one JSON line describing an item and normalize its fields.

    Parameters
    ----------
    line : str
        A JSON object with at least the ``content`` and ``title`` keys and,
        optionally, an ``image`` key.
    image_dim : int, optional
        Length of the zero vector substituted when the item has no image
        embedding (default 96, the value previously hard-coded).

    Returns
    -------
    dict
        The parsed item with ``content`` and ``title`` passed through
        ``normalize_text`` and ``image`` converted to a numpy array.
    """
    item = json.loads(line)
    item['content'] = normalize_text(item['content'])
    item['title'] = normalize_text(item['title'])
    image = item.get('image')
    # A bare float (json.loads turns the literal NaN into one), a JSON null or a
    # missing key all mean "no image"; use a float zero vector so the placeholder
    # has the same dtype family as real embeddings.
    if image is None or isinstance(image, float):
        item['image'] = np.zeros(image_dim, dtype=np.float64)
    else:
        item['image'] = np.array(image)
    return item

In [4]:
# Load the already-processed items table, using the itemId column as the index.
items = pd.read_csv('../../data/processed/processed_items.csv', index_col='itemId')

In [5]:
# Disabled code that builds the items frame from raw JSON lines with
# normalize_line in a process pool; the processed CSV is loaded instead.
# NOTE(review): `items.set_index('itemId')` below discards its result — assign it
# back if this code is ever re-enabled.
# with open('items.json') as items_json:
#     with Pool(cpu_count()) as pool:
#         items_json_list = list(pool.imap(normalize_line, items_json))
        
# items = pd.DataFrame(items_json_list)
# items.set_index('itemId')
# Preview the first rows of the loaded frame.
items.head()

Unnamed: 0_level_0,content,image,title
itemId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,согласиться дорогой любитель собака до что же ...,[-0.169 0.129 0.067 0.019 0.281 -0.245 0....,пять забавный морщинистый порода собака
1,контур три поперечный улица состоять до недавн...,[-0.158 -0.112 -0.325 0.05 -0.114 0.002 -0....,история улица ирининский в гомель
2,источник http infodays ru вообще он как то сам...,[ 0.084 -0.181 0.008 0.34 -0.03 -0.197 -0....,зачем дудь весь время спрашивать гость програм...
3,41 летний светлана зейналов решить окрестить 5...,[ 0.034 -0.119 -0.062 0.025 0.128 -0.041 0....,светлана зейналов крестить младший дочь
4,организовать преступный группировка гбао делат...,[-0.061 -0.015 -0.198 -0.047 0.054 0.029 -0....,гкнб бандит в гбао делать вид что расстаться с...


In [6]:
import nltk
# Make sure the NLTK stopword corpus is available locally before importing it.
nltk.download('stopwords')
#--------#

from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/valeriy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Tokenise titles on whitespace. Only plain strings are split, so re-running this
# cell leaves already-tokenised titles (lists) and missing titles (NaN) untouched
# instead of wiping them out.
items['title'] = items['title'].map(
    lambda title: title.split() if isinstance(title, str) else title
)
# items['content'] = items['content'].str.split()

In [8]:
# Plain Python list of the per-item title values, in frame order.
titles = items['title'].tolist()

In [9]:
from pymystem3 import Mystem

# Maps the part-of-speech label taken from a Mystem grammar string to the tag
# that `tag` appends to the lemma (lemma_TAG).
# NOTE(review): the target labels look like Universal POS tags matching the
# vocabulary of the loaded word-vector model — confirm.
conversion_table = {
    'A': 'ADJ',
    'ADV': 'ADV',
    'ADVPRO': 'ADV',
    'ANUM': 'ADJ',
    'APRO': 'DET',
    'COM': 'ADJ',
    'CONJ': 'SCONJ',
    'INTJ': 'INTJ',
    'NONLEX': 'X',
    'NUM': 'NUM',
    'PART': 'PART',
    'PR': 'ADP',
    'S': 'NOUN',
    'SPRO': 'PRON',
    'UNKN': 'X',
    'V': 'VERB'
}

# Single Mystem instance reused by every `tag` call.
m = Mystem()

def tag(word='пожар'):
    """Return ``word`` as a ``lemma_POS`` key, or ``None`` if it cannot be analysed.

    The word is analysed with the module-level Mystem instance ``m``. The lemma
    and the part of speech of the first analysis are taken, and the part of
    speech is translated through ``conversion_table``.

    Parameters
    ----------
    word : str
        A single token.

    Returns
    -------
    str or None
        ``"<lemma>_<POS>"``, or ``None`` when the analyzer returns nothing for
        the token or the token carries no analysis. A part of speech missing
        from ``conversion_table`` is reported as ``X``, the label the table
        already uses for unknown words.
    """
    analyses = m.analyze(word)
    # Guard against an empty analyzer result instead of failing on [0].
    if not analyses:
        return None
    parses = analyses[0].get('analysis')
    if not parses:
        return None
    best = parses[0]
    lemma = best['lex'].lower().strip()
    # The part of speech is the leading field of the grammar string, terminated
    # by either ',' or '='.
    mystem_pos = best['gr'].split(',')[0].split('=')[0].strip()
    # Fall back to 'X' so an unmapped label cannot produce `str + None`.
    pos = conversion_table.get(mystem_pos, 'X')
    return lemma + '_' + pos

In [10]:
# Russian stopword list from NLTK, stored as a set for membership tests.
russian_stopwords = set(stopwords.words("russian"))

In [11]:
from collections import defaultdict

In [12]:
# Relative frequency of every in-vocabulary, non-stopword title token, keyed by
# the token's index in the word-vector vocabulary.
sif = defaultdict(int)
total_words = 0

for title in tqdm_notebook(titles):
    # Titles that are floats (NaN placeholders) carry no tokens.
    if isinstance(title, float):
        continue
    for word in title:
        # Every token counts towards the denominator, including skipped ones.
        total_words += 1
        # Test stopwords first: they are discarded anyway, so there is no need
        # to run the analyzer on them.
        if word in russian_stopwords:
            continue
        tagged = tag(word)
        if tagged is None or tagged not in w2v_model:
            continue
        sif[w2v_model.wv.vocab[tagged].index] += 1
# Turn raw counts into frequencies relative to the number of tokens seen.
sif = {word_id: num_occur / total_words for word_id, num_occur in sif.items()}

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=328050.0), HTML(value='')))

  del sys.path[0]





In [13]:
gc.collect()

11

In [14]:
len(sif)

35334

In [15]:
def sif_embeddings(sentences, model, alpha=1e-3):
    """Build one SIF-weighted average word vector per tokenised sentence.

    Each token is converted by ``tag`` into the ``lemma_POS`` key used by the
    word-vector vocabulary. Tokens that cannot be tagged, are missing from the
    vocabulary, or have no entry in the module-level ``sif`` frequency table
    are ignored. Every remaining token gets the smooth-inverse-frequency weight
    ``alpha / (alpha + p(w))``, where ``p(w)`` is the relative frequency stored
    in ``sif``. The sentence vector is the weighted sum of the word vectors
    divided by the number of contributing tokens.

    Parameters
    ----------
    sentences : iterable
        Token lists. An entry that is a float (a NaN placeholder) produces an
        all-zero vector.
    model : object
        Word-vector model exposing ``wv.vocab`` (entries with an ``index``
        attribute) and ``wv.vectors`` (2-D array of word vectors).
    alpha : float, optional
        SIF smoothing constant; smaller values down-weight frequent words more
        strongly.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(number of sentences, vector size)``.
    """
    keyed_vectors = model.wv
    vocab = keyed_vectors.vocab
    vectors = keyed_vectors.vectors
    # Take the vector size from the model instead of hard-coding it.
    dim = vectors.shape[1]
    output = []
    for s in tqdm_notebook(sentences):
        if isinstance(s, float):
            output.append(np.zeros((dim,)))
            continue
        idx = []
        weights = []
        for w in s:
            # Tag each token once; the analyzer call is the expensive part.
            tagged = tag(w)
            if tagged is None or tagged not in vocab:
                continue
            word_id = vocab[tagged].index
            freq = sif.get(word_id)
            # Tokens without a recorded frequency contribute nothing and are
            # not counted in the average.
            if freq is None:
                continue
            idx.append(word_id)
            # Rare words get weights close to 1, frequent words close to 0.
            weights.append(alpha / (alpha + freq))
        if idx:
            v = (np.asarray(weights) @ vectors[idx]) / len(idx)
        else:
            v = np.zeros((dim,))
        output.append(v)
    # np.vstack rejects an empty list, so handle "no sentences" explicitly.
    if not output:
        return np.zeros((0, dim), dtype=np.float32)
    return np.vstack(output).astype(np.float32)

In [16]:
# Embed every title with the loaded word-vector model.
title_embs = sif_embeddings(titles, w2v_model)

# Keep the item count before the frame is deleted; it is needed to size the
# identity block of the item-feature matrix.
items_num = items.shape[0]
# Drop the large intermediates and ask the garbage collector to run.
del titles, items, sif, w2v_model
gc.collect()

  """
  
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=328050.0), HTML(value='')))

  del sys.path[0]
  app.launch_new_instance()





4

In [5]:
# Reload title embeddings from disk, replacing any in-memory `title_embs`.
# NOTE(review): no visible cell writes 'title_embeddings.np.npy' — confirm where
# this file is produced so the notebook can be re-run from a fresh kernel.
title_embs = np.load('title_embeddings.np.npy')
title_embs.shape

(328050, 300)

In [12]:
# Append one all-zero 300-dimensional row after the last title embedding.
# NOTE(review): presumably a placeholder row for an unknown/padding item —
# confirm with whatever consumes the saved file.
title_embs_w2v = np.concatenate((title_embs, np.zeros((1, 300))))
# np.save adds the '.npy' suffix to the given file name.
np.save('title_embeddings_w2v', title_embs_w2v)

In [17]:
# Item feature matrix in CSR format: an identity block (one indicator column per
# item) followed by the columns of the title embedding.
item_features = scipy.sparse.hstack((scipy.sparse.eye(items_num), 
                                     scipy.sparse.csr_matrix(title_embs)),
                                    format='csr')

In [8]:
# Build the user x item interaction matrix from train.json: one JSON object per
# line (one line per user row), whose 'trainRatings' maps item ids to ratings.
data = []
row = []
col = []

# Count the lines first so the progress bar knows its total; the `with` block
# guarantees the file handle is closed again.
with open('train.json', 'r') as train_file:
    train_lines = sum(1 for _ in train_file)

with open('train.json') as train_file:
    for i, line in enumerate(tqdm_notebook(train_file, total=train_lines)):
        json_line = json.loads(line)
        for item, rating in json_line['trainRatings'].items():
            # Rescale the rating r to 2r - 1 (0 -> -1, 1 -> +1).
            data.append(2 * int(rating) - 1)
            row.append(i)
            col.append(int(item))
train_int = scipy.sparse.coo_matrix((data, (row, col)))
# The coordinate lists are no longer needed once the matrix exists.
del data, row, col
gc.collect()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=42977.0), HTML(value='')))




132

In [21]:
# Persist the sparse item-feature matrix so it can be reloaded without rebuilding.
scipy.sparse.save_npz('item_features_embedding.npz', item_features)

In [2]:
# Restore the sparse item-feature matrix from the file written by save_npz.
item_features = scipy.sparse.load_npz("item_features_embedding.npz")

In [3]:
item_features.shape

(328050, 328350)

In [6]:
import lightfm

In [None]:
# LightFM model with 64 latent components and logistic loss, trained on the
# interaction matrix with the item-feature matrix as side information.
# random_state is fixed for repeatability.
model = lightfm.LightFM(no_components=64, loss='logistic', learning_schedule='adadelta', random_state=42)
# Train for 7 epochs using every available CPU core.
model.fit(train_int, epochs=7, num_threads=cpu_count(), item_features=item_features, verbose=True)

Epoch 0


In [None]:
# Score every (userId, itemId) pair of the benchmark file, order each user's
# items from the highest to the lowest predicted score, and write the ranking
# without the score column.
sample = pd.read_csv('random_benchmark.csv')
sample['pred'] = model.predict(
    sample['userId'].values,
    sample['itemId'].values,
    item_features=item_features,
    num_threads=cpu_count(),
)
sample = (
    sample
    .sort_values(['userId', 'pred'], ascending=[True, False])
    .drop(columns=['pred'])
)
sample.to_csv('lightfm_title_embedding_log.csv', index=False)

In [None]:
# Upload the ranking file to the Kaggle competition.
# NOTE(review): the submission message says "5 epochs" while the visible fit call
# uses epochs=7 — confirm which one describes the submitted model.
!kaggle competitions submit -c 2018-hse-ml-competition-04 -f lightfm_title_embedding_log.csv -m "Title embedding log loss 5 epochs no_components=64"