In [1]:
import scipy.sparse
import json
import string
import pymorphy2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook
from multiprocessing import Pool, cpu_count

In [2]:
items = pd.read_csv('../data/processed/processed_items.csv', index_col='itemId')
items.head()

Unnamed: 0_level_0,content,image,title
itemId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,согласиться дорогой любитель собака до что же ...,[-0.169 0.129 0.067 0.019 0.281 -0.245 0....,пять забавный морщинистый порода собака
1,контур три поперечный улица состоять до недавн...,[-0.158 -0.112 -0.325 0.05 -0.114 0.002 -0....,история улица ирининский в гомель
2,источник http infodays ru вообще он как то сам...,[ 0.084 -0.181 0.008 0.34 -0.03 -0.197 -0....,зачем дудь весь время спрашивать гость програм...
3,41 летний светлана зейналов решить окрестить 5...,[ 0.034 -0.119 -0.062 0.025 0.128 -0.041 0....,светлана зейналов крестить младший дочь
4,организовать преступный группировка гбао делат...,[-0.061 -0.015 -0.198 -0.047 0.054 0.029 -0....,гкнб бандит в гбао делать вид что расстаться с...


In [3]:
ratings = []
users = []
items = []

train_lines = sum(1 for line in open('../data/interim/train.json','r'))

with open('../data/interim/train.json') as train_file:
    for i, line in enumerate(tqdm_notebook(train_file, total=train_lines)):
        json_line = json.loads(line)
        for item, rating in json_line['trainRatings'].items():
            ratings.append(rating)
            users.append(i)
            items.append(int(item))

train_data = pd.DataFrame({'userId': users, 'itemId': items, 'rating': ratings})
train_data.head()

HBox(children=(IntProgress(value=0, max=42977), HTML(value='')))




Unnamed: 0,userId,itemId,rating
0,0,206495,0
1,0,279694,0
2,0,19718,0
3,0,74707,0
4,0,221548,0


In [7]:
del ratings
del users
del items

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [6]:
user_enc = LabelEncoder()
train_data['user'] = user_enc.fit_transform(train_data['userId'].values)
n_users = train_data['user'].nunique()

item_enc = LabelEncoder()
train_data['item'] = item_enc.fit_transform(train_data['itemId'].values)
n_movies = train_data['item'].nunique()

train_data['rating'] = train_data['rating'].values.astype(np.float32)
min_rating = min(train_data['rating'])
max_rating = max(train_data['rating'])

n_users, n_movies, min_rating, max_rating

(42977, 242356, 0.0, 1.0)

In [12]:
X = train_data[['user', 'item']].values
y = train_data['rating'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((61002151, 2), (6778017, 2), (61002151,), (6778017,))

In [13]:
n_factors = 30

X_train_array = [X_train[:, 0], X_train[:, 1]]
X_test_array = [X_test[:, 0], X_test[:, 1]]

In [28]:
from keras.models import Model
from keras.layers import Input, Reshape, Dot
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.regularizers import l2
from keras.metrics import binary_accuracy
from keras.callbacks import EarlyStopping

def RecommenderV1(n_users, n_movies, n_factors):
    user = Input(shape=(1,))
    u = Embedding(n_users, n_factors, embeddings_initializer='he_normal',
                  embeddings_regularizer=l2(1e-6))(user)
    u = Reshape((n_factors,))(u)
    
    movie = Input(shape=(1,))
    m = Embedding(n_movies, n_factors, embeddings_initializer='he_normal',
                  embeddings_regularizer=l2(1e-6))(movie)
    m = Reshape((n_factors,))(m)
    
    x = Dot(axes=1)([u, m])

    model = Model(inputs=[user, movie], outputs=x)
    opt = Adam(lr=0.001)
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[binary_accuracy])

    return model

In [29]:
model = RecommenderV1(n_users, n_movies, n_factors)
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 1, 30)        1289310     input_5[0][0]                    
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 1, 30)        7270680     input_6[0][0]                    
____________________________________________________________________________________________

In [31]:
import warnings
warnings.filterwarnings('ignore')

In [37]:
history = model.fit(x=X_train_array, y=y_train, batch_size=64, epochs=10,
                    verbose=1, validation_data=(X_test_array, y_test), 
                    callbacks=[EarlyStopping(patience=3)])

Train on 61002151 samples, validate on 6778017 samples
Epoch 1/10
  457664/61002151 [..............................] - ETA: 29:56:58 - loss: 1.4236 - binary_accuracy: 0.9056

KeyboardInterrupt: 