In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet


import warnings; warnings.simplefilter('ignore')

# 1. Collaborative Filtering


In [2]:
train_dat = pd.read_csv(os.getcwd() + '/train.csv')

In [3]:
meta_dat = pd.read_csv(os.getcwd() + '/books_metadata.csv')

In [4]:
meta_dat.dropna(subset=['item_id'], inplace=True)


In [5]:
CountsOfReview = meta_dat[meta_dat['CountsOfReview'].notnull()]['CountsOfReview'].astype('int')
Rating = meta_dat[meta_dat['Rating'].notnull()]['Rating'].astype('int')
C = Rating.mean()
C

3.368927466020724

In [6]:
m = CountsOfReview.quantile(0.2)
m

2.0

In [7]:
qualified = meta_dat[(meta_dat['CountsOfReview'] >= m) & (meta_dat['CountsOfReview'].notnull()) & (meta_dat['Rating'].notnull())][['Name', 'pagesNumber', 'Publisher', 'CountsOfReview', 'PublishYear', 'Language','Authors','Rating','item_id']]
qualified['CountsOfReview'] = qualified['CountsOfReview'].astype('int')
qualified['Rating'] = qualified['Rating'].astype('float')
qualified['item_id'] = qualified['item_id'].astype('int')

In [8]:
def weighted_rating(x):
    v = x['CountsOfReview']
    R = x['Rating']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [9]:
qualified['wr'] = qualified.apply(weighted_rating, axis=1)
qualified = qualified.sort_values('wr', ascending=False)

In [10]:
qualified['Name2'] = qualified['Name']

In [11]:
qualified['Name'] = qualified['Name'].astype('string')
qualified['Publisher'] = qualified['Publisher'].astype('string')
qualified['Language'] = qualified['Language'].astype('string')
qualified['Authors'] = qualified['Authors'].astype('string')

In [12]:
qualified['Name'] = qualified['Name'].apply(lambda x: str.lower(x.replace(" ", "")))
qualified['Publisher'] = qualified['Publisher'].fillna('').apply(lambda x: str.lower(x.replace(" ", "")))
qualified['Language'] = qualified['Language'].fillna('').apply(lambda x: str.lower(x.replace(" ", "")))
qualified['Authors'] = qualified['Authors'].fillna('').apply(lambda x: str.lower(x.replace(" ", "")))

In [13]:
import string

qualified['Name'] = qualified['Name'].fillna('').apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
qualified['Publisher'] = qualified['Publisher'].fillna('').apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
qualified['Language'] = qualified['Language'].fillna('').apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
qualified['Authors'] = qualified['Authors'].fillna('').apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

In [14]:
qualified['soup'] = qualified['Name'] + ' ' + qualified['Publisher'] + ' ' + qualified['Language'] + ' '+ qualified['Authors']

In [15]:
qualified['soup']

14698      thecompletecalvinandhobbes andrewsmcmeelpublis...
54922      markoftheliontrilogy tyndalehouse eng francine...
6          harrypotterboxedsetbooks15harrypotter15 schola...
32654                todamafalda edicionesdelaflor spa quino
1593875    harrypotterseriesboxsetharrypotter17 arthurale...
                                 ...                        
372693     sexandthejapanesethesensualsideofjapan tuttlep...
385718                mrsgod simonschusteraudio  peterstraub
1006034               کافهیپریدریایی نشرچشمه per میتراالیاتی
20383      citizengirl washingtonsquarepress enus emmamcl...
642615       somersvsomers severnhousepublishers  julieellis
Name: soup, Length: 37107, dtype: object

In [16]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(qualified['soup'])

In [17]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [18]:
qualified = qualified.reset_index()
book_titles = qualified['Name2']
indices = pd.Series(qualified.index, index=qualified['Name2'])

In [19]:
def get_recommendations(book):
    idx = indices[book]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    book_indices = [i[0] for i in sim_scores]
    return book_titles.iloc[book_indices]

In [20]:
get_recommendations("One Night at the Call Center")

36905                            The 3 Mistakes of My Life
37098                         One Night at the Call Centre
10551                       Black House (The Talisman, #2)
22286             Taltos (Lives of The Mayfair Witches #3)
25037             Taltos (Lives of the Mayfair Witches #3)
26       J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...
10541                 A Kiss of Shadows (Merry Gentry, #1)
23841                                  Expedition To Earth
27865                               The View from Serendip
31987                    1984, Spring: A Choice of Futures
7899       The Vampire Lestat (The Vampire Chronicles, #2)
11444    Interview with the Vampire (The Vampire Chroni...
16730    The Queen of the Damned (The Vampire Chronicle...
18925                                        Cry to Heaven
20383                     The Mummy (Ramses the Damned #1)
23725      The Vampire Armand (The Vampire Chronicles, #6)
31213                   Out of Egypt (Christ the Lord, #

# 2.Collaborative Filtering


In [21]:
pip install scikit-surprise


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Users/k.c/opt/anaconda3/envs/py2/bin/python -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [22]:
from surprise import Reader, Dataset, SVD, KNNBasic,SVDpp
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV,KFold
reader = Reader()

In [23]:
user_data = Dataset.load_from_df(train_dat[['user_id', 'item_id', 'rating']], reader)

In [24]:
# this part has been commented out as it takes quite a long time to run
'''
param_grid = {'n_factors':[50,100,150],'lr_all':[0.005,0.01,0.1], 'reg_all': [0.005,0.01,0.1]}
#kf = KFold(n_splits=5)
grid_search = GridSearchCV(SVDpp,param_grid,measures=['rmse','mae'],cv=5)
#trainset = user_data.build_full_trainset()
grid_search.fit(user_data)
'''

"\nparam_grid = {'n_factors':[50,100,150],'lr_all':[0.005,0.01,0.1], 'reg_all': [0.005,0.01,0.1]}\n#kf = KFold(n_splits=5)\ngrid_search = GridSearchCV(SVDpp,param_grid,measures=['rmse','mae'],cv=5)\n#trainset = user_data.build_full_trainset()\ngrid_search.fit(user_data)\n"

In [25]:
#best_params = grid_search.best_params

In [26]:
#best_params

In [27]:
algo = SVDpp(n_factors=50, lr_all=0.01,reg_all=0.1, reg_pu = 0.1, reg_qi = 0.1)
# Run 5-fold cross-validation and print results. It may take a while!
cross_validate(algo, user_data, measures=['RMSE', 'MAE'], cv=10, verbose=True)

Evaluating RMSE, MAE of algorithm SVDpp on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.8796  0.8765  0.8831  0.8855  0.8833  0.8844  0.8890  0.8808  0.8831  0.8855  0.8831  0.0033  
MAE (testset)     0.6969  0.6978  0.7014  0.7030  0.7007  0.7017  0.7065  0.6988  0.6989  0.7046  0.7010  0.0029  
Fit time          97.73   98.75   99.32   100.70  100.62  101.42  99.47   100.44  101.14  102.11  100.17  1.26    
Test time         3.89    3.83    3.76    3.60    3.75    3.62    3.68    3.65    3.62    3.71    3.71    0.09    


{'test_rmse': array([0.87961197, 0.87651775, 0.88308696, 0.88552219, 0.88330817,
        0.88442277, 0.88898057, 0.88080085, 0.88309033, 0.88551531]),
 'test_mae': array([0.6969338 , 0.69784136, 0.701372  , 0.70302874, 0.70071503,
        0.70171965, 0.7065301 , 0.69876405, 0.69889765, 0.70460416]),
 'fit_time': (97.72826790809631,
  98.75346493721008,
  99.31804275512695,
  100.6995439529419,
  100.61679577827454,
  101.41831493377686,
  99.47268915176392,
  100.43784499168396,
  101.13804388046265,
  102.10948705673218),
 'test_time': (3.8909051418304443,
  3.8285250663757324,
  3.759868860244751,
  3.596759080886841,
  3.745759963989258,
  3.6192309856414795,
  3.6783809661865234,
  3.6499099731445312,
  3.6234359741210938,
  3.7107839584350586)}

In [28]:
trainset = user_data.build_full_trainset()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x177000d90>

# 3. Hybrid Filter

In [29]:
#id_map = train_dat.merge(qualified[['Name2', 'item_id']], on='item_id').set_index('user_id')

In [30]:
test_dat = pd.read_csv(os.getcwd() + '/test.csv')

In [31]:
# Define a function to predict ratings using the content-based filtering method
def predict_content_based(user_id, item_id):
    matching_books = qualified[qualified['item_id'] == item_id]['Name2']
    
    if not matching_books.empty:
        book = matching_books.iloc[0]
        idx = indices[book]
        sim_scores = list(enumerate(cosine_sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:31]
        book_indices = [i[0] for i in sim_scores]

        # Calculate the weighted average of ratings of the top-30 similar books
        similar_books = qualified.iloc[book_indices]
        ratings_sum = sum(similar_books['CountsOfReview'] * similar_books['wr'])
        count_sum = sum(similar_books['CountsOfReview'])
        return ratings_sum / count_sum
    else:
        # Return the mean rating as a fallback if no matching book is found
        return qualified['wr'].mean()


In [32]:
# Set the weight parameter for the hybrid approach (0 to 1)
# Increase the weight to give more importance to collaborative filtering, and vice versa
collab_weight = 1

# Define a function to predict ratings using the hybrid approach
def predict_hybrid(user_id, item_id):
    content_based_rating = predict_content_based(user_id, item_id)
    collaborative_prediction = algo.predict(user_id, item_id)
    collaborative_rating = collaborative_prediction.est
    return (1 - collab_weight) * content_based_rating + collab_weight * collaborative_rating


# Apply the predict_hybrid function to the test_data DataFrame
test_dat['hybrid_predicted_rating'] = test_dat.apply(lambda x: predict_hybrid(x['user_id'], x['item_id']), axis=1)

In [33]:
#test_dat = test_dat.drop('rating', axis=1)

In [34]:
test_dat = test_dat.rename(columns={'hybrid_predicted_rating':'rating'})

In [35]:
kaggle_comp = test_dat[['ID','rating']]


In [36]:
kaggle_comp

Unnamed: 0,ID,rating
0,100000,4.298543
1,100001,4.436031
2,100002,3.510039
3,100003,4.663597
4,100004,3.917604
...,...,...
56194,156194,3.726539
56195,156195,3.407031
56196,156196,4.377438
56197,156197,3.107648


In [37]:
#kaggle_comp = kaggle_comp.drop('rating', axis=1)

In [38]:
kaggle_comp['rating'] = kaggle_comp['rating'].round().astype(int)


In [39]:
kaggle_comp.to_csv('kaggle_comp.csv', index = False)

# 4.Matrix Factorization


## 4.1 Matrix Factorization without bias

In [40]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import implicit
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
import scipy.sparse as sparse
import pandas as pd
import os
import numpy as np

In [41]:
from sklearn.model_selection import train_test_split

train_dat,val_dat = train_test_split(train_dat, test_size=0.1)

In [42]:
def proc_col(col, train_col=None):
    """Encodes a pandas column with continous ids. 
    """
    if train_col is not None:
        uniq = train_col.unique()
    else:
        uniq = col.unique()
    name2idx = {o:i for i,o in enumerate(uniq)}
    return name2idx, np.array([name2idx.get(x, -1) for x in col]), len(uniq)

In [43]:
def encode_data(df, train=None):
    """ Encodes rating data with continous user and item ids. 
    If train is provided, encodes df with the same encoding as train.
    """
    df = df.copy()
    for col_name in ["user_id", "item_id"]:
        train_col = None
        if train is not None:
            train_col = train[col_name]
        _,col,_ = proc_col(df[col_name], train_col)
        df[col_name] = col
        df = df[df[col_name] >= 0]
    return df

In [44]:
train_df = encode_data(train_dat)
val_df = encode_data(val_dat, train_dat)

In [45]:
class MF(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100):
        super(MF, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.user_emb.weight.data.uniform_(0, 0.05)
        self.item_emb.weight.data.uniform_(0, 0.05)
        
    def forward(self, u, v):
        u = self.user_emb(u)
        v = self.item_emb(v)
        return (u*v).sum(1)   

In [46]:
num_users = len(train_df.user_id.unique())
num_items = len(train_df.item_id.unique())
print(num_users, num_items)

4111 82876


In [47]:
mf_model = MF(num_users, num_items, emb_size=100)

In [48]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0, unsqueeze=False):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    model.train()
    for i in range(epochs):
        users = torch.LongTensor(train_df.user_id.values)
        items = torch.LongTensor(train_df.item_id.values)
        ratings = torch.FloatTensor(train_df.rating.values)
        if unsqueeze:
            ratings = ratings.unsqueeze(1)
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(loss.item()) 
    return test_loss(model, unsqueeze)  # Return validation loss

In [49]:
def test_loss(model, unsqueeze=False):
    model.eval()
    users = torch.LongTensor(val_df.user_id.values)
    items = torch.LongTensor(val_df.item_id.values)
    ratings = torch.FloatTensor(val_df.rating.values)
    if unsqueeze:
        ratings = ratings.unsqueeze(1)
    y_hat = model(users, items)
    loss = F.mse_loss(y_hat, ratings)
    print("test loss %.3f " % loss.item())
    return loss.item()  # Return test loss

In [50]:
train_epocs(mf_model, epochs=10, lr=0.1)
train_epocs(mf_model, epochs=15, lr=0.01)
train_epocs(mf_model, epochs=20, lr=0.01)

14.689668655395508
5.876476287841797
1.7981023788452148
4.586118221282959
0.9701341390609741
1.5058661699295044
2.8874874114990234
2.7863314151763916
1.59674072265625
0.7761574387550354
test loss 1.790 
1.3129481077194214
0.7776018977165222
0.5623425245285034
0.5453683137893677
0.6016244888305664
0.650524377822876
0.6631885766983032
0.6428499221801758
0.6050835251808167
0.5659394860267639
0.5362057089805603
0.5196804404258728
0.5142569541931152
0.51470547914505
0.5157244801521301
test loss 0.900 
0.5140005946159363
0.49513471126556396
0.4930191934108734
0.4702049195766449
0.46563073992729187
0.4689072072505951
0.46121951937675476
0.44821521639823914
0.4408150315284729
0.43833717703819275
0.4333813488483429
0.42365914583206177
0.41306930780410767
0.40499448776245117
0.3981654942035675
0.38947856426239014
0.378623366355896
0.36782246828079224
0.35833629965782166
0.3491195738315582
test loss 0.885 


0.8845368027687073

In [51]:
# encode the testing data
df_test_emb = encode_data(test_dat, train_dat)

# get the testing users and items
users = torch.LongTensor(df_test_emb.user_id.values) # .cuda()
items = torch.LongTensor(df_test_emb.item_id.values) # .cuda()

# predict from model
Y_mf = mf_model(users, items)


min_max = MinMaxScaler(feature_range=(1, 5))
res_scaled_mf = min_max.fit_transform(Y_mf.detach().numpy().reshape(-1, 1))[:,0]

# Find the indices that are missing in the original test data
missing_indices_mf = list(set(test_dat.index) - set(df_test_emb.index))

# Fill missing values with mean of predictions
mean_val_mf = np.mean(res_scaled_mf)
for index in missing_indices_mf:
    test_dat.loc[index, 'rating'] = mean_val_mf

test_dat.loc[df_test_emb.index, 'rating'] = res_scaled_mf

In [52]:
test_dat

Unnamed: 0,ID,user_id,item_id,book_name,rating
0,100000,0,406,"Ready Player One (Ready Player One, #1)",3.051794
1,100001,0,4462,The Return of the Indian (The Indian in the Cu...,3.236578
2,100002,0,36746,Give and Take: A Revolutionary Approach to Suc...,2.372277
3,100003,0,5433,"The Return of the King (The Lord of the Rings,...",3.262606
4,100004,0,1010,"Mockingjay (The Hunger Games, #3)",2.861698
...,...,...,...,...,...
56194,156194,2965,36375,نکته‌های ویرایش,2.863262
56195,156195,2965,50982,از ترمه و تغزل,2.270313
56196,156196,2424,57260,"The Seeing Stone (The Spiderwick Chronicles, #2)",3.134187
56197,156197,3077,30587,هیچ‌کس مثل تو مال این‌جا نیست,1.713181


In [53]:
#test_dat = test_dat.drop('predicted_rating', axis=1)

In [54]:
#test_dat = test_dat.drop('rating', axis=1)


In [55]:
kaggle_mf = test_dat[['ID','rating']]

In [56]:
kaggle_mf

Unnamed: 0,ID,rating
0,100000,3.051794
1,100001,3.236578
2,100002,2.372277
3,100003,3.262606
4,100004,2.861698
...,...,...
56194,156194,2.863262
56195,156195,2.270313
56196,156196,3.134187
56197,156197,1.713181


In [57]:
kaggle_mf['rating'] = kaggle_mf['rating'].round().astype(int)

In [58]:
#kaggle_mf.to_csv('kaggle_mf.csv', index = False)

## 4.2 MF with Bias

In [59]:
class MF_bias(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100):
        super(MF_bias, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)
        self.user_emb.weight.data.uniform_(0,0.05)
        self.item_emb.weight.data.uniform_(0,0.05)
        self.user_bias.weight.data.uniform_(-0.01,0.01)
        self.item_bias.weight.data.uniform_(-0.01,0.01)
        
    def forward(self, u, v):
        U = self.user_emb(u)
        V = self.item_emb(v)
        b_u = self.user_bias(u).squeeze()
        b_v = self.item_bias(v).squeeze()
        return (U*V).sum(1) +  b_u  + b_v

In [60]:
bias_model = MF_bias(num_users, num_items, emb_size=100)

In [61]:
train_epocs(bias_model, epochs=10, lr=0.05, wd=1e-5)
train_epocs(bias_model, epochs=10, lr=0.01, wd=1e-5)
train_epocs(bias_model, epochs=10, lr=0.001, wd=1e-5)

14.693587303161621
10.770357131958008
5.522363662719727
1.3792730569839478
1.7133736610412598
3.413313150405884
2.5406646728515625
1.2144581079483032
0.8843367099761963
1.40267813205719
test loss 2.312 
2.075288772583008
1.43146812915802
0.982485294342041
0.7458831667900085
0.679992139339447
0.6935455203056335
0.7077181935310364
0.6919205784797668
0.6546518802642822
0.6190200448036194
test loss 0.915 
0.6037217378616333
0.5953822731971741
0.5889489650726318
0.5841619968414307
0.5807275772094727
0.5783566236495972
0.5767918229103088
0.5758203268051147
0.5752724409103394
0.575016438961029
test loss 0.906 


0.9055701494216919

In [62]:
test_dat = test_dat.drop('rating', axis=1)

In [63]:
Y_bias = bias_model(users, items)

# Scaling between 1-5
min_max = MinMaxScaler(feature_range=(1, 5))
res_scaled_bias = min_max.fit_transform(Y_bias.detach().numpy().reshape(-1, 1))[:,0]

# Find the indices that are missing in the original test data
missing_indices_bias = list(set(test_dat.index) - set(df_test_emb.index))

# Fill missing values with mean of predictions
mean_val_bias = np.mean(res_scaled_bias)
for index in missing_indices_bias:
    test_dat.loc[index, 'rating'] = mean_val_bias


test_dat.loc[df_test_emb.index, 'rating'] = res_scaled_bias


In [64]:
kaggle_bias = test_dat[['ID','rating']]

In [65]:
kaggle_bias['rating'] = kaggle_bias['rating'].round().astype(int)

In [66]:
#kaggle_bias.to_csv('kaggle_bias.csv', index = False)

# 5. Feed Forward Neural Network


In [67]:
class CollabFNet(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100, n_hidden=10):
        super(CollabFNet, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.lin1 = nn.Linear(emb_size*2, n_hidden)
        self.lin2 = nn.Linear(n_hidden, 1)
        self.drop1 = nn.Dropout(0.1)
        
    def forward(self, u, v):
        U = self.user_emb(u)
        V = self.item_emb(v)
        x = F.relu(torch.cat([U, V], dim=1))
        x = self.drop1(x)
        x = F.relu(self.lin1(x))
        x = self.lin2(x)
        return x

In [68]:
net = CollabFNet(num_users, num_items, emb_size=500, n_hidden=10)

optimizer = torch.optim.Adam(net.parameters(), lr=0.001)
loss_fn = nn.MSELoss()

# generated with help of ChatGPT
train_users = torch.LongTensor(train_df.user_id.values)
train_items = torch.LongTensor(train_df.item_id.values)
train_ratings = torch.FloatTensor(train_df.rating.values).unsqueeze(1)  # adding an extra dimension


for epoch in range(16): 
    net.train()
    optimizer.zero_grad()
    
    prediction = net(train_users, train_items)
    loss = loss_fn(prediction, train_ratings)
    
    loss.backward()
    optimizer.step()
   
    
    print(f"Epoch {epoch}, Loss {loss.item()}")

Epoch 0, Loss 14.456596374511719
Epoch 1, Loss 12.329794883728027
Epoch 2, Loss 10.022412300109863
Epoch 3, Loss 7.879215240478516
Epoch 4, Loss 6.004152774810791
Epoch 5, Loss 4.423868656158447
Epoch 6, Loss 3.150796890258789
Epoch 7, Loss 2.1897385120391846
Epoch 8, Loss 1.531598687171936
Epoch 9, Loss 1.1589545011520386
Epoch 10, Loss 1.0394032001495361
Epoch 11, Loss 1.121405839920044
Epoch 12, Loss 1.337876319885254
Epoch 13, Loss 1.6195247173309326
Epoch 14, Loss 1.8937078714370728
Epoch 15, Loss 2.1085140705108643


In [69]:
net.eval() # set the model to evaluation mode
Y_CFN = net(users, items)

# Scaling between 0-1
min_max = MinMaxScaler(feature_range=(1, 5))
res_scaled_CFN = min_max.fit_transform(Y_CFN.detach().numpy())

# Flatten to a 1D array
res_scaled_CFN = res_scaled_CFN.flatten()

# Find the indices that are missing in the original test data
missing_indices_CFN = list(set(test_dat.index) - set(df_test_emb.index))

# Fill missing values with mean of predictions
mean_val_CFN = np.mean(res_scaled_CFN)
for index in missing_indices_CFN:
    test_dat.loc[index, 'predicted_rating'] = mean_val_CFN

test_dat.loc[df_test_emb.index, 'predicted_rating'] = res_scaled_CFN

In [70]:
#test_dat = test_dat.drop('predicted_rating', axis=1)

In [71]:
kaggle_NN = test_dat[['ID','predicted_rating']]

In [72]:
kaggle_NN['predicted_rating'] = kaggle_NN['predicted_rating'].round().astype(int)

In [73]:
kaggle_NN = kaggle_NN.rename(columns={'predicted_rating':'rating'})

In [74]:
#kaggle_NN.to_csv('kaggle_NN.csv', index = False)