#### Load Libraries

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import random
import pandas as pd
import numpy as np
import pickle

In [2]:
df = pd.read_csv('training.csv')
item = pd.read_csv('item_feature.csv')
df.head()

Unnamed: 0,user_id,item_id,context_feature_id
0,0,28366,2
1,0,16109,2
2,0,11500,3
3,0,20750,2
4,0,8759,2


#### Get frequency of context feature id categories

In [59]:
df.context_feature_id.value_counts()

2    485857
1    270187
3    145267
0     68934
Name: context_feature_id, dtype: int64

#### Assign rating label 1 to users who interacted with the item

In [60]:
df['rating'] = 1
df.head()

Unnamed: 0,user_id,item_id,context_feature_id,rating
0,0,28366,2,1
1,0,16109,2,1
2,0,11500,3,1
3,0,20750,2,1
4,0,8759,2,1


#### User oriented negative sampling 

In [62]:
unique_users = df.user_id.unique()
unique_items = df.item_id.unique()
d = {}
for i in unique_users:
    user_rated_items = np.array(df[df['user_id']==i]['item_id'])
    n_items = len(user_rated_items)
    unrated_items = np.setdiff1d(unique_items,user_rated_items)
    select_unrated_items = np.random.choice(unrated_items,size = n_items*2, replace=False)
    d[i] = select_unrated_items

output = pd.DataFrame([d]).transpose()
output.insert(0, column="user_id",value = output.index)
# run and save to pickle



#### Assing rating label 0

In [63]:
output.columns = ['user_id','item_id']
output = output.explode('item_id')
output.reset_index(drop=True, inplace=True)
output['rating'] = 0

In [64]:
output['context_feature_id'] = 4
output

Unnamed: 0,user_id,item_id,rating,context_feature_id
0,0,32116,0,4
1,0,15182,0,4
2,0,37133,0,4
3,0,4940,0,4
4,0,21859,0,4
...,...,...,...,...
1940485,200152,17833,0,4
1940486,200152,32288,0,4
1940487,200152,30511,0,4
1940488,200152,9574,0,4


#### Concatenate the two dataframes

In [65]:
#join datatset to create pd
sampled_df = pd.concat([df,output]).reset_index(drop=True)

In [68]:
merged_df = sampled_df.merge(item,how = 'left')
data = merged_df

In [110]:
data.rating.value_counts()

0    970245
1    970245
Name: rating, dtype: int64

#### Store the dataframe in a pickle file

In [63]:
# store data
f = open("features_data.pkl","wb")

# # write the python object (dict) to pickle file
pickle.dump(data,f)

# # close file
f.close()

In [3]:
# bring pickle data in
with open('features_data.pkl', 'rb') as f:
    data = pickle.load(f)

#### Change column data types

In [69]:
for i in data.columns:
    data[i] = data[i].astype('int64')

In [70]:
x_train, x_val = train_test_split(data, test_size=0.3)

In [6]:
from torch.utils.data import Dataset, DataLoader

#### Define the model architecture

In [71]:
class model(nn.Module):
    def __init__(self, num_users, num_items, num_feature_id, emb_size=100, lin_neuron=20):
        # deleted context
        super(model, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.feature_emb = nn.Embedding(num_feature_id, emb_size)
        self.linear1 = nn.Linear(emb_size*3, lin_neuron)
        self.linear2 = nn.Linear(lin_neuron, 1)
        ## increased dropouts
        self.drop1 = nn.Dropout(0.3)
        self.drop2 = nn.Dropout(0.0)
        self.dense_bn = nn.BatchNorm1d(lin_neuron)
        
    def forward(self, user, item, feature):
        u = self.user_emb(user)
        i = self.item_emb(item)
        f = self.feature_emb(feature)
        
        x = torch.cat([u,i,f],dim=1) 
        x = self.drop1(x)
        x = F.relu(self.dense_bn(self.linear1(x)))
        x = self.drop2(x)
        x = self.linear2(x)
        return x

#### Function to compute Validation Metrics

In [72]:
def valid_metrics(model, valid_df):
    """Computes validation loss and accuracy"""
    model.eval()
    u = torch.LongTensor(valid_df.user_id.values)
    i = torch.LongTensor(valid_df.item_id.values)
    f = torch.LongTensor(valid_df.item_feature_id.values)
    y = torch.FloatTensor(valid_df.rating.values).unsqueeze(1)

    y_hat = model(u,i,f) #deleted c
    valid_loss = F.binary_cross_entropy_with_logits(y_hat,y)
    
    return valid_loss.item()

#### Training Method

In [73]:
def train(model, train_df, valid_df, epochs = 20, lr=0.01, wd= 0.0):
    optimizer = torch.optim.Adam(model.parameters(), lr= lr, weight_decay= wd)
    for i in range(epochs):
        model.train()

        u = torch.LongTensor(train_df.user_id.values)
        i = torch.LongTensor(train_df.item_id.values)
        f = torch.LongTensor(train_df.item_feature_id.values)
        y = torch.FloatTensor(train_df.rating.values).unsqueeze(1)
        
        y_hat = model(u,i,f) #deleted c
        train_loss = F.binary_cross_entropy_with_logits(y_hat,y)
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        
        valid_loss = valid_metrics(model, valid_df)
        print("train loss %.3f valid loss %.3f" % (train_loss.item(), valid_loss)) 

#### Intitalize num_users, num_items, num_feature

In [74]:
num_users = max(x_train.user_id.values+1)
num_items = max(x_train.item_id.values+1)
num_feature = max(x_train.item_feature_id.values+1)

In [75]:
# emb_size = 40, neuron = 80 for 10 epochs, lr = 0.01, wd = 0.00001
emb_model = model(num_users, num_items, num_feature, emb_size = 30, lin_neuron= 80)
train(emb_model, x_train, x_val, epochs=10, lr=.1, wd=1e-5)

train loss 0.731 valid loss 1.318
train loss 1.198 valid loss 1.216
train loss 0.833 valid loss 0.740
train loss 0.618 valid loss 0.655
train loss 0.679 valid loss 0.805
train loss 0.734 valid loss 0.653
train loss 0.672 valid loss 0.558
train loss 0.611 valid loss 0.555
train loss 0.579 valid loss 0.568
train loss 0.561 valid loss 0.565


In [76]:
train(emb_model, x_train, x_val, epochs=10, lr=.1, wd=1e-5)

train loss 0.546 valid loss 0.462
train loss 0.482 valid loss 0.366
train loss 0.389 valid loss 0.385
train loss 0.348 valid loss 0.464
train loss 0.332 valid loss 0.470
train loss 0.320 valid loss 0.431
train loss 0.315 valid loss 0.396
train loss 0.314 valid loss 0.376
train loss 0.314 valid loss 0.368
train loss 0.310 valid loss 0.365


In [77]:
train(emb_model, x_train, x_val, epochs=5, lr=.01, wd=1e-5)

train loss 0.307 valid loss 0.346
train loss 0.303 valid loss 0.338
train loss 0.301 valid loss 0.333
train loss 0.300 valid loss 0.326
train loss 0.299 valid loss 0.320


In [78]:
train(emb_model, x_train, x_val, epochs=5, lr=.01, wd=1e-5)

train loss 0.298 valid loss 0.313
train loss 0.298 valid loss 0.313
train loss 0.297 valid loss 0.311
train loss 0.297 valid loss 0.309
train loss 0.296 valid loss 0.306


In [79]:
train(emb_model, x_train, x_val, epochs=5, lr=.01, wd=1e-5)

train loss 0.295 valid loss 0.305
train loss 0.295 valid loss 0.304
train loss 0.294 valid loss 0.304
train loss 0.293 valid loss 0.303
train loss 0.292 valid loss 0.302


In [80]:
train(emb_model, x_train, x_val, epochs=5, lr=.01, wd=1e-5)

train loss 0.291 valid loss 0.303
train loss 0.290 valid loss 0.302
train loss 0.288 valid loss 0.302
train loss 0.286 valid loss 0.302
train loss 0.284 valid loss 0.302


In [190]:
# emb_size = 40, neuron = 80 for 10 epochs, lr = 0.01, wd = 0.00001
emb_model = model(num_users, num_items, num_context, num_feature,
                    emb_size = 40, lin_neuron= 80)
train(emb_model, x_train, x_val, epochs=10, lr=.01, wd=1e-5)

train loss 0.777 valid loss 0.518
train loss 0.456 valid loss 0.364
train loss 0.291 valid loss 0.248
train loss 0.194 valid loss 0.163
train loss 0.133 valid loss 0.104
train loss 0.093 valid loss 0.064
train loss 0.066 valid loss 0.039
train loss 0.047 valid loss 0.024
train loss 0.035 valid loss 0.015
train loss 0.026 valid loss 0.009


In [185]:
train(emb_model, x_train, x_val, epochs=10, lr=.01, wd=0)

train loss 0.231 valid loss 0.059
train loss 0.202 valid loss 0.043
train loss 0.178 valid loss 0.030
train loss 0.159 valid loss 0.020
train loss 0.144 valid loss 0.013
train loss 0.132 valid loss 0.008
train loss 0.121 valid loss 0.005
train loss 0.112 valid loss 0.004
train loss 0.105 valid loss 0.002
train loss 0.099 valid loss 0.002


In [64]:
# higher learning rate
emb_model = model(num_users, num_items, num_context, num_feature,
                    emb_size = 20, lin_neuron= 50)
train(emb_model, train_dl, valid_dl, epochs=5, lr=0.1, wd=1e-5)

train loss 0.667 valid loss 0.365
train loss 0.232 valid loss 0.094
train loss 0.014 valid loss 0.007
train loss 0.000 valid loss 0.001
train loss 0.000 valid loss 0.000


In [81]:
torch.save(emb_model.state_dict(), 'models/nn_emb_new_neg_sample.pth')

In [82]:
test = pd.read_csv('test_kaggle.csv')

In [83]:
test_merged = test.merge(item,how = 'left')

#### Prediction

In [84]:
test_users = torch.LongTensor(test_merged.user_id.values)
test_items = torch.LongTensor(test_merged.item_id.values) 
test_context = torch.LongTensor(test_merged.context_feature_id.values)
test_feature = torch.LongTensor(test_merged.item_feature_id.values)
y_hat = emb_model(test_users, test_items, test_feature)

In [88]:
torch.max(torch.sigmoid(y_hat)),torch.min(torch.sigmoid(y_hat))

(tensor(0.9979, grad_fn=<MaxBackward1>),
 tensor(0.0009, grad_fn=<MinBackward1>))

In [86]:
test['rating'] = torch.sigmoid(y_hat).detach().numpy()
submission_df = test[['id','rating']].copy()
submission_df.to_csv('nn_emb_new_sample.csv', index=False)