In [0]:
# worked in colab
# from google.colab import drive
# drive.mount("/content/drive", force_remount=True)

In [0]:
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [0]:
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from multiprocessing import Pool, cpu_count

In [0]:
!cp 'drive/My Drive/news_rec_2020/train.zip' .
!cp 'drive/My Drive/news_rec_2020/processed_items.zip' .
!cp 'drive/My Drive/news_rec_2020/test.zip' .
!unzip -q 'train.zip'
!unzip -q 'processed_items.zip'
!unzip -q 'test.zip'

In [0]:
from tqdm import tqdm_notebook as tqdm

In [0]:
ratings = []
users = []
items = []

train_lines = sum(1 for line in open('train.json','r'))

with open('train.json') as train_file:
    for i, line in enumerate(tqdm(train_file, total=train_lines)):
        json_line = json.loads(line)
        for item, rating in json_line['trainRatings'].items():
            ratings.append(rating)
            users.append(i)
            items.append(int(item))

train_data = pd.DataFrame({'userId': users, 'itemId': items, 'rating': ratings})
train_data.head()

HBox(children=(IntProgress(value=0, max=42977), HTML(value='')))




Unnamed: 0,userId,itemId,rating
0,0,206495,0
1,0,279694,0
2,0,19718,0
3,0,74707,0
4,0,221548,0


In [0]:
del ratings
del users
del items

In [0]:
import torch
import gc
gc.collect()

0

In [0]:
class RatingDataset(torch.utils.data.Dataset):
    def __init__(self, train, label):
        self.feature_ = train
        self.label_ = label

    def __len__(self):
        return len(self.feature_)

    def __getitem__(self, idx):
        return torch.tensor(self.feature_[idx]), torch.tensor(self.label_[idx])

In [0]:
test = pd.read_csv('solution.csv')
test.head()

Unnamed: 0,userId,itemId,rating
0,1,43614,0
1,1,1397,0
2,1,171407,0
3,1,134305,0
4,1,2469,0


In [0]:
X_columns = ['userId', 'itemId']
y_column = 'rating'

In [0]:
from sklearn.preprocessing import LabelEncoder

In [0]:
user_enc = LabelEncoder()
user_enc.fit(np.concatenate((train_data['userId'].values, test['userId'].values)))
train_data['userId'] = user_enc.transform(train_data['userId'].values)
test['userId'] = user_enc.transform(test['userId'].values)

item_enc = LabelEncoder()
item_enc.fit(np.concatenate((train_data['itemId'].values, test['itemId'].values)))
train_data['itemId'] = item_enc.transform(train_data['itemId'].values)
test['itemId'] = item_enc.transform(test['itemId'].values)

train_data['rating'] = train_data['rating'].values.astype(np.float32)
test['rating'] = test['rating'].values.astype(np.float32)

In [0]:
batch_size = 512
train_dataloader = torch.utils.data.DataLoader(RatingDataset(train_data[X_columns].values, train_data[y_column].values), 
                                               batch_size=batch_size, 
                                               shuffle=True)
test_dataloader = torch.utils.data.DataLoader(RatingDataset(test[X_columns].values, test[y_column].values), 
                                              batch_size=batch_size)

In [0]:
n_users = pd.concat((train_data['userId'], test['userId'])).nunique()
n_items = pd.concat((train_data['itemId'], test['itemId'])).nunique()
n_users, n_items

(42977, 326921)

In [0]:
gc.collect()

0

In [0]:
class MatrixFactorization(torch.nn.Module):
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        self.user_biases = torch.nn.Embedding(n_users, 1)
        self.item_biases = torch.nn.Embedding(n_items, 1)
        torch.nn.init.xavier_uniform_(self.user_factors.weight)
        torch.nn.init.xavier_uniform_(self.item_factors.weight)
        self.user_biases.weight.data.fill_(0.)
        self.item_biases.weight.data.fill_(0.)
        
    def forward(self, user, item):
        pred = self.user_biases(user) + self.item_biases(item)
        pred += (self.user_factors(user) * self.item_factors(item)).sum(1, keepdim=True)
        pred = torch.sigmoid(pred.squeeze())
        return pred

n_factors = 20
model = MatrixFactorization(n_users, n_items, n_factors=n_factors)

In [0]:
model

MatrixFactorization(
  (user_factors): Embedding(42977, 20)
  (item_factors): Embedding(326921, 20)
  (user_biases): Embedding(42977, 1)
  (item_biases): Embedding(326921, 1)
)

In [0]:
import numpy as np


def dcg_at_k(r, k, method=0):
    """Score is discounted cumulative gain (dcg)
    Relevance is positive real values.  Can use binary
    as the previous methods.
    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> dcg_at_k(r, 1)
    3.0
    >>> dcg_at_k(r, 1, method=1)
    3.0
    >>> dcg_at_k(r, 2)
    5.0
    >>> dcg_at_k(r, 2, method=1)
    4.2618595071429155
    >>> dcg_at_k(r, 10)
    9.6051177391888114
    >>> dcg_at_k(r, 11)
    9.6051177391888114
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Discounted cumulative gain
    """
    r = np.asfarray(r)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.


def ndcg_score_solo(r, k=20, method=1):
    """Score is normalized discounted cumulative gain (ndcg)
    Relevance is positive real values.  Can use binary
    as the previous methods.
    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> ndcg_at_k(r, 1)
    1.0
    >>> r = [2, 1, 2, 0]
    >>> ndcg_at_k(r, 4)
    0.9203032077642922
    >>> ndcg_at_k(r, 4, method=1)
    0.96519546960144276
    >>> ndcg_at_k([0], 1)
    0.0
    >>> ndcg_at_k([1], 2)
    1.0
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Normalized discounted cumulative gain
    """

    dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k, method) / dcg_max


def ndcg_score(actual, predicted, user_column='userId', item_column='itemId'):
    """
    Calculating NDCG@20
    :param actual: ground truth dataframe
    :param predicted: model prediction dataframe
    :param user_column: column name containing users' ids
    :param item_column: column name containing items' ids
    :return: ndcg score
    """
    assert actual.shape[0] == predicted.shape[0]
    res_ndcg = 0.0

    for userId in actual[user_column].unique():
        actual_pos = {}

        for index, row in actual[actual[user_column] == userId].iterrows():
            actual_pos[row[item_column]] = row['rating']

        predicted_order = predicted[predicted[user_column] == userId][item_column]
        predicted_ranks = [actual_pos[item] for item in predicted_order]
        res_ndcg += ndcg_score_solo(predicted_ranks)

    res_ndcg /= actual[user_column].nunique()
    return res_ndcg


In [0]:
from tqdm import tqdm_notebook as tqdm
from IPython.display import clear_output

In [0]:
dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

loss_func = torch.nn.BCELoss()
model.to(dev)

epochs = 1000
for epoch in range(0, epochs):
    count = 0
    cum_loss = 0.
    for i, (train_batch, label_batch) in enumerate(train_dataloader):
        count = 1 + i
        # Predict and calculate loss for user factor and bias
        optimizer = torch.optim.Adam([model.user_biases.weight, model.user_factors.weight], weight_decay=1e-5) 
        prediction = model(train_batch[:,0].to(dev), train_batch[:,1].to(dev))
        loss = loss_func(prediction, label_batch.to(dev))    
        # Backpropagate
        loss.backward()
        
        # Update the parameters
        optimizer.step()
        optimizer.zero_grad()
        
        #predict and calculate loss for item factor and bias
        optimizer = torch.optim.Adam([model.item_biases.weight, model.item_factors.weight], weight_decay=1e-5) 
        prediction = model(train_batch[:,0].to(dev), train_batch[:,1].to(dev))
        loss = loss_func(prediction, label_batch.to(dev))
        # Backpropagate
        loss.backward()
        
        # Update the parameters
        optimizer.step()
        optimizer.zero_grad()
        cum_loss += loss.item()
        clear_output()
        print('training loss at {} batch {}/{}: {:.4f}'.format(epoch, i, len(train_dataloader), loss.item()))



    train_loss = cum_loss / count
    cum_loss = 0.
    count = 0
    for i, (test_batch, label_batch) in enumerate(test_dataloader):
        count = 1 + i
        with torch.no_grad():
            prediction = model(test_batch[:,0].to(dev), test_batch[:,1].to(dev))
            # Count ndcg@20
            loss = loss_func(prediction, label_batch.to(dev))
            cum_loss += loss.item()
            clear_output()
            print('test loss at {} batch {}: {}'.format(epoch,i,loss.item()))
    test_loss = cum_loss / count
    torch.save(model.state_dict(), 'drive/My Drive/news_rec_2020/models/neuFM_epoch-{}_test-{:.4f}'.format(epoch, test_loss))
    print('avg training loss: ', train_loss, ' avg test loss: ',test_loss)

RuntimeError: ignored

In [0]:
model.load_state_dict(torch.load('drive/My Drive/news_rec_2020/models/neuFM_epoch-1_test-0.6931', map_location=torch.device('cpu')))
model.eval()

MatrixFactorization(
  (user_factors): Embedding(42977, 20)
  (item_factors): Embedding(326921, 20)
  (user_biases): Embedding(42977, 1)
  (item_biases): Embedding(326921, 1)
)

In [0]:
dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(dev)

y_pred = []
for test_batch, label_batch in tqdm(test_dataloader):
    with torch.no_grad():
        prediction = model(test_batch[:,0].to(dev), test_batch[:,1].to(dev))
        y_pred.extend(list(prediction.cpu().numpy()))
        

HBox(children=(IntProgress(value=0, max=5895), HTML(value='')))




In [0]:
prediction = test.copy()
prediction['rating'] = y_pred
prediction.sort_values(['userId', 'itemId'], ascending=(True, False), inplace=True)
prediction.drop(columns=['rating'], inplace=True)
prediction.head()

Unnamed: 0,userId,itemId
25,1,326752
335,1,325660
571,1,324038
275,1,322527
147,1,321847


In [0]:
ndcg_score(test, prediction)

0.16659483647983728

In [0]:
prediction.to_csv('submission_with_neuFM.csv', index=False)

In [0]:
kaggle competitions submit -c recsys-iad-challenge -f submission_with_neuFM.csv -m "NeuFM"

SyntaxError: ignored