In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn 
import torch.nn.functional as F

from sklearn.metrics import balanced_accuracy_score

### Data Pre-Processing
---

In [2]:
def train_test_split(df):
    np.random.seed(630)
    msk = np.random.rand(len(df)) < 0.8
    train = df[msk].reset_index()
    valid = df[~msk].reset_index()
    return train, valid

In [3]:
def init_model_and_data(train):
    train_df = encode_data(train, train=None)
    num_users = len(train.user_id.unique())
    num_items = len(train.item_id.unique())
    model = MF(num_users, num_items)
    return model, train_df

In [4]:
def proc_col(col, train_col=None):
    """Encodes a pandas column with continous ids. 
    """
    if train_col is not None:
        uniq = train_col.unique()
    else:
        uniq = col.unique()
    name2idx = {o:i for i,o in enumerate(uniq)}
    return name2idx, np.array([name2idx.get(x, -1) for x in col]), len(uniq)

In [5]:
def encode_data(df, train=None):
    """ Encodes rating data with continous user and movie ids. 
    If train is provided, encodes df with the same encoding as train.
    """
    df = df.copy()
    for col_name in ["user_id", "item_id"]:
        train_col = None
        if train is not None:
            train_col = train[col_name]
        _,col,_ = proc_col(df[col_name], train_col)
        df[col_name] = col
        df = df[df[col_name] >= 0]
    return df

In [47]:
def encode_item_features(df, train=None):
    """ Encodes rating data with continous user and movie ids. 
    If train is provided, encodes df with the same encoding as train.
    """
    df = df.copy()
    for col_name in ["item_feature_id"]:
        train_col = None
        if train is not None:
            train_col = train[col_name]
        _,col,_ = proc_col(df[col_name], train_col)
        df[col_name] = col
        df = df[df[col_name] >= 0]
    return df

In [6]:
def add_samples(df, k, all_users, all_items, user_context_features):
    sample_df = {'user_id':np.empty(df.shape[0]*k),
        'item_id':np.empty(df.shape[0]*k),
        'rating':np.empty(df.shape[0]*k),
        'context_feature_id':np.empty(df.shape[0]*k)}
    for user in all_users:
        split_index = np.where(df[:,0] == user)[0]
        user_df = df[split_index[0]:split_index[-1]+1]
        
        zero_samples = sample_zeros(user_df, k, all_items)
        user_col = np.full(user_df.shape[0]*k, user)
        zero_col = np.zeros(user_df.shape[0]*k)

        context_features = np.random.choice(user_context_features[user_df[0,0]],
         user_df.shape[0]*k)

        # sample_df = {'user_id':user_col, 'item_id':zero_samples, 'rating':zero_col}
        sample_df['user_id'][split_index[0]:split_index[-1]+1] = user_col
        sample_df['item_id'][split_index[0]:split_index[-1]+1] = zero_samples
        sample_df['rating'][split_index[0]:split_index[-1]+1] = zero_col
        sample_df['context_feature_id'][split_index[0]:split_index[-1]+1] = context_features


    return pd.DataFrame(sample_df)

In [7]:
def sample_zeros(user_df, k, all_items):
    sample = []
    
    while len(sample) < k*len(user_df[:,1]):
        raw_samp = np.random.randint(0, len(all_items))
        if raw_samp not in user_df[:,1] and raw_samp not in sample:
            sample.append(raw_samp)

    return sample

In [8]:
training = pd.read_csv('training.csv')
training['rating'] = 1
item_feature = pd.read_csv('item_feature.csv')
test = pd.read_csv('test_kaggle.csv')

train_df = encode_data(df=training)
train = encode_data(df=training).to_numpy()

all_users = pd.unique(train[:,0])
all_items = pd.unique(train[:,1])

user_context_features = train_df.groupby('user_id').agg({'context_feature_id':list}).to_dict()

In [9]:
%%time
negative_samples = add_samples(train, 1, all_users, all_items, user_context_features)

CPU times: user 1min 28s, sys: 441 ms, total: 1min 28s
Wall time: 1min 28s


In [61]:
train_df.groupby('user_id').agg({'context_feature_id':list}).to_dict()

{'context_feature_id': {0: [2, 2, 3, 2, 2, 2],
  1: [2, 2, 3, 2],
  2: [1, 1, 3, 3, 1, 1],
  3: [2, 1, 3, 1],
  4: [1, 1, 1, 1, 1],
  5: [2, 2, 2, 2],
  6: [1, 3, 2, 2],
  7: [1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  8: [2, 2, 2, 2],
  9: [2, 2, 2, 2],
  10: [2, 2, 2, 1, 2, 3, 3, 2, 2, 2, 2],
  11: [1, 1, 1, 1, 1],
  12: [1, 1, 1, 1, 1, 1, 1],
  13: [2,
   2,
   2,
   3,
   3,
   3,
   2,
   2,
   2,
   2,
   3,
   3,
   2,
   2,
   0,
   2,
   2,
   2,
   0,
   2,
   0,
   3,
   2,
   2,
   2,
   2,
   2,
   2,
   2,
   2,
   2,
   2,
   2,
   3,
   2,
   2,
   3,
   3,
   2,
   2,
   2],
  14: [2, 1, 0, 2, 2, 2, 2, 2, 2, 2],
  15: [2, 3, 2, 3],
  16: [2, 3, 2, 2, 2, 3],
  17: [1, 1, 1, 1, 1],
  18: [1, 1, 1, 1, 1, 1],
  19: [2, 2, 2, 2],
  20: [2, 3, 2, 2, 2, 2],
  21: [2, 3, 2, 3, 2, 2],
  22: [2, 2, 2, 2, 3],
  23: [2, 3, 2, 2, 2, 2],
  24: [0, 2, 1, 2, 1, 1, 0, 2],
  25: [2, 3, 2, 2, 2, 2, 3, 2, 2, 2],
  26: [1, 1, 1, 1],
  27: [0, 0, 0, 0],
  28: [1, 1, 1, 1],
  29: [1, 1, 1, 1]

In [53]:
user_context_features

{0: [2, 3],
 1: [2, 3],
 2: [1, 3],
 3: [1, 2, 3],
 4: [1],
 5: [2],
 6: [1, 2, 3],
 7: [1, 2],
 8: [2],
 9: [2],
 10: [1, 2, 3],
 11: [1],
 12: [1],
 13: [0, 2, 3],
 14: [0, 1, 2],
 15: [2, 3],
 16: [2, 3],
 17: [1],
 18: [1],
 19: [2],
 20: [2, 3],
 21: [2, 3],
 22: [2, 3],
 23: [2, 3],
 24: [0, 1, 2],
 25: [2, 3],
 26: [1],
 27: [0],
 28: [1],
 29: [1],
 30: [1, 2],
 31: [2, 3],
 32: [2, 3],
 33: [2, 3],
 34: [2, 3],
 35: [2, 3],
 36: [1, 2],
 37: [2, 3],
 38: [2],
 39: [2, 3],
 40: [2],
 41: [1, 2],
 42: [0, 2],
 43: [1, 2],
 44: [2, 3],
 45: [2],
 46: [2, 3],
 47: [1, 2, 3],
 48: [1],
 49: [1, 2],
 50: [1],
 51: [2, 3],
 52: [2],
 53: [2],
 54: [1],
 55: [2, 3],
 56: [2, 3],
 57: [0, 1, 2, 3],
 58: [0, 2],
 59: [2],
 60: [1],
 61: [1],
 62: [2, 3],
 63: [1, 2],
 64: [2, 3],
 65: [1],
 66: [1, 2],
 67: [2, 3],
 68: [1],
 69: [0, 1],
 70: [1, 2],
 71: [1],
 72: [2],
 73: [1, 2, 3],
 74: [1, 2, 3],
 75: [2],
 76: [1],
 77: [1, 2, 3],
 78: [1, 2],
 79: [2, 3],
 80: [2],
 81: [1, 2, 3]

In [51]:
%%time
train_df_new = pd.concat([negative_samples, train_df])
df_with_item_feature = train_df_new.merge(item_feature, how='left')
train_df_new = encode_item_features(df_with_item_feature)
train_df_new.reset_index(drop=True, inplace=True)

CPU times: user 889 ms, sys: 47.5 ms, total: 937 ms
Wall time: 936 ms


In [52]:
train_df_new

Unnamed: 0,user_id,item_id,rating,context_feature_id,item_feature_id
0,0.0,24415.0,0.0,2.0,0
1,0.0,1758.0,0.0,2.0,1
2,0.0,4525.0,0.0,3.0,2
3,0.0,31023.0,0.0,3.0,3
4,0.0,34966.0,0.0,3.0,1
...,...,...,...,...,...
1940485,169697.0,1074.0,1.0,2.0,15
1940486,169697.0,1074.0,1.0,2.0,15
1940487,169697.0,244.0,1.0,2.0,4
1940488,169697.0,1262.0,1.0,2.0,29


In [17]:
train_df_new.to_feather('training_sampled')

In [11]:
train_df_new = pd.read_feather('training_sampled')
train_df_new

Unnamed: 0,level_0,user_id,item_id,rating,context_feature_id
0,0,0,6712,0,4
1,1,0,15507,0,4
2,2,0,32760,0,4
3,3,0,11321,0,4
4,4,0,33311,0,4
...,...,...,...,...,...
1940485,1940485,169697,1074,1,2
1940486,1940486,169697,1074,1,2
1940487,1940487,169697,244,1,2
1940488,1940488,169697,1262,1,2


In [43]:
# train_df_new = train_df_new.reset_index()
# train_df_new['context_feature_id'] = train_df_new['context_feature_id'].fillna(4)
# train_df_new = train_df_new.astype('int')
# # train_df_new = train_df_new.drop(columns=['level_0', 'index'])
# train_df_new = train_df_new.drop(columns=['index'])

In [13]:
# train_df_new.to_feather('training_sampled')

In [18]:
training_df, valid_df = train_test_split(train_df_new)

### Matrix Factorization Model
---

In [13]:
class MF_bias(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100, seed=630):
        super(MF_bias, self).__init__()
        torch.manual_seed(seed)
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)
        
        self.user_emb.weight.data.uniform_(0,0.05)
        self.item_emb.weight.data.uniform_(0,0.05)
        self.user_bias.weight.data.uniform_(-0.01,0.01)
        self.item_bias.weight.data.uniform_(-0.01,0.01)

    def forward(self, u, v):
        U = self.user_emb(u)
        V = self.item_emb(v)
        b_u = self.user_bias(u).squeeze()
        b_v = self.item_bias(v).squeeze()
        return ((U*V).sum(1) + b_u + b_v)

In [14]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    for i in range(epochs):
        model.train()
        users = torch.LongTensor(training_df.user_id.values)  
        items = torch.LongTensor(training_df.item_id.values) 
        ratings = torch.FloatTensor(training_df.rating.values)  
    
        y_hat = model(users, items)
        loss = F.binary_cross_entropy_with_logits(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        testloss, acc = valid_loss(model)
        print("train loss %.3f valid loss %.3f accuracy %.3f" % (loss.item(), testloss, acc)) 

In [15]:
def valid_loss(model):
    model.eval()
    users = torch.LongTensor(valid_df.user_id.values)  
    items = torch.LongTensor(valid_df.item_id.values) 
    ratings = torch.FloatTensor(valid_df.rating.values)  
    y_hat = model(users, items)
    loss = F.binary_cross_entropy_with_logits(y_hat, ratings)
    preds = (torch.sigmoid(y_hat) > 0.5).float()
    return loss.item(), balanced_accuracy_score(ratings, preds)

In [16]:
num_users = len(train_df_new['user_id'].unique())
num_items = len(train_df_new['item_id'].unique())
model = MF_bias(num_users, num_items, emb_size=100)

In [17]:
train_epocs(model)

train loss 0.694 valid loss 0.686 accuracy 0.503
train loss 0.679 valid loss 0.676 accuracy 0.616
train loss 0.664 valid loss 0.663 accuracy 0.820
train loss 0.646 valid loss 0.646 accuracy 0.856
train loss 0.623 valid loss 0.625 accuracy 0.855
train loss 0.597 valid loss 0.600 accuracy 0.856
train loss 0.568 valid loss 0.572 accuracy 0.856
train loss 0.536 valid loss 0.542 accuracy 0.857
train loss 0.503 valid loss 0.511 accuracy 0.857
train loss 0.469 valid loss 0.481 accuracy 0.857


In [18]:
train_epocs(model, epochs=5, lr=0.001)

train loss 0.435 valid loss 0.478 accuracy 0.857
train loss 0.432 valid loss 0.475 accuracy 0.857
train loss 0.429 valid loss 0.472 accuracy 0.857
train loss 0.426 valid loss 0.469 accuracy 0.857
train loss 0.422 valid loss 0.466 accuracy 0.857


In [19]:
# ============================ #
# HYPER PARAMETER EXPERIMENTATION
# ============================ #
model2 = MF_bias(num_users, num_items, emb_size=200)
train_epocs(model2, epochs=10, lr=0.1)

train loss 0.695 valid loss 0.684 accuracy 0.607
train loss 0.605 valid loss 0.413 accuracy 0.850
train loss 0.293 valid loss 0.442 accuracy 0.847
train loss 0.210 valid loss 0.478 accuracy 0.845
train loss 0.131 valid loss 0.496 accuracy 0.845
train loss 0.072 valid loss 0.514 accuracy 0.849
train loss 0.037 valid loss 0.544 accuracy 0.851
train loss 0.019 valid loss 0.586 accuracy 0.851
train loss 0.010 valid loss 0.639 accuracy 0.852
train loss 0.005 valid loss 0.696 accuracy 0.852


In [20]:
model3 = MF_bias(num_users, num_items, emb_size=200)
train_epocs(model3, epochs=10, lr=0.01)

train loss 0.695 valid loss 0.681 accuracy 0.500
train loss 0.671 valid loss 0.665 accuracy 0.591
train loss 0.646 valid loss 0.643 accuracy 0.843
train loss 0.617 valid loss 0.614 accuracy 0.857
train loss 0.582 valid loss 0.579 accuracy 0.857
train loss 0.541 valid loss 0.541 accuracy 0.856
train loss 0.498 valid loss 0.501 accuracy 0.857
train loss 0.454 valid loss 0.462 accuracy 0.858
train loss 0.411 valid loss 0.429 accuracy 0.858
train loss 0.374 valid loss 0.402 accuracy 0.858


In [21]:
train_epocs(model3, epochs=5, lr=0.005)

train loss 0.342 valid loss 0.390 accuracy 0.858
train loss 0.327 valid loss 0.380 accuracy 0.858
train loss 0.313 valid loss 0.372 accuracy 0.858
train loss 0.300 valid loss 0.366 accuracy 0.858
train loss 0.289 valid loss 0.360 accuracy 0.857


In [22]:
train_epocs(model3, epochs=5, lr=0.005)

train loss 0.279 valid loss 0.356 accuracy 0.857
train loss 0.269 valid loss 0.353 accuracy 0.857
train loss 0.259 valid loss 0.351 accuracy 0.857
train loss 0.250 valid loss 0.349 accuracy 0.857
train loss 0.242 valid loss 0.348 accuracy 0.857


In [23]:
train_epocs(model3, epochs=5, lr=0.005)

train loss 0.234 valid loss 0.347 accuracy 0.858
train loss 0.225 valid loss 0.347 accuracy 0.858
train loss 0.217 valid loss 0.346 accuracy 0.858
train loss 0.209 valid loss 0.346 accuracy 0.858
train loss 0.202 valid loss 0.347 accuracy 0.858


In [24]:
model4 = MF_bias(num_users, num_items, emb_size=500)
train_epocs(model4, epochs=15, lr=0.01)

train loss 0.705 valid loss 0.671 accuracy 0.500
train loss 0.654 valid loss 0.641 accuracy 0.503
train loss 0.610 valid loss 0.603 accuracy 0.855
train loss 0.561 valid loss 0.556 accuracy 0.859
train loss 0.505 valid loss 0.504 accuracy 0.857
train loss 0.446 valid loss 0.455 accuracy 0.857
train loss 0.391 valid loss 0.415 accuracy 0.858
train loss 0.343 valid loss 0.387 accuracy 0.858
train loss 0.306 valid loss 0.372 accuracy 0.858
train loss 0.279 valid loss 0.368 accuracy 0.858
train loss 0.258 valid loss 0.370 accuracy 0.858
train loss 0.241 valid loss 0.374 accuracy 0.858
train loss 0.226 valid loss 0.380 accuracy 0.858
train loss 0.210 valid loss 0.385 accuracy 0.858
train loss 0.194 valid loss 0.390 accuracy 0.858


## Embedding Model

In [30]:
class Embedding_Model(nn.Module):
    def __init__(self, num_context=4, num_item_feature=194, emb_size=100, seed=630):
        super(Embedding_Model, self).__init__()
        torch.manual_seed(seed)
        self.context_emb = nn.Embedding(num_context, emb_size)
        self.item_feature_emb = nn.Embedding(num_item_feature, emb_size)

        self.linear1 = nn.Linear(2*emb_size, 4*emb_size)
        self.linear2 = nn.Linear(4*emb_size, emb_size)
        self.linear3 = nn.Linear(emb_size, 1)

        # self.user_emb.weight.data.uniform_(0,0.05)
        # self.item_emb.weight.data.uniform_(0,0.05)
        # self.user_bias.weight.data.uniform_(-0.01,0.01)
        # self.item_bias.weight.data.uniform_(-0.01,0.01)

    def forward(self, u, v):
        U = self.context_emb(u)
        V = self.item_feature_emb(v)
        X = torch.cat([U,V], 1)
        X = self.linear1(X)
        X = F.relu(X)
        X = self.linear2(X)
        X = F.relu(X)
        X = self.linear3(X)

        return X

In [33]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    for i in range(epochs):
        model.train()
        context_features = torch.LongTensor(training_df.context_feature_id)  
        item_features = torch.LongTensor(training_df.item_feature_id) 
        ratings = torch.FloatTensor(training_df.rating.values)  
    
        y_hat = model(context_features, item_features)
        loss = F.binary_cross_entropy_with_logits(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        testloss, acc = valid_loss(model)
        print("train loss %.3f valid loss %.3f accuracy %.3f" % (loss.item(), testloss, acc)) 

In [34]:
df_with_item_feature = train_df_new.merge(item_feature, how='left')
df_with_item_feature.head()

Unnamed: 0,user_id,item_id,rating,context_feature_id,item_feature_id
0,0.0,24415.0,0.0,2.0,162
1,0.0,1758.0,0.0,2.0,148
2,0.0,4525.0,0.0,3.0,68
3,0.0,31023.0,0.0,3.0,142
4,0.0,34966.0,0.0,3.0,148


In [35]:
training_df, valid_df = train_test_split(df_with_item_feature)

In [36]:
model_embed = Embedding_Model(emb_size=100)
train_epocs(model_embed, epochs=10, lr=0.1)

  context_features = torch.LongTensor(training_df.context_feature_id)


IndexError: index out of range in self

In [41]:
context_features

tensor([2, 2, 3,  ..., 2, 2, 2])

In [42]:
item_features

tensor([162, 148,  68,  ..., 139,  35, 148])

In [40]:
model_embed(context_features, item_features)

IndexError: index out of range in self

In [39]:
context_features = torch.LongTensor(training_df.context_feature_id)  
item_features = torch.LongTensor(training_df.item_feature_id) 

  context_features = torch.LongTensor(training_df.context_feature_id)


In [47]:
df_with_item_feature

Unnamed: 0,user_id,item_id,rating,context_feature_id,item_feature_id
0,0,6712,0,4,148
1,0,15507,0,4,84
2,0,32760,0,4,111
3,0,11321,0,4,2
4,0,33311,0,4,19
...,...,...,...,...,...
1940485,169697,1074,1,2,138
1940486,169697,1074,1,2,138
1940487,169697,244,1,2,139
1940488,169697,1262,1,2,35


In [48]:
test.merge(item_feature, how='left').drop(['level_0'], axis=1)

Unnamed: 0,id,user_id,item_id,context_feature_id
0,0,4,16835,2
1,1,4,22590,3
2,2,4,1978,1
3,3,4,28916,1
4,4,4,14427,2
...,...,...,...,...
381380,381380,200151,1702,1
381381,381381,200151,21632,1
381382,381382,200151,30477,1
381383,381383,200151,30477,1


In [34]:
training_df

Unnamed: 0,index,level_0,user_id,item_id,rating,context_feature_id
0,0,0,0,6712,0,4
1,1,1,0,15507,0,4
2,2,2,0,32760,0,4
3,3,3,0,11321,0,4
4,7,7,1,25550,0,4
...,...,...,...,...,...,...
1552768,1940485,1940485,169697,1074,1,2
1552769,1940486,1940486,169697,1074,1,2
1552770,1940487,1940487,169697,244,1,2
1552771,1940488,1940488,169697,1262,1,2


In [42]:
item_feature[item_feature['item_id'] == 6712]

Unnamed: 0,item_id,item_feature_id
6712,6712,148


In [None]:
training_df, valid_df = train_test_split(train_df_new)

## Predicting for Observed Users

In [27]:
#Testing
test = pd.read_csv('test_kaggle.csv')
test_cold_start = test.copy()

In [28]:
test = encode_data(test, training)

In [29]:
model3.eval()
users = torch.LongTensor(test.user_id.values)  
items = torch.LongTensor(test.item_id.values) 
y_hat = model3(users, items)

In [30]:
final_prediction = torch.sigmoid(y_hat).detach().numpy()
test['rating'] = final_prediction

In [31]:
pred_df = pd.DataFrame({'rating':final_prediction})

In [32]:
new_users = set(test_cold_start['user_id'].unique()) - set(training['user_id'].unique() )
new_items = set(test_cold_start['item_id'].unique()) - set(training['item_id'].unique() )
test_cold_start = test_cold_start[test_cold_start['user_id'].isin(new_users) | test_cold_start['item_id'].isin(new_items)]
test_cold_start['rating'] = 0.5

In [33]:
test_cold_start

Unnamed: 0,id,user_id,item_id,context_feature_id,rating
0,0,4,16835,2,0.5
1,1,4,22590,3,0.5
2,2,4,1978,1,0.5
3,3,4,28916,1,0.5
4,4,4,14427,2,0.5
...,...,...,...,...,...
381380,381380,200151,1702,1,0.5
381381,381381,200151,21632,1,0.5
381382,381382,200151,30477,1,0.5
381383,381383,200151,30477,1,0.5


In [34]:
pred_df = pd.concat([test_cold_start, test])
pred_df = pred_df.sort_values(by=['id']).drop(['id', 'user_id', 'item_id', 'context_feature_id'], axis=1)
pred_df.index = pred_df.index.rename('id')
pred_df.head()

Unnamed: 0_level_0,rating
id,Unnamed: 1_level_1
0,0.5
1,0.5
2,0.5
3,0.5
4,0.5


In [35]:
test

Unnamed: 0,id,user_id,item_id,context_feature_id,rating
30,30,7,1330,1,0.995674
31,31,7,8139,1,0.055683
32,32,7,10370,1,0.023674
33,33,7,21772,1,0.070484
34,34,7,1330,1,0.995674
...,...,...,...,...,...
381370,381370,169696,14615,1,0.075295
381371,381371,169696,601,1,0.986094
381372,381372,169696,6498,1,0.717493
381373,381373,169696,1733,1,0.980622


In [36]:
pred_df

Unnamed: 0_level_0,rating
id,Unnamed: 1_level_1
0,0.5
1,0.5
2,0.5
3,0.5
4,0.5
...,...
381380,0.5
381381,0.5
381382,0.5
381383,0.5


In [37]:
pred_df.to_csv('submission.csv')