In [3]:
import torch.nn as nn
import torch.autograd as autograd
import torch
import pandas as pd
from pathlib import Path
import numpy as np
import torch.optim as optim
import torch.nn.functional as F
%matplotlib inline

# Logistic MF with PyTorch

## read data

In [4]:
PATH = Path("data_hw2")
list(PATH.iterdir())


[PosixPath('data_hw2/default-credit-card-clients.csv'),
 PosixPath('data_hw2/train_books_ratings.csv'),
 PosixPath('data_hw2/valid_books_ratings.csv')]

In [5]:
train = pd.read_csv('data_hw2/train_books_ratings.csv')
val = pd.read_csv('data_hw2/valid_books_ratings.csv')

In [6]:
train

Unnamed: 0,user,item,rating,timestamp
0,A2IIIDRK3PRRZY,0000000116,0,1395619200
1,A9KTKY6BUR8U6,0000013714,0,1357516800
2,A35OP02LIXZ84E,0000477141,0,1399939200
3,A9WX8DK93SN5,000100039X,0,1385683200
4,A36JQ1WC5JQPFQ,000100039X,0,1391990400
...,...,...,...,...
1787552,A2NJYJ4NRI5BMS,1860542859,1,1218931200
1787553,A1D3XQC2DIT,0061703257,1,1208476800
1787554,A29G5X33I71N83,0615680046,1,1356393600
1787555,ASSJ6F0DU3YOR,1570612609,1,968371200


## encoding data

### encode user

In [7]:
train_user_ids = np.sort(np.unique(train.user.values))
train_user_ids[:15]

array(['A000096625CHSNKYTYGZN', 'A00027561NC7JTXEP3EOD',
       'A0002802PGRRB05CR0VT', 'A00031045Q68JAQ1UYT',
       'A00034485ZR6O60DSTB', 'A000474048I5ERWOT4F1',
       'A000546612R3DNRC8556S', 'A00066243R8D11GEHJID0',
       'A00069023W30DWQJNBSPS', 'A00084501WU69W4PMQJWJ',
       'A00085162GMCAJ3DQHUMY', 'A00105581RTVW6FDVGPKJ',
       'A0010876CNE3ILIM9HV0', 'A00109803PZJ91RLT7DPN',
       'A001116435Y409YSMCZKW'], dtype=object)

In [8]:
userid2idx = {o:i for i,o in enumerate(train_user_ids)}
num_users = len(userid2idx)
print(num_users)

1312778


In [9]:
train["user"] = train["user"].apply(lambda x: userid2idx[x])
train.head()

Unnamed: 0,user,item,rating,timestamp
0,527409,0000000116,0,1395619200
1,1059073,0000013714,0,1357516800
2,750064,0000477141,0,1399939200
3,1062362,000100039X,0,1385683200
4,758289,000100039X,0,1391990400


In [10]:
val["user"] = val["user"].apply(lambda x: userid2idx.get(x, -1)) # -1 for users not in training
val.head()

Unnamed: 0,user,item,rating,timestamp
0,-1,000100039X,0,1393286400
1,-1,000100039X,0,1371772800
2,-1,000100039X,0,1372118400
3,-1,000100039X,0,1387152000
4,1093526,000100039X,0,1392336000


In [11]:
val = val[val["user"] >= 0].copy()
val.head()

Unnamed: 0,user,item,rating,timestamp
4,1093526,000100039X,0,1392336000
5,1172263,000100039X,0,1022025600
6,1114879,0001712772,0,1277337600
9,563238,0001714538,0,1367280000
19,6532,0002007770,0,1336867200


### encode item

In [12]:
train_item_ids = np.sort(np.unique(train.item.values))
num_items = len(train_item_ids)
print(num_items)
train_item_ids[:15]

659279


array(['0000000116', '0000013714', '0000477141', '000100039X',
       '0001053655', '0001203010', '0001360000', '0001473123',
       '0001473905', '0001501232', '000161102X', '0001711296',
       '000171130X', '0001712772', '000171287X'], dtype=object)

In [13]:
item2idx = {o:i for i,o in enumerate(train_item_ids)}
train["item"] = train["item"].apply(lambda x: item2idx[x])
val["item"] = val["item"].apply(lambda x: item2idx.get(x, -1))

In [14]:
val = val[val["item"] >= 0].copy()
val.head()

Unnamed: 0,user,item,rating,timestamp
4,1093526,3,0,1392336000
5,1172263,3,0,1022025600
6,1114879,13,0,1277337600
19,6532,34,0,1336867200
24,416115,34,0,1390176000


## MF with bias

In [15]:
class MF_bias(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100):
        super(MF_bias, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)
        # init 
        self.user_emb.weight.data.uniform_(0,0.05)
        self.item_emb.weight.data.uniform_(0,0.05)
        self.user_bias.weight.data.uniform_(-0.01,0.01)
        self.item_bias.weight.data.uniform_(-0.01,0.01)
        
    def forward(self, u, v):
        U = self.user_emb(u)
        V = self.item_emb(v)
        b_u = self.user_bias(u).squeeze()
        b_v = self.item_bias(v).squeeze()
        return torch.sigmoid((U*V).sum(1) +  b_u  + b_v)

In [16]:
def valid_loss(model):
    model.eval()
    users = torch.LongTensor(val.user.values) # .cuda()
    items = torch.LongTensor(val.item.values) #.cuda()
    ratings = torch.FloatTensor(val.rating.values) #.cuda()
    y_hat = model(users, items)
    loss = F.binary_cross_entropy(y_hat, ratings)
    return loss.item()

In [17]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    for i in range(epochs):
        model.train()
        users = torch.LongTensor(train.user.values)  #.cuda()
        items = torch.LongTensor(train.item.values) #.cuda()
        ratings = torch.FloatTensor(train.rating.values)  #.cuda()
    
        y_hat = model(users, items)
        loss = F.binary_cross_entropy(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        testloss = valid_loss(model)
        print("train loss %.3f valid loss %.3f" % (loss.item(), testloss)) 

In [None]:
def valid_metrics(model, valid_df):
    """Computes validation loss and accuracy"""
    model.eval()
    ### BEGIN SOLUTION
    users = torch.LongTensor(valid_df.user.values) # .cuda()
    items = torch.LongTensor(valid_df.item.values) #.cuda()
    ratings = torch.FloatTensor(valid_df.rating.values) #.cuda()
    y_hat = model(users, items)
    valid_loss = F.binary_cross_entropy(y_hat, ratings)
    valid_loss = valid_loss.detach().numpy()
    valid_loss = valid_loss.item()
    y_hat = y_hat > 0.5#tensor of true or false
    valid_acc = (ratings == y_hat).sum().item() / ratings.size(0)
    ### END SOLUTION
    return valid_loss, valid_acc

In [18]:
model = MF_bias(num_users, num_items, emb_size=100) #.cuda()

In [19]:
train_epocs(model, epochs=15, lr=0.12, wd=1e-5)

train loss 0.694 valid loss 0.778
train loss 0.713 valid loss 0.633
train loss 0.618 valid loss 0.642
train loss 0.649 valid loss 0.676
train loss 0.685 valid loss 0.649
train loss 0.660 valid loss 0.621
train loss 0.630 valid loss 0.613
train loss 0.610 valid loss 0.619
train loss 0.603 valid loss 0.620
train loss 0.602 valid loss 0.614
train loss 0.603 valid loss 0.611
train loss 0.611 valid loss 0.614
train loss 0.622 valid loss 0.620
train loss 0.630 valid loss 0.622
train loss 0.631 valid loss 0.618


In [20]:
class MF(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100, seed=23):
        super(MF, self).__init__()
        torch.manual_seed(seed)
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)
        # init 
        self.user_emb.weight.data.uniform_(0,0.05)
        self.item_emb.weight.data.uniform_(0,0.05)
        self.user_bias.weight.data.uniform_(-0.01,0.01)
        self.item_bias.weight.data.uniform_(-0.01,0.01)

    def forward(self, u, v):
        U = self.user_emb(u)
        V = self.item_emb(v)
        b_u = self.user_bias(u).squeeze()
        b_v = self.item_bias(v).squeeze()
        return torch.sigmoid((U*V).sum(1) +  b_u  + b_v)
    
def train_one_epoch(model, train_df, optimizer):
    """ Trains the model for one epoch"""
    model.train()
    ### BEGIN SOLUTION
    users = torch.LongTensor(train_df.user.values)  #.cuda()#64 bit int
    items = torch.LongTensor(train_df.item.values) #.cuda()
    ratings = torch.FloatTensor(train_df.rating.values)  #.cuda()
    y_hat = model(users, items)
    loss = F.binary_cross_entropy(y_hat, ratings)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    ### END SOLUTION
    return loss.item()


def valid_metrics(model, valid_df):
    """Computes validation loss and accuracy"""
    model.eval()
    ### BEGIN SOLUTION
    users = torch.LongTensor(valid_df.user.values) # .cuda()
    items = torch.LongTensor(valid_df.item.values) #.cuda()
    ratings = torch.FloatTensor(valid_df.rating.values) #.cuda()
    y_hat = model(users, items)
    valid_loss = F.binary_cross_entropy(y_hat, ratings)
    valid_loss = valid_loss.detach().numpy()
    valid_loss = valid_loss.item()
    y_hat = y_hat > 0.5#tensor of true or false
    valid_acc = (ratings == y_hat).sum().item() / ratings.size(0)
    ### END SOLUTION
    return valid_loss, valid_acc


def training(model, train_df, valid_df, epochs=10, lr=0.01, wd=0.0):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    for i in range(epochs):
        train_loss = train_one_epoch(model, train_df, optimizer)
        valid_loss, valid_acc = valid_metrics(model, valid_df) 
        print("train loss %.3f valid loss %.3f valid acc %.3f" % (train_loss, valid_loss, valid_acc)) 



In [21]:
training(model, epochs=15, lr=0.12, wd=1e-5)

TypeError: training() missing 2 required positional arguments: 'train_df' and 'valid_df'