In [2]:
import os
import csv
import pandas as pd
import random
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import torch
from torch import nn, optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

DATA_DIR = './archive/ml-100k'
OUTPUT_DIR = './'

class Config:
    device='cpu'
    epochs=40
    seed=17
    train_bs=8
    valid_bs=8
    embedding_dim=20
    lr=1e-2
    num_workers=None       
    verbose_step=100
    
def torch_seed_everything(seed_value=777):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = True

config=Config()
torch_seed_everything(config.seed)

# load data

In [3]:
df = pd.read_csv(os.path.join(DATA_DIR, 'u.data'), sep='\t', header=None)
df.columns = ['user_id', 'item_id', 'rating', 'timestamp']
#df = df.sort_values('timestamp').reset_index(drop=True)
n_user = df.user_id.nunique()
n_item = df.item_id.nunique()
df

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [4]:
print('user_num', n_user)
print('item_num', n_item)

user_num 943
item_num 1682


# split data

In [5]:
train_df, valid_df = train_test_split(df, test_size=0.2, stratify=df['user_id'], random_state=config.seed)
assert train_df.user_id.nunique() == valid_df.user_id.nunique()
print(train_df.shape, valid_df.shape)
#print(valid_df.user_id.nunique())

(80000, 4) (20000, 4)


# Dataset

In [6]:
class MovieLensDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        user_id, item_id, rating, _ = self.df.iloc[idx]
        # index starts with 0
        sample = {"user": user_id - 1, "item": item_id - 1, "rating": rating}
        return sample

# model

In [7]:
class MatrixFactorizationPyTorch(nn.Module):
    def __init__(self, n_user, n_item, k=20):
        """
        n_user: user num
        n_item: item num
        k: embedding dim
        """
        super().__init__()
        self.user_factors = nn.Embedding(n_user, k, sparse=True)
        self.item_factors = nn.Embedding(n_item, k, sparse=True)

    def forward(self, user, item):
        #print(user, item)
        u_emb = self.user_factors(user)
        i_emb = self.item_factors(item)
        # print(u_emb.shape, i_emb.shape)
        # print((u_emb * i_emb).shape)
        # print((u_emb * i_emb).sum(axis=1).shape)
        return (u_emb * i_emb).sum(axis=1)

In [8]:
train_loader = DataLoader(MovieLensDataset(train_df), batch_size=2, shuffle=True,)
next(iter(train_loader))

{'user': tensor([416, 335]),
 'item': tensor([1208,   62]),
 'rating': tensor([3, 2])}

In [15]:
data = next(iter(train_loader))
user, item = data['user'], data['item']
model = MatrixFactorizationPyTorch(n_user, n_item, k=config.embedding_dim)
model(user, item)

tensor([547, 223, 523, 118, 589, 888, 278, 206])


# train

In [17]:
def train_one_epoch(epoch, model, loss_fn, optimizer,
                    train_loader, device, scheduler=None):
    model.train()
    pbar = tqdm(enumerate(train_loader), total=len(train_loader))
    data_cnt = 0
    total_loss = 0.0

    # 学習データをシャッフルしてループ
    for step, data in pbar:
        user = data['user']
        item = data['item']
        rating = data['rating']
        data_cnt += user.shape[0]

        # 勾配リセット
        optimizer.zero_grad()

        #順伝搬、逆伝搬
        outputs = model(user, item)
        #print('outupts', outputs)
        #print(rating)
        loss = loss_fn(outputs,  rating.float())
        #print('loss', loss)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        #print(total_loss)
        if ((step + 1) % config.verbose_step == 0) or ((step + 1) == len(train_loader)):
            description = f'train epoch {epoch} loss: {total_loss / data_cnt:.4f}'
            pbar.set_description(description)

    total_loss = total_loss / len(train_loader)
    print('train loss = {:.4f}'.format(total_loss))

def valid_one_epoch(epoch, model, loss_fn, val_loader, device):

    model.eval()
    total_loss = 0.0
    data_cnt = 0
    #preds = []
    pbar = tqdm(enumerate(val_loader), total=len(val_loader))

    for step, data in pbar:
        user = data['user']
        item = data['item']
        rating = data['rating']
        data_cnt += user.shape[0]

        outputs = model(user, item)
        loss = loss_fn(outputs, rating)
        total_loss += loss
        
        # preds.append(outputs.detach().cpu().numpy())

        if ((step + 1) % config.verbose_step == 0) or ((step + 1) == len(val_loader)):
            description = f'val epoch {epoch} loss: {total_loss / data_cnt:.4f}'
            pbar.set_description(description)
        

    valid_loss = total_loss / len(val_loader)
    print('val loss = {:.4f}'.format(valid_loss))
    return valid_loss 

def run_train(train_loader, valid_loader):
    device = torch.device(config.device)
    model = MatrixFactorizationPyTorch(n_user, n_item, k=config.embedding_dim)
    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=config.lr)
    best_loss=1e10
    for epoch in range(config.epochs):
        train_one_epoch(epoch, model, loss_fn, optimizer, train_loader, device)

        with torch.no_grad():
            val_loss = valid_one_epoch(epoch, model, loss_fn, valid_loader, device)
        
        if best_loss > val_loss:
            best_loss = val_loss
            best_rmse = torch.sqrt(best_loss)
            best_epoch = epoch
            # TODO: save model,  figure
            best_path =  os.path.join(OUTPUT_DIR,f'best_model.bin')
            torch.save({'model':model.state_dict(),},
                           best_path)
    print(f'----- result ------')
    print(f'Best epoch: {epoch}')
    print(f'Best loss: {best_loss}, RMSE: {best_rmse}')

In [11]:
train_loader = DataLoader(MovieLensDataset(train_df), batch_size=config.train_bs, shuffle=True,)
valid_loader = DataLoader(MovieLensDataset(valid_df), batch_size=config.valid_bs, shuffle=False,)
run_train(train_loader, valid_loader)

train epoch 0 loss: 3.2454: 100%|██████████| 10000/10000 [00:14<00:00, 694.64it/s]


train loss = 25.9629


val epoch 0 loss: 2.6013: 100%|██████████| 2500/2500 [00:01<00:00, 1771.44it/s]


val loss = 20.8102


train epoch 1 loss: 2.0078: 100%|██████████| 10000/10000 [00:10<00:00, 957.16it/s]


train loss = 16.0626


val epoch 1 loss: 1.9293: 100%|██████████| 2500/2500 [00:01<00:00, 1822.36it/s]


val loss = 15.4343


train epoch 2 loss: 1.2412: 100%|██████████| 10000/10000 [00:09<00:00, 1021.15it/s]


train loss = 9.9296


val epoch 2 loss: 1.2213: 100%|██████████| 2500/2500 [00:01<00:00, 1589.90it/s]


val loss = 9.7702


train epoch 3 loss: 0.7166: 100%|██████████| 10000/10000 [00:10<00:00, 915.06it/s]


train loss = 5.7325


val epoch 3 loss: 0.8394: 100%|██████████| 2500/2500 [00:01<00:00, 1735.66it/s]


val loss = 6.7152


train epoch 4 loss: 0.4744: 100%|██████████| 10000/10000 [00:10<00:00, 924.78it/s]


train loss = 3.7949


val epoch 4 loss: 0.6483: 100%|██████████| 2500/2500 [00:01<00:00, 1776.02it/s]


val loss = 5.1861


train epoch 5 loss: 0.3518: 100%|██████████| 10000/10000 [00:10<00:00, 962.73it/s]


train loss = 2.8143


val epoch 5 loss: 0.5387: 100%|██████████| 2500/2500 [00:01<00:00, 1616.84it/s]


val loss = 4.3097


train epoch 6 loss: 0.2810: 100%|██████████| 10000/10000 [00:10<00:00, 977.65it/s]


train loss = 2.2483


val epoch 6 loss: 0.4689: 100%|██████████| 2500/2500 [00:01<00:00, 1870.36it/s]


val loss = 3.7510


train epoch 7 loss: 0.2361: 100%|██████████| 10000/10000 [00:10<00:00, 953.43it/s]


train loss = 1.8886


val epoch 7 loss: 0.4210: 100%|██████████| 2500/2500 [00:01<00:00, 1831.30it/s]


val loss = 3.3678


train epoch 8 loss: 0.2055: 100%|██████████| 10000/10000 [00:10<00:00, 988.12it/s]


train loss = 1.6442


val epoch 8 loss: 0.3862: 100%|██████████| 2500/2500 [00:01<00:00, 1545.07it/s]


val loss = 3.0894


train epoch 9 loss: 0.1837: 100%|██████████| 10000/10000 [00:09<00:00, 1017.85it/s]


train loss = 1.4698


val epoch 9 loss: 0.3604: 100%|██████████| 2500/2500 [00:01<00:00, 1621.67it/s]


val loss = 2.8831


train epoch 10 loss: 0.1675: 100%|██████████| 10000/10000 [00:10<00:00, 920.45it/s]


train loss = 1.3397


val epoch 10 loss: 0.3400: 100%|██████████| 2500/2500 [00:01<00:00, 1651.05it/s]


val loss = 2.7202


train epoch 11 loss: 0.1550: 100%|██████████| 10000/10000 [00:10<00:00, 943.43it/s]


train loss = 1.2398


val epoch 11 loss: 0.3238: 100%|██████████| 2500/2500 [00:01<00:00, 1673.10it/s]


val loss = 2.5908


train epoch 12 loss: 0.1451: 100%|██████████| 10000/10000 [00:09<00:00, 1000.55it/s]


train loss = 1.1611


val epoch 12 loss: 0.3108: 100%|██████████| 2500/2500 [00:01<00:00, 1754.40it/s]


val loss = 2.4860


train epoch 13 loss: 0.1371: 100%|██████████| 10000/10000 [00:10<00:00, 957.35it/s]


train loss = 1.0971


val epoch 13 loss: 0.2995: 100%|██████████| 2500/2500 [00:01<00:00, 1797.15it/s]


val loss = 2.3959


train epoch 14 loss: 0.1306: 100%|██████████| 10000/10000 [00:10<00:00, 960.98it/s]


train loss = 1.0449


val epoch 14 loss: 0.2902: 100%|██████████| 2500/2500 [00:01<00:00, 1617.49it/s]


val loss = 2.3217


train epoch 15 loss: 0.1251: 100%|██████████| 10000/10000 [00:10<00:00, 934.21it/s]


train loss = 1.0009


val epoch 15 loss: 0.2825: 100%|██████████| 2500/2500 [00:01<00:00, 1835.25it/s]


val loss = 2.2599


train epoch 16 loss: 0.1205: 100%|██████████| 10000/10000 [00:09<00:00, 1039.15it/s]


train loss = 0.9638


val epoch 16 loss: 0.2756: 100%|██████████| 2500/2500 [00:01<00:00, 1573.16it/s]


val loss = 2.2049


train epoch 17 loss: 0.1165: 100%|██████████| 10000/10000 [00:08<00:00, 1164.98it/s]


train loss = 0.9319


val epoch 17 loss: 0.2697: 100%|██████████| 2500/2500 [00:01<00:00, 1955.92it/s]


val loss = 2.1575


train epoch 18 loss: 0.1131: 100%|██████████| 10000/10000 [00:08<00:00, 1177.13it/s]


train loss = 0.9045


val epoch 18 loss: 0.2643: 100%|██████████| 2500/2500 [00:01<00:00, 1746.80it/s]


val loss = 2.1144


train epoch 19 loss: 0.1101: 100%|██████████| 10000/10000 [00:08<00:00, 1179.15it/s]


train loss = 0.8804


val epoch 19 loss: 0.2597: 100%|██████████| 2500/2500 [00:01<00:00, 2008.13it/s]


val loss = 2.0779


train epoch 20 loss: 0.1074: 100%|██████████| 10000/10000 [00:08<00:00, 1186.35it/s]


train loss = 0.8591


val epoch 20 loss: 0.2555: 100%|██████████| 2500/2500 [00:01<00:00, 1990.90it/s]


val loss = 2.0438


train epoch 21 loss: 0.1050: 100%|██████████| 10000/10000 [00:08<00:00, 1149.09it/s]


train loss = 0.8402


val epoch 21 loss: 0.2520: 100%|██████████| 2500/2500 [00:01<00:00, 1703.42it/s]


val loss = 2.0161


train epoch 22 loss: 0.1029: 100%|██████████| 10000/10000 [00:08<00:00, 1174.06it/s]


train loss = 0.8233


val epoch 22 loss: 0.2486: 100%|██████████| 2500/2500 [00:01<00:00, 1988.62it/s]


val loss = 1.9887


train epoch 23 loss: 0.1010: 100%|██████████| 10000/10000 [00:08<00:00, 1167.45it/s]


train loss = 0.8081


val epoch 23 loss: 0.2457: 100%|██████████| 2500/2500 [00:01<00:00, 1850.36it/s]


val loss = 1.9657


train epoch 24 loss: 0.0993: 100%|██████████| 10000/10000 [00:08<00:00, 1186.26it/s]


train loss = 0.7942


val epoch 24 loss: 0.2428: 100%|██████████| 2500/2500 [00:01<00:00, 1993.62it/s]


val loss = 1.9425


train epoch 25 loss: 0.0977: 100%|██████████| 10000/10000 [00:08<00:00, 1180.84it/s]


train loss = 0.7817


val epoch 25 loss: 0.2405: 100%|██████████| 2500/2500 [00:01<00:00, 1891.79it/s]


val loss = 1.9236


train epoch 26 loss: 0.0963: 100%|██████████| 10000/10000 [00:08<00:00, 1161.69it/s]


train loss = 0.7701


val epoch 26 loss: 0.2383: 100%|██████████| 2500/2500 [00:01<00:00, 1990.94it/s]


val loss = 1.9063


train epoch 27 loss: 0.0949: 100%|██████████| 10000/10000 [00:08<00:00, 1129.87it/s]


train loss = 0.7595


val epoch 27 loss: 0.2362: 100%|██████████| 2500/2500 [00:01<00:00, 1674.65it/s]


val loss = 1.8896


train epoch 28 loss: 0.0937: 100%|██████████| 10000/10000 [00:08<00:00, 1143.29it/s]


train loss = 0.7498


val epoch 28 loss: 0.2342: 100%|██████████| 2500/2500 [00:01<00:00, 1981.43it/s]


val loss = 1.8739


train epoch 29 loss: 0.0926: 100%|██████████| 10000/10000 [00:08<00:00, 1189.18it/s]


train loss = 0.7407


val epoch 29 loss: 0.2326: 100%|██████████| 2500/2500 [00:01<00:00, 1999.75it/s]


val loss = 1.8611


train epoch 30 loss: 0.0915: 100%|██████████| 10000/10000 [00:08<00:00, 1173.15it/s]


train loss = 0.7324


val epoch 30 loss: 0.2310: 100%|██████████| 2500/2500 [00:01<00:00, 1853.00it/s]


val loss = 1.8478


train epoch 31 loss: 0.0906: 100%|██████████| 10000/10000 [00:08<00:00, 1177.88it/s]


train loss = 0.7244


val epoch 31 loss: 0.2295: 100%|██████████| 2500/2500 [00:01<00:00, 1982.76it/s]


val loss = 1.8356


train epoch 32 loss: 0.0896: 100%|██████████| 10000/10000 [00:08<00:00, 1182.58it/s]


train loss = 0.7170


val epoch 32 loss: 0.2281: 100%|██████████| 2500/2500 [00:01<00:00, 1916.84it/s]


val loss = 1.8250


train epoch 33 loss: 0.0887: 100%|██████████| 10000/10000 [00:08<00:00, 1151.23it/s]


train loss = 0.7100


val epoch 33 loss: 0.2269: 100%|██████████| 2500/2500 [00:01<00:00, 1996.91it/s]


val loss = 1.8150


train epoch 34 loss: 0.0879: 100%|██████████| 10000/10000 [00:08<00:00, 1189.91it/s]


train loss = 0.7032


val epoch 34 loss: 0.2256: 100%|██████████| 2500/2500 [00:01<00:00, 2003.91it/s]


val loss = 1.8047


train epoch 35 loss: 0.0872: 100%|██████████| 10000/10000 [00:09<00:00, 1104.24it/s]


train loss = 0.6975


val epoch 35 loss: 0.2245: 100%|██████████| 2500/2500 [00:01<00:00, 1918.75it/s]


val loss = 1.7963


train epoch 36 loss: 0.0864: 100%|██████████| 10000/10000 [00:08<00:00, 1185.89it/s]


train loss = 0.6914


val epoch 36 loss: 0.2235: 100%|██████████| 2500/2500 [00:01<00:00, 2000.75it/s]


val loss = 1.7881


train epoch 37 loss: 0.0857: 100%|██████████| 10000/10000 [00:08<00:00, 1172.77it/s]


train loss = 0.6858


val epoch 37 loss: 0.2228: 100%|██████████| 2500/2500 [00:01<00:00, 1836.39it/s]


val loss = 1.7821


train epoch 38 loss: 0.0851: 100%|██████████| 10000/10000 [00:08<00:00, 1174.52it/s]


train loss = 0.6807


val epoch 38 loss: 0.2216: 100%|██████████| 2500/2500 [00:01<00:00, 1989.12it/s]


val loss = 1.7732


train epoch 39 loss: 0.0845: 100%|██████████| 10000/10000 [00:08<00:00, 1186.32it/s]


train loss = 0.6757


val epoch 39 loss: 0.2209: 100%|██████████| 2500/2500 [00:01<00:00, 1996.06it/s]

val loss = 1.7674
----- result ------
Best epoch: 39
Best loss: 1.7673780918121338, RMSE: 1.329427719116211





# get recommendation

In [12]:
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
item_df = pd.read_csv(os.path.join(DATA_DIR, 'u.item'), sep='|', encoding="iso-8859-1", usecols=range(5), names=m_cols)
item_df.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995)


In [13]:
def load_model():
    best_path = os.path.join(OUTPUT_DIR, 'best_model.bin')
    model = MatrixFactorizationPyTorch(n_user, n_item, k=config.embedding_dim)
    model.load_state_dict(torch.load(best_path)['model'])
    return model

def predict_rating(rec_df):
    model = load_model()
    model.eval()
    dataloader = DataLoader(MovieLensDataset(rec_df), batch_size=10, shuffle=False,)
    pbar = tqdm(dataloader, total=len(dataloader))
    preds = []
    for data in pbar:
        user_id = data['user']
        item_id = data['item']
        rating = data['rating']

        preds += model(user_id, item_id)

    return torch.stack(preds).detach().numpy()

def recommend_for_user(user_id, rating_df, item_df, top_n=10):
    # Extract data that user have never seen.
    rec_df = rating_df.query("user_id != @user_id")
    rec_df['user_id'] = user_id
    rec_df = rec_df.drop_duplicates(subset=['user_id','item_id'])

    # predict rating
    rec_df['rating'] = predict_rating(rec_df)
    
    # clip rating
    # I don't know if this is the right way...
    rec_df = rec_df.query('0.5 <= rating <= 5.5 ')

    # add title column 
    d = dict(zip(item_df.movie_id, item_df.title))
    rec_df['title'] = rec_df['item_id'].map(d)
    rec_df = rec_df.sort_values('rating', ascending=False)

    # show recommended movies
    print('-'*30 + 'recommendations' + '-'*30)
    print(rec_df[['title','rating']].head(top_n))
#     for i, row in rec_df.head(top_n).iterrows():
#         title, rating = row['title'],row['rating']
#         print(f'{i:}: title:{title}  score:{rating}')

    # show movies which user have watched before
    user_df = rating_df.query("user_id == @user_id")
    user_df['title'] = user_df['item_id'].map(d)
    user_df = user_df.sort_values('rating', ascending=False)

    print('-'*30 + 'watched_movies' + '-'*30)
    print(user_df[['title','rating']].head(top_n))
#     for i, row in user_df.head(top_n).iterrows():
#         title, rating = row['title'], row['rating']
#         print(f'{i}: title:{title}  score:{rating}')



In [14]:
user_id = random.choice(df.user_id.values)
print(user_id)
recommend_for_user(user_id, df, item_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_df['user_id'] = user_id


726


100%|██████████| 169/169 [00:00<00:00, 1255.21it/s]

------------------------------recommendations------------------------------
                                  title    rating
1669                  Kalifornia (1993)  5.499646
2039             Man of the Year (1995)  5.496773
608    Island of Dr. Moreau, The (1996)  5.487639
14316              Shooting Fish (1997)  5.444240
2923            Daytrippers, The (1996)  5.411543
327              MatchMaker, The (1997)  5.379460
24927                      Bliss (1997)  5.372055
4156                 Bulletproof (1996)  5.363232
511                   Home Alone (1990)  5.349087
22974                Schizopolis (1996)  5.335552
------------------------------watched_movies------------------------------
                                                   title  rating
97778                                       Bogus (1996)       5
89183                                   Liar Liar (1997)       5
63948  Don't Be a Menace to South Central While Drink...       5
77922                                 B


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_df['title'] = rec_df['item_id'].map(d)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_df['title'] = user_df['item_id'].map(d)
