In [None]:
import io
import os
import math
import copy
import pickle
import zipfile
from textwrap import wrap
from pathlib import Path
from itertools import zip_longest
from collections import defaultdict
from urllib.error import URLError
from urllib.request import urlopen

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch import optim
from torch.nn import functional as F 
from torch.optim.lr_scheduler import _LRScheduler

from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Mount Google Drive into the Colab runtime; the CSV inputs read below live under /gdrive.
from google.colab import drive
drive.mount('/gdrive')
# Make the mounted drive the working directory.
%cd /gdrive

Mounted at /gdrive
/gdrive


In [None]:
# Load the two raw tables. They are joined on `inventory_id` when the target is prepared.
examples_df = pd.read_csv('/gdrive/My Drive/hackathon/data/books/examples.csv')
issues_df = pd.read_csv('/gdrive/My Drive/hackathon/data/books/issues.csv')

In [None]:
# Single seed shared by NumPy (batch shuffling), PyTorch (weight init, dropout)
# and scikit-learn's train/validation split, so runs are repeatable.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
torch.cuda.manual_seed(RANDOM_STATE)

# Target Preparation

In [None]:
# Condition codes that are mapped to the positive class (label 1); every other
# code becomes 0.
# NOTE(review): the business meaning of codes 6544 / 6545 is not visible in this
# notebook — confirm against the data dictionary.
POSITIVE_CONDITION_CODES = [6544, 6545]

# Build the interaction table in one non-mutating chain so the cell is idempotent
# and never assigns into a slice of another frame:
#   1. inner-join examples with issues on `inventory_id`
#   2. drop fully duplicated joined rows
#   3. keep only the (book record, reader, condition) columns
#   4. binarise `condition` with a vectorised membership test instead of a
#      per-row Python lambda
usage_df = (
    examples_df
    .merge(issues_df, how='inner', on='inventory_id')
    .drop_duplicates()
    .loc[:, ['record_id', 'reader_id', 'condition']]
    .assign(
        condition=lambda frame: frame['condition']
        .isin(POSITIVE_CONDITION_CODES)
        .astype('int64')
    )
)

In [None]:
# Preview the first rows to sanity-check the kept columns and the binarised `condition` label.
usage_df.head()

Unnamed: 0,record_id,reader_id,condition
0,1,375196,0
1,1,416672,0
2,1,349736,0
3,1,379610,0
4,1,378839,0


In [None]:
def create_dataset(book_usage):
    """Re-index readers and books to dense 0-based ids and split features from target.

    Parameters
    ----------
    book_usage : DataFrame with `reader_id`, `record_id` and `condition` columns.

    Returns
    -------
    (n_users, n_books) : number of distinct readers and distinct book records.
    (X, y) : `X` holds the dense indices in columns `user_id` and `movie_id`
        (the latter stores *book* indices despite its name); `y` is the
        `condition` column cast to float32.
    (user_to_index, book_to_index) : dicts mapping each original id to its
        dense index, numbered in order of first appearance.
    """
    def _dense_index(column):
        # unique() keeps first-appearance order, so indices are assigned in that order.
        distinct = column.unique()
        lookup = dict(zip(distinct, range(len(distinct))))
        return distinct.shape[0], lookup, column.map(lookup)

    n_users, user_to_index, user_codes = _dense_index(book_usage.reader_id)
    n_books, book_to_index, book_codes = _dense_index(book_usage.record_id)

    X = pd.DataFrame({'user_id': user_codes, 'movie_id': book_codes})
    y = book_usage['condition'].astype(np.float32)

    sizes = (n_users, n_books)
    mappings = (user_to_index, book_to_index)
    return sizes, (X, y), mappings

In [None]:
# n / m are the embedding table sizes (distinct readers / distinct books).
# The id -> index lookup dicts returned as the third element are discarded here.
(n, m), (X, y), _ = create_dataset(usage_df)
print(f'Embeddings: {n} users, {m} books')
print(f'Dataset shape: {X.shape}')
print(f'Target shape: {y.shape}')

Embeddings: 450509 users, 624908 books
Dataset shape: (10418067, 2)
Target shape: (10418067,)


# Batching and model definition

In [None]:
class ConditionIterator:
    """Single-pass iterator yielding consecutive `(X, y)` mini-batches as NumPy arrays.

    Parameters
    ----------
    X, y : array-likes with the same number of rows; converted with `np.asarray`.
    batch_size : number of rows per batch. The final batch is shorter when the
        row count is not a multiple of `batch_size`.
    shuffle : if True, rows are permuted once at construction time using the
        global NumPy RNG (`np.random.permutation`).

    The iterator is exhausted after one pass (`__iter__` returns `self` and the
    cursor is never reset); build a new instance for each epoch.
    """

    def __init__(self, X, y, batch_size=32, shuffle=True):
        X, y = np.asarray(X), np.asarray(y)

        if shuffle:
            # Apply the same permutation to both arrays so rows stay aligned.
            index = np.random.permutation(X.shape[0])
            X, y = X[index], y[index]

        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        # True division is required here: with `//` the value is floored before
        # `ceil` runs, which silently drops the trailing partial batch.
        self.n_batches = int(math.ceil(X.shape[0] / batch_size))
        self._current = 0

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        """Return the next `(X_batch, y_batch)` pair or raise `StopIteration`."""
        if self._current >= self.n_batches:
            raise StopIteration()
        k = self._current
        self._current += 1
        bs = self.batch_size
        # Slicing past the end is safe and yields the shorter final batch.
        return self.X[k*bs:(k + 1)*bs], self.y[k*bs:(k + 1)*bs]

In [None]:
def batches(X, y, bs=32, shuffle=True):
    """Yield `(features, targets)` mini-batches as torch tensors.

    Wraps `ConditionIterator`: each feature batch becomes a `LongTensor`
    (the columns are used as embedding indices), each target batch a
    `FloatTensor` reshaped to a column vector of shape `(batch, 1)`.
    """
    numpy_batches = ConditionIterator(X, y, bs, shuffle)
    for features, targets in numpy_batches:
        feature_tensor = torch.LongTensor(features)
        target_tensor = torch.FloatTensor(targets)
        yield feature_tensor, target_tensor.view(-1, 1)

In [None]:
# Smoke test: pull a single shuffled batch of four rows and print the
# (user index, book index) pairs together with their targets.
for x_batch, y_batch in batches(X, y, bs=4):
    print(x_batch)
    print(y_batch)
    break

tensor([[ 21493,  27482],
        [142822,   6351],
        [409994, 293722],
        [161283, 548785]])
tensor([[1.],
        [0.],
        [1.],
        [0.]])


In [None]:
class EmbeddingNet(nn.Module):
    """Embedding-based binary classifier over (user, book) index pairs.

    A user embedding and a book embedding (each `n_factors` wide) are
    concatenated, passed through dropout and a stack of Linear -> ReLU
    (-> Dropout) layers, and projected to one sigmoid output.

    Parameters
    ----------
    n_users, n_books : sizes of the two embedding tables.
    n_factors : width of each embedding vector.
    embedding_dropout : dropout rate applied to the concatenated embeddings.
    hidden : a single layer width or a sequence of widths.
    dropouts : a single rate or a sequence of rates, at most one per hidden
        layer; layers without a matching rate get no dropout.
    """

    def __init__(self, n_users, n_books,
                 n_factors=50, embedding_dropout=0.02,
                 hidden=10, dropouts=0.2):

        super().__init__()
        layer_widths = get_list(hidden)
        drop_rates = get_list(dropouts)
        last_width = layer_widths[-1]

        # Modules are created in the order u, m, hidden stack, fc so that the
        # random initialisation consumes the RNG in a fixed sequence.
        self.u = nn.Embedding(n_users, n_factors)
        self.m = nn.Embedding(n_books, n_factors)
        self.dropout = nn.Dropout(embedding_dropout)
        self.hidden = nn.Sequential(
            *self._hidden_stack(n_factors * 2, layer_widths, drop_rates))
        self.fc = nn.Linear(last_width, 1)
        self._init()

    @staticmethod
    def _hidden_stack(n_in, layer_widths, drop_rates):
        """Return the list of modules that make up the hidden MLP."""
        assert len(drop_rates) <= len(layer_widths)

        stack = []
        width_in = n_in
        # zip_longest pads missing dropout rates with None.
        for width_out, rate in zip_longest(layer_widths, drop_rates):
            stack.append(nn.Linear(width_in, width_out))
            stack.append(nn.ReLU())
            if rate is not None and rate > 0.:
                stack.append(nn.Dropout(rate))
            width_in = width_out
        return stack

    def forward(self, users, books):
        """Score a batch of index pairs.

        `users` and `books` are index tensors of equal length (presumably 1-D
        LongTensors — the embeddings are concatenated along dim 1). Returns the
        sigmoid of the final linear layer, one value per row.
        """
        user_vecs = self.u(users)
        book_vecs = self.m(books)
        joined = torch.cat([user_vecs, book_vecs], dim=1)
        activations = self.hidden(self.dropout(joined))
        return torch.sigmoid(self.fc(activations))

    def _init(self):
        """Re-initialise embeddings (uniform +-0.05) and Linear layers (Xavier, bias 0.01)."""
        def init(m):
            if type(m) == nn.Linear:
                torch.nn.init.xavier_uniform_(m.weight)
                m.bias.data.fill_(0.01)

        for table in (self.u, self.m):
            table.weight.data.uniform_(-0.05, 0.05)
        self.hidden.apply(init)
        init(self.fc)
    
    
def get_list(n):
    """Normalise a layer configuration to a list.

    A single real number (Python or NumPy scalar) becomes a one-element list;
    any other iterable is materialised with `list()`.

    Raises
    ------
    TypeError
        If `n` is a string / bytes object or is neither a number nor iterable.
    """
    import numbers  # local import keeps this helper self-contained

    message = 'layers configuration should be a single number or a list of numbers'

    # numbers.Real also matches NumPy scalar types such as np.int64, which the
    # plain (int, float) check rejected.
    if isinstance(n, numbers.Real):
        return [n]
    # Strings are iterable but would be split into characters, which is never
    # a valid layer configuration.
    if isinstance(n, (str, bytes)):
        raise TypeError(message)
    if hasattr(n, '__iter__'):
        return list(n)
    raise TypeError(message)

# Training loop

In [None]:
# Hold out 40% of the interactions for validation; the split is seeded with RANDOM_STATE.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.4, random_state=RANDOM_STATE)
# Both dicts are keyed by phase name ('train' / 'val') and read by the training loop.
datasets = {'train': (X_train, y_train), 'val': (X_valid, y_valid)}
dataset_sizes = {'train': len(X_train), 'val': len(X_valid)}

In [None]:
# 25-dimensional embeddings per reader and per book (50 concatenated inputs),
# followed by two hidden layers of 50 units, each with dropout 0.5.
net = EmbeddingNet(
    n_users=n, n_books=m, 
    n_factors=25, hidden=[50, 50], 
    embedding_dropout=0.05, dropouts=[0.5, 0.5])

In [None]:
# Display the module tree to verify the embedding and layer sizes.
net

EmbeddingNet(
  (u): Embedding(450509, 25)
  (m): Embedding(624908, 25)
  (dropout): Dropout(p=0.05, inplace=False)
  (hidden): Sequential(
    (0): Linear(in_features=50, out_features=50, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=50, out_features=50, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.5, inplace=False)
  )
  (fc): Linear(in_features=50, out_features=1, bias=True)
)

In [None]:
# Optimiser hyper-parameters.
lr = 1e-3
wd = 1e-5  # passed to Adam as weight_decay
bs = 2000  # mini-batch size
n_epochs = 100
# Early stopping: stop after `patience` consecutive epochs without a new best validation loss.
patience = 10
no_improvements = 0
best_loss = np.inf
best_weights = None  # state_dict snapshot of the best epoch so far
history = []  # one stats dict per completed epoch

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

net.to(device)
# reduction='sum' returns the summed per-sample loss for a batch; the training
# loop divides the accumulated sum by the dataset size to get a per-sample mean.
criterion = nn.BCELoss(reduction='sum')
optimizer = optim.Adam(net.parameters(), lr=lr, weight_decay=wd)
# NOTE(review): `//` floors before math.ceil runs, so ceil is a no-op and this
# counts full batches only; use `/` if a trailing partial batch should be counted.
iterations_per_epoch = int(math.ceil(dataset_sizes['train'] // bs))

In [None]:
# Train with early stopping: each epoch runs a training pass followed by a
# validation pass, snapshots the weights whenever the validation loss improves,
# and stops after `patience` epochs without improvement.
for epoch in range(n_epochs):
    stats = {'epoch': epoch + 1, 'total': n_epochs}

    for phase in ('train', 'val'):
        training = phase == 'train'
        # Put dropout layers into the right mode. Without this the model stays
        # in training mode and validation is scored with dropout still active.
        net.train(training)
        running_loss = 0.0
        n_samples = 0

        for batch in batches(*datasets[phase], shuffle=training, bs=bs):
            x_batch, y_batch = [b.to(device) for b in batch]

            # Autograd is only needed for the training pass.
            with torch.set_grad_enabled(training):
                outputs = net(x_batch[:, 0], x_batch[:, 1])
                loss = criterion(outputs, y_batch)

                if training:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            # `criterion` sums over the batch, so accumulate both the summed
            # loss and the number of samples that actually contributed to it.
            running_loss += loss.item()
            n_samples += y_batch.size(0)

        # Per-sample mean over the samples seen in this pass.
        epoch_loss = running_loss / max(n_samples, 1)
        stats[phase] = epoch_loss

        if phase == 'val':
            if epoch_loss < best_loss:
                print('loss improvement on epoch: %d' % (epoch + 1))
                best_loss = epoch_loss
                # Deep copy: state_dict() returns references to the live tensors.
                best_weights = copy.deepcopy(net.state_dict())
                no_improvements = 0
            else:
                no_improvements += 1

    history.append(stats)
    print('[{epoch:03d}/{total:03d}] train: {train:.4f} - val: {val:.4f}'.format(**stats))
    if no_improvements >= patience:
        print('early stopping after epoch {epoch:03d}'.format(**stats))
        break

loss improvement on epoch: 1
[001/100] train: 0.3832 - val: 0.3557
[002/100] train: 0.3253 - val: 0.3576
[003/100] train: 0.3058 - val: 0.3721
[004/100] train: 0.2914 - val: 0.3882
[005/100] train: 0.2803 - val: 0.4079
[006/100] train: 0.2718 - val: 0.4355
[007/100] train: 0.2652 - val: 0.4585
[008/100] train: 0.2599 - val: 0.4764
[009/100] train: 0.2554 - val: 0.4887
[010/100] train: 0.2516 - val: 0.5155
[011/100] train: 0.2480 - val: 0.5244
early stopping after epoch 011


In [None]:
# Restore the weights of the epoch with the lowest validation loss.
# `load_state_dict` returns a report of missing/unexpected keys, not the module,
# so the model itself is bound to `best_model` explicitly.
if best_weights is not None:
    net.load_state_dict(best_weights)
best_model = net
# Save in inference mode so dropout is disabled when the pickled module is loaded.
best_model.eval()
torch.save(best_model, '/content/book_model.model')