In [None]:
import io
import os
import math
import copy
import pickle
import zipfile
from textwrap import wrap
from pathlib import Path
from itertools import zip_longest
from collections import defaultdict
from urllib.error import URLError
from urllib.request import urlopen

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch import optim
from torch.nn import functional as F 
from torch.optim.lr_scheduler import _LRScheduler

from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Colab-only setup: mount Google Drive at /gdrive and make it the working
# directory, so the absolute '/gdrive/My Drive/...' paths used below resolve.
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


In [None]:
# Load the raw MegaRelation table from the mounted Drive (semicolon-separated CSV).
df_mr_o = pd.read_csv('/gdrive/My Drive/hackathon/data/clf/MegaRelation_hackaton.csv', sep=";")

In [None]:
# Load the raw Pupil table from the mounted Drive (semicolon-separated CSV).
df_pupil_o = pd.read_csv('/gdrive/My Drive/hackathon/data/clf/Pupil_hackaton.csv', sep=";")

In [None]:
# Inspect the pupil table's column names to decide which columns to keep.
lllll = df_pupil_o.columns.to_list()
print(lllll)

['id_ученика', 'возраст', 'пол']


In [None]:
# Keep only the student identifier; the age and sex columns are not used below.
df_pupil = df_pupil_o.drop(columns=['возраст', 'пол'])

In [None]:
# Give the single remaining pupil column an ASCII name (positional rename).
df_pupil.columns = ['id_student']

In [None]:
# Discard every enrolment column that is not used below; a new frame is
# returned, so the raw table `df_mr_o` stays untouched.
df_mr = df_mr_o.drop(
    columns=[
        'id_зачисления',
        'Дата_создания_записи',
        'id_организации',
        'id_заявления',
        'дата_зачисления',
        'дата_отчисления',
        'причина_перевода',
        'предыдущая_запись_зачисления',
        'следующая_запись_зачисления',
        'Плановая_дата_начала_занятий',
        'Плановая_дата_окончания_занятий',
    ],
)

In [None]:
# Rename the three remaining columns positionally.
# NOTE(review): this relies on the column order of the source CSV; the original
# column names are not visible here -- confirm the order matches.
df_mr.columns = ['status','id_student','id_service']

In [None]:
df_mr['status'].unique()

array([ 3., nan,  2.,  1.])

In [None]:
df_mr['status'].value_counts(dropna=False)

2.0    288103
3.0    198300
1.0    161374
NaN     77036
Name: status, dtype: int64

In [None]:
df_pupil.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2742631 entries, 0 to 2742630
Data columns (total 1 columns):
 #   Column      Dtype
---  ------      -----
 0   id_student  int64
dtypes: int64(1)
memory usage: 20.9 MB


In [None]:
df_mr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 724813 entries, 0 to 724812
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   status      647777 non-null  float64
 1   id_student  647777 non-null  float64
 2   id_service  724813 non-null  int64  
dtypes: float64(2), int64(1)
memory usage: 16.6 MB


In [None]:
# Single seed shared by NumPy, PyTorch and (further down) train_test_split.
RANDOM_STATE = 42
# np.random.permutation in the batch iterator draws from this global NumPy state.
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
torch.cuda.manual_seed(RANDOM_STATE)

# Target Preparation

In [None]:
# Build the (status, id_student, id_service) interaction table.
#   * The inner join keeps only enrolment rows whose id_student is present in
#     the pupil table (rows with a missing id_student cannot match).
#   * Exact duplicate rows are removed; the surviving rows keep their original
#     index labels.
#   * The target is binarised in one vectorised step: status == 1 -> 1, every
#     other value (including NaN) -> 0.
# The whole chain returns new objects, so the cell is idempotent and never
# assigns into a sliced frame.
# NOTE(review): the same (id_student, id_service) pair can still appear with
# both labels after binarisation -- decide whether such pairs should be
# collapsed before training.
usage_df = (
    df_pupil
    .merge(df_mr, how='inner', on='id_student')
    .drop_duplicates()
    [['status', 'id_student', 'id_service']]
    .assign(status=lambda frame: frame['status'].eq(1).astype('int64'))
)

In [None]:
usage_df.head()


Unnamed: 0,status,id_student,id_service
0,0,1,40555
1,0,1,144929
2,1,1,144929
3,0,1,144984
5,0,2,40555


In [None]:
def create_dataset(book_usage):
    """Re-index students and services to contiguous integer ids.

    Ids are numbered from 0 in order of first appearance in the frame.

    Args:
        book_usage: DataFrame with `id_student`, `id_service` and `status`
            columns.

    Returns:
        A 3-tuple:
          * (n_users, n_books): number of distinct students / services,
          * (X, y): X is a DataFrame with columns 'user_id' and 'movie_id'
            holding the new ids, y is `status` cast to float32,
          * (user_to_index, book_to_index): dicts mapping original ids to the
            new ids.
    """
    def reindex(column):
        # Distinct values in order of first appearance -> 0..k-1.
        originals = column.unique()
        lookup = dict(zip(originals, range(originals.shape[0])))
        return originals.shape[0], lookup, column.map(lookup)

    n_users, user_to_index, user_codes = reindex(book_usage.id_student)
    n_books, book_to_index, book_codes = reindex(book_usage.id_service)

    X = pd.DataFrame({'user_id': user_codes, 'movie_id': book_codes})
    y = book_usage['status'].astype(np.float32)
    return (n_users, n_books), (X, y), (user_to_index, book_to_index)

In [None]:
# Encode the interaction table; the id -> index mappings are not needed here.
(n, m), (X, y), _ = create_dataset(usage_df)
print(
    f'Embeddings: {n} users, {m} books',
    f'Dataset shape: {X.shape}',
    f'Target shape: {y.shape}',
    sep='\n',
)

Embeddings: 289169 users, 26678 books
Dataset shape: (540736, 2)
Target shape: (540736,)


# Mini-batch iterator

In [None]:
class ConditionIterator:
    """Single-pass iterator that yields consecutive (X, y) mini-batches.

    The inputs are converted to NumPy arrays and, when `shuffle` is true,
    permuted once with the same random index at construction time. Every
    sample is yielded exactly once; the last batch may hold fewer than
    `batch_size` rows.

    Args:
        X: array-like of features, indexed along the first axis.
        y: array-like of targets with the same first-axis length as X.
        batch_size: maximum number of rows per batch.
        shuffle: whether to permute the rows (uses the global NumPy RNG).

    Raises:
        ValueError: if X and y differ in length along the first axis.
    """

    def __init__(self, X, y, batch_size=32, shuffle=True):
        X, y = np.asarray(X), np.asarray(y)

        if X.shape[0] != y.shape[0]:
            raise ValueError(
                'X and y must have the same number of rows, got %d and %d'
                % (X.shape[0], y.shape[0]))

        if shuffle:
            # One shared permutation keeps each row aligned with its target.
            index = np.random.permutation(X.shape[0])
            X, y = X[index], y[index]

        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        # True division *before* the ceiling: floor division here would drop
        # the trailing partial batch (and everything when n < batch_size).
        self.n_batches = math.ceil(X.shape[0] / batch_size)
        self._current = 0

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        """Return the next (X_batch, y_batch) pair or raise StopIteration."""
        if self._current >= self.n_batches:
            raise StopIteration()
        k = self._current
        self._current += 1
        bs = self.batch_size
        # Slicing past the end is safe and yields the short final batch.
        return self.X[k*bs:(k + 1)*bs], self.y[k*bs:(k + 1)*bs]

In [None]:
def batches(X, y, bs=32, shuffle=True):
    """Yield (features, targets) mini-batches as torch tensors.

    Args:
        X: array-like of integer ids, one row per sample.
        y: array-like of targets, one per sample.
        bs: batch size handed to ConditionIterator.
        shuffle: whether ConditionIterator should permute the samples.

    Yields:
        Tuples of (LongTensor features, FloatTensor targets reshaped to a
        column of shape (batch, 1)).
    """
    iterator = ConditionIterator(X, y, bs, shuffle)
    for features, targets in iterator:
        features_t = torch.LongTensor(features)
        targets_t = torch.FloatTensor(targets).view(-1, 1)
        yield features_t, targets_t

In [None]:
# Sanity check: show the first mini-batch of four samples, then stop.
for x_batch, y_batch in batches(X, y, bs=4):
    print(x_batch, y_batch, sep='\n')
    break

tensor([[ 85868,   3846],
        [247721,    493],
        [ 67820,  14322],
        [178434,  23181]])
tensor([[0.],
        [1.],
        [0.],
        [0.]])


In [None]:
class EmbeddingNet(nn.Module):
    """Feed-forward classifier over concatenated user and book embeddings.

    The user and book ids are looked up in two embedding tables, the vectors
    are concatenated, passed through dropout and a stack of
    Linear -> ReLU -> (optional) Dropout blocks, and finally through a single
    linear unit followed by a sigmoid.

    Args:
        n_users: number of rows in the user embedding table.
        n_books: number of rows in the book embedding table.
        n_factors: size of each embedding vector.
        embedding_dropout: dropout rate applied to the concatenated embeddings.
        hidden: a single width or a list of widths of the hidden layers.
        dropouts: a single rate or a list of rates, one per hidden layer in
            order; may be shorter than `hidden` (remaining layers get no
            dropout) but not longer.
    """

    def __init__(self, n_users, n_books,
                 n_factors=20, embedding_dropout=0.02,
                 hidden=10, dropouts=0.2):

        super().__init__()
        hidden = get_list(hidden)
        dropouts = get_list(dropouts)
        # Checked up front, before any parameters are allocated.
        assert len(dropouts) <= len(hidden)
        n_last = hidden[-1]

        def gen_layers(n_in):
            # zip_longest pads the missing dropout rates with None, which
            # means "no dropout after this layer".
            for n_out, rate in zip_longest(hidden, dropouts):
                yield nn.Linear(n_in, n_out)
                yield nn.ReLU()
                if rate is not None and rate > 0.:
                    yield nn.Dropout(rate)
                n_in = n_out

        self.u = nn.Embedding(n_users, n_factors)
        self.m = nn.Embedding(n_books, n_factors)
        self.dropout = nn.Dropout(embedding_dropout)
        # The hidden stack consumes both embeddings side by side.
        self.hidden = nn.Sequential(*gen_layers(n_factors * 2))
        self.fc = nn.Linear(n_last, 1)
        self._init()

    def forward(self, users, books):
        """Return sigmoid scores of shape (batch, 1) for paired id tensors."""
        features = torch.cat([self.u(users), self.m(books)], dim=1)
        x = self.dropout(features)
        x = self.hidden(x)
        out = torch.sigmoid(self.fc(x))
        return out

    def _init(self):
        """Initialise embeddings uniformly and linear layers with Xavier."""
        def init(m):
            # isinstance (not an exact type comparison) so that subclasses of
            # nn.Linear are initialised as well.
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                nn.init.constant_(m.bias, 0.01)

        nn.init.uniform_(self.u.weight, -0.05, 0.05)
        nn.init.uniform_(self.m.weight, -0.05, 0.05)
        self.hidden.apply(init)
        init(self.fc)
    
    
def get_list(n):
    """Normalise a layer configuration to a list.

    Args:
        n: a single real number (width or dropout rate) or an iterable of them.

    Returns:
        `[n]` for a single number, otherwise `list(n)`.

    Raises:
        TypeError: if `n` is neither a real number nor a non-string iterable.
    """
    import numbers  # local import: only needed for the scalar check

    # numbers.Real covers int/float plus registered scalar types.
    if isinstance(n, numbers.Real):
        return [n]
    # Strings are iterable but would be split into characters -- reject them.
    if hasattr(n, '__iter__') and not isinstance(n, (str, bytes)):
        return list(n)
    raise TypeError('layers configuration should be a single number or a list of numbers')

# Training loop

In [None]:
# Hold out 40% of the interactions for validation (seeded for reproducibility).
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.4, random_state=RANDOM_STATE
)
# Per-phase lookups used by the training loop.
datasets = dict(train=(X_train, y_train), val=(X_valid, y_valid))
dataset_sizes = {
    phase: len(features) for phase, (features, _targets) in datasets.items()
}

In [None]:
# Build the model: 50-dimensional embeddings for n users and m books, three
# hidden layers of 200 units with per-layer dropout of 0.5 / 0.5 / 0.25.
net = EmbeddingNet(
    n_users=n, n_books=m, 
    n_factors=50, hidden=[200, 200, 200], 
    embedding_dropout=0.05, dropouts=[0.5, 0.5, 0.25])

In [None]:
net

EmbeddingNet(
  (u): Embedding(289169, 50)
  (m): Embedding(26678, 50)
  (dropout): Dropout(p=0.05, inplace=False)
  (hidden): Sequential(
    (0): Linear(in_features=100, out_features=200, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=200, out_features=200, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.5, inplace=False)
    (6): Linear(in_features=200, out_features=200, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.25, inplace=False)
  )
  (fc): Linear(in_features=200, out_features=1, bias=True)
)

In [None]:
# Optimisation hyper-parameters.
lr = 1e-5          # Adam learning rate
wd = 1e-5          # Adam weight decay
bs = 2000          # mini-batch size
n_epochs = 25
patience = 5       # epochs without validation improvement before stopping

# Early-stopping bookkeeping, updated by the training loop.
no_improvements = 0
best_loss = np.inf
best_weights = None
history = []

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

net.to(device)
# Summed (not averaged) loss: the training loop divides by the sample count itself.
criterion = nn.BCELoss(reduction='sum')
optimizer = optim.Adam(net.parameters(), lr=lr, weight_decay=wd)
# True division before the ceiling so a trailing partial batch is counted
# (floor division followed by ceil silently rounded down).
iterations_per_epoch = math.ceil(dataset_sizes['train'] / bs)

In [None]:
# Train with early stopping: each epoch runs a 'train' pass followed by a
# 'val' pass, snapshots the weights whenever the validation loss improves,
# and stops after `patience` consecutive epochs without improvement.
for epoch in range(n_epochs):
    stats = {'epoch': epoch + 1, 'total': n_epochs}

    for phase in ('train', 'val'):
        training = phase == 'train'
        # Enable dropout for training and disable it for validation; without
        # this the validation loss is measured with units randomly dropped.
        net.train(training)
        running_loss = 0.0
        n_samples = 0

        for batch in batches(*datasets[phase], shuffle=training, bs=bs):
            x_batch, y_batch = [b.to(device) for b in batch]

            # compute gradients only during 'train' phase
            with torch.set_grad_enabled(training):
                outputs = net(x_batch[:, 0], x_batch[:, 1])
                loss = criterion(outputs, y_batch)

                # don't update weights when in 'val' phase
                if training:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            running_loss += loss.item()
            n_samples += y_batch.size(0)

        # The criterion sums over the batch, so normalise by the number of
        # samples actually processed (guarding against an empty phase).
        epoch_loss = running_loss / max(n_samples, 1)
        stats[phase] = epoch_loss

        # early stopping: save weights of the best model so far
        if phase == 'val':
            if epoch_loss < best_loss:
                print('loss improvement on epoch: %d' % (epoch + 1))
                best_loss = epoch_loss
                best_weights = copy.deepcopy(net.state_dict())
                no_improvements = 0
            else:
                no_improvements += 1

    history.append(stats)
    print('[{epoch:03d}/{total:03d}] train: {train:.4f} - val: {val:.4f}'.format(**stats))
    if no_improvements >= patience:
        print('early stopping after epoch {epoch:03d}'.format(**stats))
        break

loss improvement on epoch: 1
[001/025] train: 0.6928 - val: 0.6773
loss improvement on epoch: 2
[002/025] train: 0.6628 - val: 0.6487
loss improvement on epoch: 3
[003/025] train: 0.6349 - val: 0.6226
loss improvement on epoch: 4
[004/025] train: 0.6109 - val: 0.6024
loss improvement on epoch: 5
[005/025] train: 0.5956 - val: 0.5917
loss improvement on epoch: 6
[006/025] train: 0.5880 - val: 0.5872
loss improvement on epoch: 7
[007/025] train: 0.5845 - val: 0.5844
loss improvement on epoch: 8
[008/025] train: 0.5812 - val: 0.5808
loss improvement on epoch: 9
[009/025] train: 0.5769 - val: 0.5764
loss improvement on epoch: 10
[010/025] train: 0.5713 - val: 0.5704
loss improvement on epoch: 11
[011/025] train: 0.5639 - val: 0.5623
loss improvement on epoch: 12
[012/025] train: 0.5538 - val: 0.5518
loss improvement on epoch: 13
[013/025] train: 0.5403 - val: 0.5369
loss improvement on epoch: 14
[014/025] train: 0.5221 - val: 0.5177
loss improvement on epoch: 15
[015/025] train: 0.4983 - v

# Saving model

In [None]:
# Restore the best validation snapshot into the network. load_state_dict
# returns a report of missing/unexpected keys, not a model, so bind
# `best_model` to the network itself after loading.
net.load_state_dict(best_weights)
best_model = net
# Persist the best weights (state dict only) for later reuse.
torch.save(best_weights, '/content/best_weights.pt')