In [29]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from tqdm import tqdm

In [2]:
data = pd.read_csv('data/cleaned_data.csv', index_col=0)
data = data[data.session_id <= 20000]
y = data['not_skipped']
X = data.drop(['skip_1', 'skip_2','skip_3', 'not_skipped', 'hist_user_behavior_reason_end', 'mode'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [116]:
base_learners = [DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=3)]
num_estimators = [50, 100, 150]

best_score = 0
best_model = None
for learner in base_learners:
    for num_estimator in num_estimators:
        model = AdaBoostClassifier(base_estimator=learner, n_estimators=num_estimator)
        cv = cross_validate(model, X, y, cv=5)
        val_score = cv['test_score'].mean()
        print(f'Base Learner {learner}, Number of Estimators: {num_estimator}')
        print(f'Average Validation Set Accuracy: {val_score}')
        print()
        if  val_score > best_score:
            best_score = val_score
            best_model = model

Base Learner DecisionTreeClassifier(max_depth=1), Number of Estimators: 50
Average Validation Set Accuracy: 0.676054493210945

Base Learner DecisionTreeClassifier(max_depth=1), Number of Estimators: 100
Average Validation Set Accuracy: 0.6765011497308066

Base Learner DecisionTreeClassifier(max_depth=1), Number of Estimators: 150
Average Validation Set Accuracy: 0.6762053899601713

Base Learner DecisionTreeClassifier(max_depth=3), Number of Estimators: 50
Average Validation Set Accuracy: 0.6793561447371882

Base Learner DecisionTreeClassifier(max_depth=3), Number of Estimators: 100
Average Validation Set Accuracy: 0.6153694119071731

Base Learner DecisionTreeClassifier(max_depth=3), Number of Estimators: 150
Average Validation Set Accuracy: 0.6147295995917379



In [117]:
best_model.fit(X_train, y_train)
pred = best_model.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()

print(best_model)
print('Accuracy: ', (tn+tp)/(tp+tn+fp+fn))
print(f'Recall: {tp/(tp+fn)}, Precision: {tp/(tp+fp)}')

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=3))
Accuracy:  0.7965426495086796
Recall: 0.5677045087384345, Precision: 0.7527017817155097


In [118]:
class RecurrentDataset(Dataset):
    def __init__(self, data, idxs):
        self.data = data
        self.idxs =  idxs
    
    def __len__(self):
        return len(self.idxs)
    
    def __getitem__(self, idx):
        session = data[data.session_id==self.idxs[idx]]
        y = session['not_skipped']
        x = session.drop(['skip_1', 'skip_2','skip_3', 'not_skipped', 'session_id', 'hist_user_behavior_reason_end', 'mode'], axis=1)
        return torch.Tensor(np.array(x)), torch.Tensor(np.array(y))

        
        

In [121]:
idxs = [i for i in range(int(data.session_id.max()) + 1)]
train_val_idxs, test_idxs = train_test_split(idxs)
train_idxs, val_idxs = train_test_split(train_val_idxs)
train_data = RecurrentDataset(data, train_idxs)
val_data = RecurrentDataset(data, val_idxs)
test_data = RecurrentDataset(data, test_idxs)

train_dl = DataLoader(train_data, num_workers=8)
val_dl = DataLoader(val_data, num_workers=8)
test_dl = DataLoader(test_data, num_workers=8)

In [102]:
class ManyToManyRNN(nn.Module):
    def __init__(self, num_recurrent, hidden_size, lr=.0001):
        super(ManyToManyRNN, self).__init__()
        self.rnn = nn.RNN(41, hidden_size, num_layers=num_recurrent)
        
        self.ffn = nn.Linear(hidden_size, 1)
        self.criterion = nn.BCEWithLogitsLoss()
        self.optim = torch.optim.Adam(self.parameters(), lr=lr)
        
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.to(self.device)
        
    def forward(self, x):
        embeds, _inner = self.rnn(x)
        return self.ffn(embeds)
    
    def train(self, dl, epochs=10):
        for epoch in range(epochs):
            for x, y in tqdm(dl):
                x, y = x.to(self.device), y.to(self.device) 
                self.optim.zero_grad()
                
                # get ouputs, unsqueeze it
                out = self.forward(x)
                out = torch.flatten(out).unsqueeze(0)
                
                # calculate loss, backprop it, then update epoch loss
                loss = self.criterion(out, y)
                loss.backward()
                
                self.optim.step()
                
                
    def evaluate(self, dl, threshold):
        with torch.no_grad():
            tps = 0
            model_pos = 0
            gold_pos = 0
            num_correct = 0
            total_entries = 0
            
            for x, y in dl:
                x = x.to(self.device)
                y = y.numpy().squeeze()
                pred = torch.sigmoid(self(x)).cpu().numpy().squeeze()
                pred[pred> threshold] = 1
                pred[pred<= threshold] = 0
                model_pos += pred.sum()
                gold_pos += y.sum()
                matches = np.zeros(len(pred))
                matches[np.logical_and(pred, y)] = 1
                
                num_correct += np.sum(pred==y)
                tps += matches.sum()
                total_entries += len(y)
                
            model_pos += 1
            prec = tps/model_pos
            recall = tps/gold_pos
            return prec, recall, num_correct/total_entries
        

In [97]:
hidden_sizes = [50, 100, 1000]
nums_rnns = [1, 3, 5]
thresholds = [.15, .25, .5]

for hidden_size in hidden_sizes:
    for num_rnns in nums_rnns:
        model = ManyToManyRNN(num_rnns, hidden_size)
        model.train(train_dl, epochs=15)
        for threshold in thresholds:
            prec, recall, acc = model.evaluate(val_dl, threshold)
            print(f"Hidden Size {hidden_size}, Number of Recurrent Layers: {num_rnns}, Decision Threshold: {threshold}")
            print(f'Recall: {recall}, Precision: {prec}, Accuracy: {acc}')

Hidden Size 50, Number of Recurrent Layers: 1, Decision Threshold: 0.15
Recall: 1.0, Precision: 0.33306331410796447, Accuracy: 0.3330741064774311
Hidden Size 50, Number of Recurrent Layers: 1, Decision Threshold: 0.25
Recall: 0.5219379317054188, Precision: 0.7675250357653791, Accuracy: 0.7881468520138686
Hidden Size 50, Number of Recurrent Layers: 1, Decision Threshold: 0.5
Recall: 0.4359373479910497, Precision: 0.8322808320950966, Accuracy: 0.7828975081818477
Hidden Size 50, Number of Recurrent Layers: 3, Decision Threshold: 0.15
Recall: 1.0, Precision: 0.33306331410796447, Accuracy: 0.3330741064774311
Hidden Size 50, Number of Recurrent Layers: 3, Decision Threshold: 0.25
Recall: 0.9917307131043875, Precision: 0.33411996066863325, Accuracy: 0.3389715174492077
Hidden Size 50, Number of Recurrent Layers: 3, Decision Threshold: 0.5
Recall: 0.0, Precision: 0.0, Accuracy: 0.6669258935225689
Hidden Size 50, Number of Recurrent Layers: 5, Decision Threshold: 0.15
Recall: 1.0, Precision: 0.3

In [126]:
model = ManyToManyRNN(1, 100)
model.train(train_dl, epochs=30)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11250/11250 [00:37<00:00, 300.70it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11250/11250 [00:36<00:00, 307.35it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11250/11250 [00:36<00:00, 311.94it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11250/11250 [00:36<00:00, 309.76it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11250/11250 [00:35<00:00, 317.74it/s]
100%|██████████████████████████████████████████████████████████████████████

In [128]:
model.evaluate(test_dl, .3)

(0.692079940784604, 0.616573856975381, 0.7826108016468879)