In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pickle
import os
import sys
sys.path.append('..')

In [3]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Subset
from tqdm import tqdm, trange

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegressionCV

In [5]:
from models.cnn import MyAlexNet
from models.resnet import ResNetBaseline
from modules.my_torch.helpers import train_one_epoch, eval, test, test_binary, test_regression, _test
from modules.eval.eval import dataset_train_test
from dataset import AccelLaughterDataset
from constants import cloud_data_path

In [6]:
# Read the examples table and keep only the rows whose 'condition' is 'video'.
examples = (
    pd.read_csv('../dataset/computational_examples.csv')
    .loc[lambda frame: frame['condition'] == 'video']
)

# Build the AccelLaughterDataset from the filtered examples and the pickled
# accel file located under cloud_data_path.
accel_ds_path = os.path.join(cloud_data_path, 'accel', 'accel_ds.pkl')
ds = AccelLaughterDataset(
    examples,
    accel_ds_path,
    label='intensity',
    example_len=60,
)

loaded 672 examples
467 have accel


In [7]:
# Preview the first rows of the 'video'-condition examples table.
examples.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,person,cam,hit_id,condition,calibration,hash,ini_time,end_time,...,gt_offset,gt_laughter,is_laughter,confidence,intensity,attempt,pressed_key,onset,offset,rating_hash
0,0,0,25,1,9c45e4f0c5442e796eb93e73e94dc6c2dfca7b9c4c54ff...,video,False,1170917790b51bc5a8dacacc4d8ed8c410b7ea6bb7ea4b...,7360.29,7361.54,...,4.420238,True,True,7,4,0,True,3.33667,6.639973,7af591213b827db95c12c56e76e0b1fe518f2088d11aad...
1,1947,1947,25,1,bff9b86d833a595e6fe5a54f45093fa168cda45db1143e...,video,False,1170917790b51bc5a8dacacc4d8ed8c410b7ea6bb7ea4b...,7360.29,7361.54,...,4.420238,True,True,1,6,0,True,2.569236,6.639973,25df21dc0f25e11a7c4aba77e502269d42a7bb548044f2...
6,979,979,35,3,f4c9842cec7be99eeaaea36d0c7d077c4d5d94596dc731...,video,False,11bc9d8aca57ab2aef4c5305b080fa49c08665d9e94190...,2216.02,2216.54,...,3.92886,True,False,3,4,0,False,,,e56c0edb6a0870ea94b570507d246cb56c9ab3b1919a05...
7,2744,2744,35,3,eecc0cf5d634ce45a98cbbda30c922f2a2cfcb1877124c...,video,False,11bc9d8aca57ab2aef4c5305b080fa49c08665d9e94190...,2216.02,2216.54,...,3.92886,True,False,5,4,0,False,,,4c6a7bdcbec4e912d140f9a9ae4a196e0b55f0e5eff6b3...
12,2,2,1,4,9c45e4f0c5442e796eb93e73e94dc6c2dfca7b9c4c54ff...,video,False,c1d181e74dbdbce1e51d7d0bfd6e036913896dd1f22856...,3346.3,3347.7,...,3.255518,True,True,2,2,0,True,2.736069,3.203203,07d4ee10402ea059d0a3791fd35fbaab20149aeb6ffb99...


In [8]:
# Sanity check: number of rows in the dataset's examples frame vs. number of
# entries in its accel container (the two counts are not required to match here).
(len(ds.examples_df), len(ds.accel))

(467, 359)

In [9]:
# Inspect the shape of the accel entry stored under one specific (hard-coded) key.
ds.accel['006f74addfc99845bf6c9f80d13d52ccc189341031525530762bb83dd8b713af'].shape

(49, 3)

In [10]:
def get_metrics(outputs, labels, model):
    """Compute evaluation metrics for a set of model outputs.

    Args:
        outputs: tensor of raw model outputs (logits for 'bce', predicted
            values for the regression models).
        labels: tensor of ground-truth targets, same shape as ``outputs``.
        model: name of the model/loss; selects which metrics are computed.
            'bce' -> classification metrics,
            'l1' / 'mse' / 'mean_baseline' -> regression metrics.

    Returns:
        dict: ``{'auc', 'correct'}`` for 'bce' (``correct`` is the number of
        predictions, thresholded at probability 0.5, that match the labels),
        or ``{'mse', 'l1'}`` (0-dim tensors) for the regression models.

    Raises:
        ValueError: if ``model`` is not one of the recognised names.
    """
    if model in ['bce']:
        proba = torch.sigmoid(outputs)
        pred = (proba > 0.5)

        # Compare the thresholded predictions against the ground truth.
        # (Comparing against ``outputs.bool()`` would only test whether each
        # logit is non-zero, which is not an accuracy count.)
        correct = pred.eq(labels.bool()).sum().item()
        return {
            'auc': roc_auc_score(labels, proba),
            'correct': correct
        }
    elif model in ['l1', 'mse', 'mean_baseline']:
        return {
            'mse': torch.nn.functional.mse_loss(outputs, labels, reduction='mean'),
            'l1': torch.nn.functional.l1_loss(outputs, labels, reduction='mean')
        }
    # Fail loudly instead of silently returning None for a typo'd model name.
    raise ValueError('unknown model: {!r}'.format(model))

In [23]:
def do_fold(train_idx, test_idx, model='bce', logfile=None, batch_size=100):
    """Run a single cross-validation fold and return the test-set outputs.

    Args:
        train_idx: indices into the global dataset ``ds`` used for training.
        test_idx: indices into ``ds`` used for testing.
        model: 'bce', 'l1' or 'mse' to train the CNN with that loss, or
            'mean_baseline' to predict the mean training label.
        logfile: optional writable object forwarded to the CNN routine for
            per-epoch metric logging (ignored by the mean baseline).
        batch_size: batch size for both data loaders (default 100).

    Returns:
        Whatever the selected fold routine returns: the model outputs for the
        test subset.

    Raises:
        ValueError: if ``model`` is not a recognised name.
    """
    cnn_losses = ['bce', 'l1', 'mse']
    baselines = ['mean_baseline']
    # Reject unknown names up front; otherwise the function would fall through
    # and return None, which only fails later when the caller stores the result.
    if model not in cnn_losses and model not in baselines:
        raise ValueError('unknown model: {!r}'.format(model))

    # create datasets
    train_ds = Subset(ds, train_idx)
    test_ds = Subset(ds, test_idx)

    # data loaders (only the training loader is shuffled so that test outputs
    # stay aligned with the order of test_idx)
    data_loader = torch.utils.data.DataLoader(
        train_ds, batch_size=batch_size, shuffle=True, num_workers=0,
        collate_fn=None)
    data_loader_val = torch.utils.data.DataLoader(
        test_ds, batch_size=batch_size, shuffle=False, num_workers=0,
        collate_fn=None)

    if model in cnn_losses:
        return do_fold_cnn(data_loader, data_loader_val, model, logfile)
    return do_fold_mean_baseline(data_loader, data_loader_val)

def do_fold_mean_baseline(train_dl, test_dl):
    """Constant predictor: output the mean training label for every test item.

    Args:
        train_dl: iterable yielding ``(X, Y)`` batches; only ``Y`` is used.
        test_dl: object exposing a ``dataset`` attribute with a length.

    Returns:
        1-D tensor of length ``len(test_dl.dataset)`` filled with the mean of
        all training labels.
    """
    # Flatten each batch of labels to 1-D floats so they can be concatenated.
    flat_targets = [targets.float().reshape(-1) for _, targets in train_dl]
    train_mean = torch.cat(flat_targets).mean().item()

    n_test = len(test_dl.dataset)
    return torch.full((n_test,), train_mean)

def do_fold_cnn(train_dl, test_dl, loss='bce', logfile=None, n_epochs=50, lr=.001):
    """Train a fresh ResNetBaseline on one fold and return its test outputs.

    Args:
        train_dl: loader handed to ``train_one_epoch`` on every epoch.
        test_dl: loader handed to ``_test`` after every epoch (for logging
            only) and once more after training for the returned outputs.
        loss: 'bce', 'mse' or 'l1'; selects the torch loss, always built with
            ``reduction='sum'``.
        logfile: optional writable object; when given, the per-epoch metrics
            dict is written to it, one line per epoch.
        n_epochs: maximum number of training epochs (default 50).
        lr: learning rate for the Adam optimizer (default 0.001).

    Returns:
        The outputs element returned by the final ``_test`` call on ``test_dl``.

    Raises:
        ValueError: if ``loss`` is not one of the supported names.
    """
    loss_classes = {
        'bce': torch.nn.BCEWithLogitsLoss,
        'mse': torch.nn.MSELoss,
        'l1': torch.nn.L1Loss,
    }
    if loss not in loss_classes:
        raise ValueError('unknown loss')
    loss_fn = loss_classes[loss](reduction='sum')

    model = ResNetBaseline(in_channels=3)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    device = torch.device('cpu')
    model = model.to(device)

    for epoch in range(n_epochs):
        try:
            train_one_epoch(model, loss_fn, device, train_dl, optimizer, epoch)
            if logfile is not None:
                # Per-epoch monitoring on the test loader; the metrics are
                # only logged, never used to select or stop the model.
                eval_labels, eval_output, _ = _test(model, loss_fn, device, test_dl)
                eval_metrics = get_metrics(eval_output, eval_labels, loss)
                logfile.write(str(eval_metrics)+'\n')
        except KeyboardInterrupt:
            # A manual interrupt stops training early and falls through to the
            # final evaluation, instead of being swallowed and moving on to the
            # next epoch.
            break

    # testing
    _, all_output, _ = _test(model, loss_fn, device, test_dl)
    return all_output

In [24]:

def do_run(model, seed=22, n_splits=4, log_path='run_logs.log'):
    """Run K-fold cross-validation over the global dataset ``ds`` for one model.

    Each fold's test outputs are written into a single tensor at the fold's
    test indices, so the returned tensor is aligned with the dataset order.
    Metrics over all folds together are printed.

    Args:
        model: model name forwarded to ``do_fold`` and ``get_metrics``.
        seed: random_state of the shuffled KFold split (default 22).
        n_splits: number of folds (default 4).
        log_path: file that receives per-epoch logs; it is opened in 'w' mode,
            so every call overwrites the previous log.

    Returns:
        1-D tensor of length ``len(ds)`` holding the out-of-fold outputs.
    """
    # NOTE: only the fold split is seeded here. DataLoader shuffling in the
    # folds draws from torch's global RNG, so repeated runs are not guaranteed
    # to reproduce identical numbers.
    cv_splits = KFold(n_splits=n_splits, random_state=seed, shuffle=True).split(range(len(ds)))

    outputs = torch.empty((len(ds),))
    # The context manager guarantees the log file is closed even if a fold raises.
    with open(log_path, 'w') as fh:
        for train_idx, test_idx in cv_splits:
            fold_outputs = do_fold(train_idx, test_idx, model, logfile=fh)
            outputs[test_idx] = fold_outputs

    labels = torch.Tensor(ds.get_all_labels())
    run_metrics = get_metrics(outputs, labels, model)
    print(run_metrics)

    return outputs

In [25]:
# Cross-validated run with the 'mse' model; the returned outputs are discarded,
# only the metrics printed by do_run are kept.
_=do_run('mse')

{'mse': tensor(2.5420), 'l1': tensor(1.2098)}


In [26]:
# Cross-validated run with the 'l1' model; the returned outputs are discarded,
# only the metrics printed by do_run are kept.
_=do_run('l1')

{'mse': tensor(2.2997), 'l1': tensor(1.1001)}


In [33]:
# Cross-validated run of the constant mean-label baseline, as a reference point
# for the regression metrics printed for the CNN runs.
_=do_run('mean_baseline')

{'mse': tensor(1.7031), 'l1': tensor(0.9845)}
