In [51]:
import numpy as np
import pandas as pd

In [52]:
from processing_missing_data import ProcessingMissingData
import pickle
## Processing the dataset with the class ProcessingMissingData

## Load train data
## Process the data with the sampler and segmenter
with open('Data\\train_inputs.pkl', 'rb') as handle:
    list_of_trains = pickle.load(handle)

processing_missing_data_obj = ProcessingMissingData()
list_of_missing_vals_processed_trains_concat = processing_missing_data_obj.get_processed_dataset_as_list_of_vectors(list_of_data=list_of_trains, type='mean')

list_of_missing_preprocessed_trains = processing_missing_data_obj.get_processed_dataset(list_of_data=list_of_trains, type='mean')

In [58]:
from person_processing import PersonProcessing
import pickle

## Process the data with the sampler and segmenter
with open('Data\\train_inputs.pkl', 'rb') as handle:
    list_of_trains = pickle.load(handle)


# RAW TRAINS or WITHOUT MISSING DATA TRAINS
# curr_list_of_trains = list_of_trains # RAW
curr_list_of_trains = list_of_missing_preprocessed_trains # Without missing data

# parameter k
param_k = 13   
# dict with key k number of samples taken and value list of vectors for that particular k
# dict with key k number of segmentations taken and value list of vectors for that particular k
dict_of_k_samples_features = dict()
dict_of_k_segmentations_features = dict()
for train_sample in curr_list_of_trains:
    for curr_k in range(2,param_k+1):
        person_processing_obj = PersonProcessing(train_sample)
        results_sampling = person_processing_obj.get_sampling(k=curr_k)
        results_segmentation = person_processing_obj.get_segmentation(k=curr_k)

        if curr_k in dict_of_k_samples_features:
            dict_of_k_samples_features[curr_k].append(results_sampling)
        else:
            dict_of_k_samples_features[curr_k] = list()
            dict_of_k_samples_features[curr_k].append(results_sampling)

        if curr_k in dict_of_k_segmentations_features:
            dict_of_k_segmentations_features[curr_k].append(results_segmentation)
        else:
            dict_of_k_segmentations_features[curr_k] = list()
            dict_of_k_segmentations_features[curr_k].append(results_segmentation)

In [59]:
## Process the labels
with open('Data\\train_outputs.pkl', 'rb') as handle:
    list_of_train_labels_raw = pickle.load(handle)

In [60]:
list_of_train_labels = list()
for item in list_of_train_labels_raw:
    label_idx = (item[0] == 1).nonzero()[0][0]
    list_of_train_labels.append(label_idx)

In [71]:
## Neural Network Architecture
from torch import nn

class JapaneseVowelsNN(nn.Module):
    def __init__(self, feature_dim, num_classes):
        super(JapaneseVowelsNN, self).__init__()
        
        self.input_layer = nn.Linear(feature_dim, feature_dim*2)
        self.hidden_layer = nn.Linear(feature_dim*2, feature_dim)
        self.hidden_layer2 = nn.Linear(feature_dim, int(feature_dim/2))
        self.output_layer = nn.Linear(int(feature_dim/2), num_classes) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)
        self.batchnorm1 = nn.BatchNorm1d(feature_dim*2)
        self.batchnorm2 = nn.BatchNorm1d(feature_dim)
        self.batchnorm3 = nn.BatchNorm1d(int(feature_dim/2))
        
    def forward(self, x):
        x = self.input_layer(x)
        x = self.batchnorm1(x)
        x = self.relu(x)
        
        x = self.hidden_layer(x)
        x = self.batchnorm2(x)
        x = self.relu(x)
        x = self.dropout(x)

        x = self.hidden_layer2(x)
        x = self.batchnorm3(x)
        x = self.relu(x)
        x = self.dropout(x)
        
        x = self.output_layer(x)
        
        return x

In [72]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from sklearn.model_selection import KFold, RepeatedKFold, StratifiedKFold

## Function for creating the data loaders and making train validate sets
def prepare_dataset(type='segmentation', k=2):
    if type=='segmentation':
        dataset = list(zip(torch.from_numpy(np.array(dict_of_k_segmentations_features[k])), torch.from_numpy(np.array                         (list_of_train_labels))))
    elif type=='sampling':
        dataset = list(zip(torch.from_numpy(np.array(dict_of_k_samples_features[k])), torch.from_numpy(np.array                         (list_of_train_labels))))

    elif type=='processed_missing_values':
        dataset = list(zip(torch.from_numpy(np.array(list_of_missing_vals_processed_trains_concat)), 
        torch.from_numpy(np.array(list_of_train_labels))))

    return dataset

def multi_acc(y_pred, y_test):
    y_pred_softmax = torch.log_softmax(y_pred, dim = 1)
    _, y_pred_tags = torch.max(y_pred_softmax, dim = 1)    
    
    correct_pred = (y_pred_tags == y_test).float()
    acc = correct_pred.sum() / len(correct_pred)
    
    acc = torch.round(acc) * 100
    
    return acc

def create_cross_validator(n_splits=10):
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    return cv

def create_data_loaders(train_data, val_data):
    train_data_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=False)
    val_data_loader = DataLoader(dataset=val_data, batch_size=1, shuffle=False)
    return train_data_loader, val_data_loader

In [73]:
## Constant Parameters
EPOCHS = 200
BATCH_SIZE = 32
LEARNING_RATE = 0.001
NUM_CLASSES = 9
# Possible types: "segmentation","sampling","processed_missing_values"
TYPE_OF_FEATURES = "processed_missing_values"
## Set to zero if type="processed_missing_values"
if TYPE_OF_FEATURES == "processed_missing_values":
    K = 0
    LOOP_START_AT = 0
else:
    K = param_k
    LOOP_START_AT = 10

In [75]:
from torch import optim
import torch
import numpy as np 

## Loss function and Optimizer
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
dict_of_k_results = dict()

cross_validator = create_cross_validator()

for k in range(LOOP_START_AT,K+1):
        # Prepare data
        prepared_data = prepare_dataset(type=TYPE_OF_FEATURES, k=k)
        k_fold_num = 0
        for train_idx, val_idx in cross_validator.split(prepared_data):
            k_fold_num += 1
            # Create data loaders for train/val for the current cross validation
            train_loader, val_loader = create_data_loaders(
                train_data=prepared_data[train_idx[0]:train_idx[-1]] + [prepared_data[train_idx[-1]]], 
                val_data=prepared_data[val_idx[0]:val_idx[-1]] + [prepared_data[val_idx[-1]]])

            """
            train_data=prepared_data[train_idx[0]:train_idx[-1]] + [prepared_data[train_idx[-1]]], 
                val_data=prepared_data[val_idx[0]:val_idx[-1]] + [prepared_data[val_idx[-1]]])
            """
            # get feature dimensionality to set in the network
            feature_dim = next(iter(train_loader))[0].shape[1]

            ## Initialize model
            model = JapaneseVowelsNN(feature_dim=feature_dim, num_classes=NUM_CLASSES)
            model.to(device)
            model = model.double()

            criterion = nn.CrossEntropyLoss()
            optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

            for epoch in range(1, EPOCHS+1):
                # TRAINING
                train_epoch_loss = 0
                train_epoch_acc = 0
                model.train()
                for X_train_batch, y_train_batch in train_loader:
                    X_train_batch, y_train_batch = X_train_batch.to(device), y_train_batch.to(device)
                    optimizer.zero_grad()
                    
                    y_train_pred = model(X_train_batch)
                    
                    train_loss = criterion(y_train_pred, y_train_batch)
                    train_acc = multi_acc(y_train_pred, y_train_batch)
                    
                    train_loss.backward()
                    optimizer.step()
                    
                    train_epoch_loss += train_loss.item()
                    train_epoch_acc += train_acc.item()
                    
                    
                # VALIDATION    
                with torch.no_grad():
                    
                    val_epoch_loss = 0
                    val_epoch_acc = 0
                    
                    model.eval()
                    for X_val_batch, y_val_batch in val_loader:
                        X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)
                        
                        y_val_pred = model(X_val_batch)
                                    
                        val_loss = criterion(y_val_pred, y_val_batch)
                        val_acc = multi_acc(y_val_pred, y_val_batch)
                        
                        val_epoch_loss += val_loss.item()
                        val_epoch_acc += val_acc.item()
                
                if epoch % 10 == 0:
                    print("Epoch: {} | Train Loss: {} |  Val Loss: {} | Train acc: {} | Val acc: {}".format(epoch,                         round(train_epoch_loss/len(train_loader),3), round(val_epoch_loss/len(val_loader),3), round                                  (train_epoch_acc/len(train_loader),3), round(val_epoch_acc/len(val_loader),3)))
                if epoch == EPOCHS:
                    print('--------k param: {} k-fold num: {} completed!-----------'.format(k, k_fold_num))
        
            ## Check if key exists and create it if it doesnt and saves the results for the current experiment
            if k not in dict_of_k_results:
                dict_of_k_results[k] = {'train_acc':[], 'train_loss':[], 'val_acc':[],'val_loss':[],                                   'type':TYPE_OF_FEATURES}
            dict_of_k_results[k]['train_acc'].append(train_epoch_acc/len(train_loader))
            dict_of_k_results[k]['train_loss'].append(train_epoch_loss/len(train_loader))
            dict_of_k_results[k]['val_acc'].append(val_epoch_acc/len(val_loader))
            dict_of_k_results[k]['val_loss'].append(val_epoch_loss/len(val_loader))

Epoch: 10 | Train Loss: 1.001 |  Val Loss: 0.87 | Train acc: 100.0 | Val acc: 79.231
Epoch: 20 | Train Loss: 0.535 |  Val Loss: 0.669 | Train acc: 100.0 | Val acc: 83.846
Epoch: 30 | Train Loss: 0.282 |  Val Loss: 0.619 | Train acc: 100.0 | Val acc: 82.308
Epoch: 40 | Train Loss: 0.195 |  Val Loss: 0.564 | Train acc: 100.0 | Val acc: 83.846
Epoch: 50 | Train Loss: 0.12 |  Val Loss: 0.456 | Train acc: 100.0 | Val acc: 86.538
Epoch: 60 | Train Loss: 0.092 |  Val Loss: 0.476 | Train acc: 100.0 | Val acc: 83.846
Epoch: 70 | Train Loss: 0.062 |  Val Loss: 0.471 | Train acc: 100.0 | Val acc: 85.769
Epoch: 80 | Train Loss: 0.054 |  Val Loss: 0.455 | Train acc: 100.0 | Val acc: 84.615
Epoch: 90 | Train Loss: 0.043 |  Val Loss: 0.469 | Train acc: 100.0 | Val acc: 86.923
Epoch: 100 | Train Loss: 0.032 |  Val Loss: 0.393 | Train acc: 100.0 | Val acc: 85.769
Epoch: 110 | Train Loss: 0.029 |  Val Loss: 0.347 | Train acc: 100.0 | Val acc: 88.846
Epoch: 120 | Train Loss: 0.024 |  Val Loss: 0.401 | Tr

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

for i in range(LOOP_START_AT,K+1):
    plt.plot(range(1,EPOCHS+1), dict_of_k_results[i]['train_acc'], label = "k={} train_acc".format(i))
    plt.plot(range(1,EPOCHS+1), dict_of_k_results[i]['val_acc'], label = "k={} val_acc".format(i))


plt.xlabel('Epochs')
plt.ylabel('Accuracy')

plt.legend()
plt.title('Train/Val accuracy for the Simple Sampling Method with different K values')

In [None]:
import torch
test_data = list(zip(torch.from_numpy(np.array(dict_of_k_segmentations_features[2])), torch.from_numpy(np.array                         (list_of_train_labels))))

In [None]:
test_data[29]

In [39]:
dict_of_k_results

{0: {'train_acc': [100.0,
   100.0,
   100.0,
   100.0,
   100.0,
   100.0,
   100.0,
   100.0,
   100.0,
   100.0],
  'train_loss': [0.010303964244054019,
   0.009727635107466354,
   0.010104277753388149,
   0.009428879611421019,
   0.010095687376055639,
   0.010764576285894147,
   0.0105047551766339,
   0.009358374187253412,
   0.010209640391649457,
   0.012007208510833206],
  'val_acc': [82.3076923076923,
   86.23481781376518,
   81.9672131147541,
   88.50574712643679,
   87.25868725868726,
   83.2089552238806,
   89.1213389121339,
   86.36363636363636,
   86.45418326693228,
   89.2],
  'val_loss': [0.4568403902935249,
   0.41255618324486537,
   0.4979995437250016,
   0.3639029543412922,
   0.41543104372471873,
   0.40654747530175184,
   0.2933670118896055,
   0.3763686953448697,
   0.3672758545948656,
   0.3595154498639108],
  'type': 'processed_missing_values'}}

In [40]:
# Save as pickles
with open('Results\\dict_of_k_results_cv10.pkl', 'wb') as handle:
    pickle.dump(dict_of_k_results, handle, protocol=pickle.HIGHEST_PROTOCOL)