# ML 2021 Homework 2-1

- Author : Liang-Cheng Chen 陳亮丞
- Mail : r08323022@ntu.edu.tw



## Synopsis

### Train

I test three DNN model setups, as well as different ways to split the train/validation sets.

I find models with more hidden layers and dropout perform better. 

Additionally, a randomly split train/validation set does not give better results. My guess is that the order of the data is related to the speakers, so a random split may make the validation data too similar to the training data.
I also find that adding batchnorm to each layer does not give me better predictions.

In the final version, I let the three models decide the predicted value by majority vote. (Yet this did not yield a better result than model 3 alone.)

### Post-Processing

I find that a predicted value tends to persist over several consecutive frames (each row is a frame, and a phoneme consists of multiple frames).
Hence, I use the `post_process` function to correct the predictions yielded by the model.


### Ref

I build the `TIMITDataset` and calculate the batch loss in the same way as the code provided by the course TA; the notebook is [here](https://colab.research.google.com/github/ga642381/ML2021-Spring/blob/main/HW02/HW02-1.ipynb).


## Model and Tools


In [2]:
# Modules
import numpy as np
import torch
import torch.nn as nn
import csv
from torch.utils.data import Dataset, DataLoader
import gc

### Some Utilities

In [3]:
def train_val_split(X, y, shuffle=False, ratio=0.2):
    """
    Split features and labels into a leading training part and a trailing
    validation part.

    X, y    : arrays indexed along their first axis; they are cut at the
              same position.
    shuffle : when True, both arrays are reordered with one shared random
              permutation (drawn from numpy's global RNG) before the cut.
              When False the original order is kept.
    ratio   : fraction of the rows that goes to the validation part.

    Returns (train_X, train_y, val_X, val_y).
    """
    n_rows = X.shape[0]

    if shuffle:
        order = np.random.permutation(n_rows)
        X = X[order]
        y = y[order]

    # index of the first validation row
    cut = int(n_rows * (1 - ratio))

    train_X, val_X = X[:cut], X[cut:]
    train_y, val_y = y[:cut], y[cut:]
    return train_X, train_y, val_X, val_y


#check device
def get_device():
    """Return 'cuda' when torch reports an available CUDA device, else 'cpu'."""
    if torch.cuda.is_available():
        return 'cuda'
    return 'cpu'

# fix random seed
def same_seeds(seed):
    """
    Seed the numpy and torch random generators with `seed` and switch
    cuDNN to its deterministic, non-benchmarking configuration.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)

    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        # seed the current CUDA device, then every CUDA device
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


### Dataset

In [4]:
class TIMITDataset(Dataset):
    """
    Dataset wrapping a numpy feature array and, optionally, its labels.

    With labels, indexing yields a `(features, label)` pair (train and
    validation use). With `y=None`, indexing yields the features only
    (test use).
    """

    def __init__(self, X, y):
        # features are stored as a float32 tensor
        self.data = torch.from_numpy(X).float()
        # labels are cast to int first, then stored as an int64 tensor
        self.label = None if y is None else torch.LongTensor(y.astype(int))

    def __getitem__(self, index):
        sample = self.data[index]
        if self.label is None:
            # Test dataset
            return sample
        # Train and Val dataset
        return sample, self.label[index]

    def __len__(self):
        return len(self.data)

### Model 
In the `TIMITClassifier` class I define three network setups and implement a loss function with optional L2-regularization.

In [5]:
class TIMITClassifier(nn.Module):
    """
    Fully connected classifier mapping a 429-dimensional input to 39 class
    scores, with three selectable architectures.

    mode == 1      : `net1`, sigmoid activations with dropout.
    mode == 2      : `net2`, like `net1` plus batch normalisation.
    any other mode : `net3`, batch normalisation with ReLU activations
                     (the last hidden layer still uses a sigmoid).

    All three networks are always constructed, whatever `mode` is; the
    attribute names `net1`/`net2`/`net3` are part of the saved state dict,
    so they are kept as-is. Checkpoints are written to `self.mpath`
    ("./model<mode>.pth").
    """

    def __init__(self, mode=1):
        super().__init__()

        self.mode = mode
        self.mpath = "./model{}.pth".format(mode)

        # model architecture
        self.net1 = nn.Sequential(
            nn.Linear(429, 1024),
            nn.Sigmoid(),
            nn.Dropout(p=0.2),

            nn.Linear(1024, 1024),
            nn.Sigmoid(),
            nn.Dropout(p=0.2),

            nn.Linear(1024, 512),
            nn.Sigmoid(),
            nn.Dropout(p=0.2),

            nn.Linear(512, 128),
            nn.Sigmoid(),
            nn.Linear(128, 39)
        )

        self.net2 = nn.Sequential(
            nn.Linear(429, 1024),
            nn.BatchNorm1d(1024),
            nn.Sigmoid(),
            nn.Dropout(p=0.2),

            nn.Linear(1024, 1024),
            nn.BatchNorm1d(1024),
            nn.Sigmoid(),
            nn.Dropout(p=0.2),

            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.Sigmoid(),
            nn.Dropout(p=0.2),

            nn.Linear(512, 128),
            nn.BatchNorm1d(128),
            nn.Sigmoid(),
            nn.Linear(128, 39)
        )

        self.net3 = nn.Sequential(
            nn.Linear(429, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(p=0.2),

            nn.Linear(1024, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(p=0.2),

            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(p=0.2),

            nn.Linear(512, 128),
            nn.BatchNorm1d(128),
            # NOTE(review): sigmoid here while the other hidden layers of
            # net3 use ReLU -- confirm this is intentional.
            nn.Sigmoid(),
            nn.Linear(128, 39)
        )

        # loss function
        self.criterion = nn.CrossEntropyLoss()

    def cal_loss(self, pred, y, L2=False):
        """
        Cross-entropy loss, optionally with an L2 penalty.

        pred : class scores produced by `forward`.
        y    : target class indices.
        L2   : when True, add 0.0001 times the sum of squares of every
               parameter of this module (all three networks, since all of
               them are registered on the module).

        Returns the scalar loss tensor.
        """
        loss = self.criterion(pred, y)

        if L2:
            l2_lambda = 0.0001
            # Use this module's own parameters; relying on a global
            # variable named `model` breaks as soon as the instance is
            # bound to any other name.
            l2_reg = sum(torch.sum(param.pow(2)) for param in self.parameters())
            loss = loss + l2_lambda * l2_reg

        return loss

    def forward(self, x):
        """Run `x` through the network selected by `self.mode`."""
        if self.mode == 1:
            return self.net1(x)
        if self.mode == 2:
            return self.net2(x)
        return self.net3(x)


### Train

In [6]:
def train(model, train_loader, val_loader, num_epoch, lr, device, L2=False):
    """
    Train `model` with Adam and checkpoint the best-validating weights.

    model        : module exposing `cal_loss(outputs, y, L2)` and an
                   `mpath` attribute (checkpoint file path).
    train_loader : loader yielding `(x, y)` training batches.
    val_loader   : loader yielding `(x, y)` validation batches.
    num_epoch    : number of passes over `train_loader`.
    lr           : Adam learning rate.
    device       : device the batches are moved to.
    L2           : forwarded to `model.cal_loss`.

    After each epoch the train/validation accuracy and mean batch loss are
    printed. Whenever the validation accuracy improves, the state dict is
    saved to `model.mpath`.

    Returns the model as it is after the last epoch (not necessarily the
    checkpointed weights).
    """
    optim = torch.optim.Adam(model.parameters(), lr=lr)

    # Sample counts are taken from the loaders that were passed in, not
    # from global `train_set` / `val_set` variables, so the function works
    # with any loaders.
    n_train = len(train_loader.dataset)
    n_val = len(val_loader.dataset)

    best_acc = 0.0   # best number of correct validation predictions so far
    for epoch in range(num_epoch):

        train_acc = 0.0
        val_acc   = 0.0
        train_loss = 0.0
        val_loss   = 0.0

        # training part
        model.train()
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)

            optim.zero_grad()                       # remove previous grad
            outputs = model(x)                      # class scores per sample
            loss = model.cal_loss(outputs, y, L2)   # calculate loss
            loss.backward()                         # back propagation
            optim.step()                            # update params

            _, pred = torch.max(outputs, 1)         # index of the highest score

            # update running totals
            train_acc += (pred == y).sum().item()
            train_loss += loss.item()

        # validation
        model.eval()

        with torch.no_grad():
            for x, y in val_loader:
                x, y = x.to(device), y.to(device)
                outputs = model(x)
                loss = model.cal_loss(outputs, y, L2)
                _, pred = torch.max(outputs, dim=1)

                val_acc += (pred == y).sum().item()
                val_loss += loss.item()

        print('[{:03d}/{:03d}] Train Acc: {:3.6f} Loss: {:3.6f} | Val Acc: {:3.6f} loss: {:3.6f}'.format(
                epoch + 1, num_epoch, train_acc/n_train, train_loss/len(train_loader), val_acc/n_val, val_loss/len(val_loader)
        ))

        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(model.state_dict(), model.mpath)
            print("saving model with acc: {:.3f}".format(best_acc/n_val))

    return model

### Test

In [7]:
def predict(model, test_loader, device):
    """
    Run `model` in eval mode over every batch of `test_loader` and collect
    the index of the highest score of each sample.

    Returns a flat list with one predicted class index per sample, in
    loader order.
    """
    model.eval()
    collected = []

    with torch.no_grad():
        for batch in test_loader:
            scores = model(batch.to(device))
            best = scores.max(1).indices        # index of the highest score per row
            collected.extend(best.cpu().numpy())

    return collected


def save_pred(pred:list, filepath:str):
    """
    Write predictions to `filepath` as CSV with the header `Id,Class`,
    one row per prediction, where Id is the zero-based position in `pred`.
    Prints a confirmation message once the file is written.
    """
    rows = ('{},{}\n'.format(idx, label) for idx, label in enumerate(pred))

    with open(filepath, 'w') as out:
        out.write('Id,Class\n')
        out.writelines(rows)

    print('Finish saving prediction at {}'.format(filepath))


### Post-Processing
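The `post_process` function takes a prediction list as input. It loops over each row and checks whether the `window` neighbouring values on each side all agree with each other but differ from the row's own value; if so, the row is replaced by that value. It then returns a new prediction list.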

In [8]:
def post_process(pred:list, window=2):
    """
    Smooth isolated outliers in a sequence of predictions.

    For every position that has `window` neighbours on both sides, the
    value is replaced when all 2*window neighbours share one value and the
    position itself differs from it. Positions closer than `window` to
    either end are left unchanged.

    pred   : sequence of predicted labels; it is not modified.
    window : number of neighbours inspected on each side. Values below 1
             disable the smoothing.

    Returns a new list of the same length.
    """
    new_pred = list(pred)
    total = len(new_pred)

    if window < 1:
        return new_pred

    # Every index with a complete window on both sides:
    # window <= i <= total - 1 - window. (The previous bounds skipped the
    # first and the last such index.)
    for i in range(window, total - window):
        current = new_pred[i]
        neighbours = new_pred[i - window:i] + new_pred[i + 1:i + window + 1]
        agreed = neighbours[0]

        if current != agreed and all(v == agreed for v in neighbours):
            new_pred[i] = agreed

    return new_pred

## Data

In [9]:
!gdown --id '1HmENtrgZO1C13YM1mRenwDUvLCKX0ehu' --output data.zip
!unzip data.zip

Downloading...
From: https://drive.google.com/uc?id=1HmENtrgZO1C13YM1mRenwDUvLCKX0ehu
To: /content/data.zip
372MB [00:06, 61.6MB/s]
Archive:  data.zip
   creating: timit_11/
  inflating: timit_11/train_11.npy   
  inflating: timit_11/test_11.npy    
  inflating: timit_11/train_label_11.npy  


In [10]:
# Directory holding the unzipped .npy files.
pre = 'timit_11/'

# Load training features, training labels and test features.
train_raw, label_raw, test_raw = (
    np.load(pre + fname)
    for fname in ('train_11.npy', 'train_label_11.npy', 'test_11.npy')
)

In [11]:
# Keep the original row order (no shuffling): the last 20% of the rows
# become the validation set.
train_x, train_y, val_x, val_y = train_val_split(train_raw, label_raw, shuffle=False, ratio=0.2)

In [12]:
# Wrap the split arrays in datasets and batch them. Only the training
# loader reshuffles its samples; the validation loader keeps row order.
BATCH_SIZE = 64
train_set = TIMITDataset(train_x, train_y)
val_set   = TIMITDataset(val_x, val_y)
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False)

In [13]:
# Drop the names of the intermediate arrays and trigger a collection so
# their memory can be reclaimed if nothing else still references it.
# `test_raw` is kept: it is needed later to build the test dataset.
del train_raw, label_raw, train_x, train_y, val_x, val_y
gc.collect()

61

## Train

In [14]:
# set up hyper params

NUM_EPOCH = 20              # number of training epochs per model
LR = 0.0001                 # learning rate passed to the Adam optimizer in train()
device = get_device()       # 'cuda' when available, otherwise 'cpu'


In [16]:
# initialize model and train
# Seed the RNGs first so the weight initialisation is repeatable, then
# train architecture 3. The best checkpoint is written to model.mpath;
# the returned model is not needed because it is reloaded from disk later.
same_seeds(3)
model = TIMITClassifier(mode=3).to(device)

_ = train(model, train_loader, val_loader, NUM_EPOCH, LR, device)

[001/020] Train Acc: 0.580011 Loss: 1.494414 | Val Acc: 0.685276 loss: 1.014293
saving model with acc: 0.685
[002/020] Train Acc: 0.659675 Loss: 1.085380 | Val Acc: 0.707525 loss: 0.912328
saving model with acc: 0.708
[003/020] Train Acc: 0.681216 Loss: 0.999055 | Val Acc: 0.716302 loss: 0.872746
saving model with acc: 0.716
[004/020] Train Acc: 0.695114 Loss: 0.945637 | Val Acc: 0.724876 loss: 0.837717
saving model with acc: 0.725
[005/020] Train Acc: 0.705356 Loss: 0.906166 | Val Acc: 0.730933 loss: 0.816636
saving model with acc: 0.731
[006/020] Train Acc: 0.714728 Loss: 0.873916 | Val Acc: 0.734827 loss: 0.801216
saving model with acc: 0.735
[007/020] Train Acc: 0.721775 Loss: 0.847594 | Val Acc: 0.735787 loss: 0.798758
saving model with acc: 0.736
[008/020] Train Acc: 0.727827 Loss: 0.824674 | Val Acc: 0.738824 loss: 0.785220
saving model with acc: 0.739
[009/020] Train Acc: 0.733522 Loss: 0.804724 | Val Acc: 0.741438 loss: 0.780324
saving model with acc: 0.741
[010/020] Train Acc

In [17]:
# prepare test dataloader
# No labels for the test set, so the dataset yields features only;
# shuffle=False keeps predictions in the same order as the input rows.
test_set = TIMITDataset(test_raw, None)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False)

In [18]:
# reload model
# Build a fresh model and restore the best checkpoint saved by train().
# map_location=device lets a checkpoint written on a GPU runtime be
# loaded on a CPU-only one (and vice versa).
model = TIMITClassifier(mode=3).to(device)
model.load_state_dict(torch.load(model.mpath, map_location=device))

# predict
pred = predict(model, test_loader, device)
save_pred(pred, 'old_pred_3.csv')

Finish saving prediction at old_pred_3.csv


Then we repeat above steps for different models.

In [None]:
# initialize model and train
# Same procedure as for model 3, with a different seed and architecture 2.
# The best checkpoint is written to model.mpath.
same_seeds(2)
model = TIMITClassifier(mode=2).to(device)

_ = train(model, train_loader, val_loader, NUM_EPOCH, LR, device)

In [None]:
# reload model
# Build a fresh model and restore the best checkpoint saved by train().
# map_location=device lets a checkpoint written on a GPU runtime be
# loaded on a CPU-only one (and vice versa).
model = TIMITClassifier(mode=2).to(device)
model.load_state_dict(torch.load(model.mpath, map_location=device))

# predict
pred = predict(model, test_loader, device)
save_pred(pred, 'old_pred_2.csv')

In [None]:
# initialize model and train
# Same procedure as for the other models, with seed 1 and architecture 1.
same_seeds(1)
model = TIMITClassifier(mode=1).to(device)

_ = train(model, train_loader, val_loader, NUM_EPOCH, LR, device)

# reload model
# Restore the best checkpoint saved by train(). map_location=device lets a
# checkpoint written on a GPU runtime be loaded on a CPU-only one.
model = TIMITClassifier(mode=1).to(device)
model.load_state_dict(torch.load(model.mpath, map_location=device))

# predict
pred = predict(model, test_loader, device)
save_pred(pred, 'old_pred_1.csv')

## Post-Processing


In [None]:
import pandas as pd
from collections import Counter

Load the prediction lists of the three models and decide each prediction by majority vote. (If all three models predict differently, the prediction of model 3, which is counted first, is used.)

In [None]:
# Read back the `Class` column of each model's saved prediction file as a
# plain Python list (one entry per row of the CSV).
pred3 = pd.read_csv('./old_pred_3.csv').Class.to_list()
pred2 = pd.read_csv('./old_pred_2.csv').Class.to_list()
pred1 = pd.read_csv('./old_pred_1.csv').Class.to_list()

In [None]:
# Majority vote over the three models, row by row. Counter orders equal
# counts by first appearance, so when all three disagree the value from
# pred3 (listed first) wins.
res = [
    Counter(votes).most_common()[0][0]
    for votes in zip(pred3, pred2, pred1)
]

In [None]:
# Smooth the voted predictions: a value whose 2 neighbours on each side all
# agree on a different value is replaced by that value.
new_res = post_process(res, window=2)

In [None]:
# Save the post-processed predictions. Writing `res` here would discard the
# result of post_process computed in the previous cell.
save_pred(new_res, 'pred.csv')

## Reference

- TA's Hw2 sample code [link](https://colab.research.google.com/github/ga642381/ML2021-Spring/blob/main/HW02/HW02-1.ipynb#scrollTo=emUd7uS7crTz)