## 2021 ML HW2

**Competition is on [Kaggle](https://www.kaggle.com/c/ml2021spring-hw2) <br/>**
**Uses a DNN to solve the problem, enhanced with ensembling and post-processing**

### Data Preprocessing

In [None]:
import numpy as np

print('Loading data ...')

# Your Data Directory
data_root='/data1/ML/timit_11/'

# Load training features, training labels and test features, in that order.
train, train_label, test = (
    np.load(data_root + file_name)
    for file_name in ('train_11.npy', 'train_label_11.npy', 'test_11.npy')
)

In [None]:
# Show the shapes of the loaded training and testing feature arrays.
print('Size of training data: {}'.format(train.shape))
print('Size of testing data: {}'.format(test.shape))

In [None]:
import torch
from torch.utils.data import Dataset

class TIMITDataset(Dataset):
    """In-memory dataset wrapping a feature array and, optionally, its labels.

    Args:
        X: NumPy array of features; stored as a float32 tensor.
        y: Optional NumPy array of labels indexed the same way as ``X``.
            When omitted (e.g. for test data) items carry no label.
    """

    def __init__(self, X, y=None):
        self.data = torch.from_numpy(X).float()
        if y is not None:
            # ``np.int`` was removed in NumPy 1.24; use an explicit 64-bit
            # integer type, which is also what ``LongTensor`` stores.
            y = y.astype(np.int64)
            self.label = torch.LongTensor(y)
        else:
            self.label = None

    def __getitem__(self, idx):
        """Return ``(features, label)`` when labels exist, else just ``features``."""
        if self.label is not None:
            return self.data[idx], self.label[idx]
        else:
            return self.data[idx]

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

In [None]:
# Fraction of the training data held out for validation (0 keeps everything for training).
VAL_RATIO = 0

# Index of the first validation sample; rows before it form the training split.
percent = int(train.shape[0] * (1 - VAL_RATIO))
train_x, val_x = train[:percent], train[percent:]
train_y, val_y = train_label[:percent], train_label[percent:]
print('Size of training set: {}'.format(train_x.shape))
print('Size of validation set: {}'.format(val_x.shape))

In [None]:
from torch.utils.data import DataLoader

BATCH_SIZE = 256

# Wrap both splits so they can be batched.
train_set = TIMITDataset(train_x, train_y)
val_set = TIMITDataset(val_x, val_y)

# Only the training data is shuffled; the validation split keeps its order.
train_loader = DataLoader(dataset=train_set, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_set, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
import gc

# Drop the notebook-level names for the raw arrays and their split views,
# then ask the garbage collector to reclaim whatever became unreachable.
del train, train_label
del train_x, train_y, val_x, val_y
gc.collect()

In [None]:
import torch
import torch.nn as nn

class Classifier(nn.Module):
    """Fully connected network mapping 429 input features to 39 class logits.

    The input is projected to 2048 units by ``layer1`` and then passed four
    times through ``layer2``; every linear stage is followed by batch norm,
    ReLU and dropout before the final ``out`` projection.

    NOTE(review): ``layer2`` and ``batchNorm`` are single modules applied
    repeatedly, so the four hidden stages share one weight matrix and all
    five normalisation steps share one set of batch-norm parameters and
    running statistics. Confirm this sharing is intended; giving each stage
    its own modules would change the checkpoint layout.
    """

    # How many times the shared ``layer2`` stage is applied after ``layer1``.
    _SHARED_STAGE_REPEATS = 4

    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(429, 2048)
        self.layer2 = nn.Linear(2048, 2048)
        self.dropout = nn.Dropout(0.5)
        self.out = nn.Linear(2048, 39)
        self.batchNorm = nn.BatchNorm1d(2048)

        self.relu = nn.ReLU()

    def _activate(self, x):
        """Apply the batch norm -> ReLU -> dropout chain used after each hidden linear layer."""
        return self.dropout(self.relu(self.batchNorm(x)))

    def forward(self, x):
        """Return the 39 unnormalised class scores for a batch of feature vectors."""
        x = self._activate(self.layer1(x))
        for _ in range(self._SHARED_STAGE_REPEATS):
            x = self._activate(self.layer2(x))
        return self.out(x)

In [None]:
# fix random seed
def same_seeds(seed):
    """Seed the NumPy and PyTorch RNGs and put cuDNN in deterministic mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)
    cuda_present = torch.cuda.is_available()
    if cuda_present:
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    # Trade cuDNN's auto-tuned kernels for repeatable ones.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# check device
def get_device():
    """Return ``'cuda:0'`` when CUDA is available, otherwise ``'cpu'``."""
    if torch.cuda.is_available():
        return 'cuda:0'
    return 'cpu'
    
# fix random seed for reproducibility (seed 0 for the NumPy and PyTorch generators)
same_seeds(0)

In [None]:
# where to run, and where each ensemble member's checkpoint is written
device = get_device()
print(f'DEVICE: {device}')
model_path = './model_{:d}.ckpt'   # formatted with the member's index

# optimisation settings
num_epoch = 100            # number of training epochs
learning_rate = 0.00005    # Adam step size
weight_decay = 0.0001      # weight decay passed to Adam
ensemble_num = 7           # number of models that vote on each prediction

# loss shared by every ensemble member
criterion = nn.CrossEntropyLoss()

# one model and one Adam optimizer per ensemble member
models = [Classifier().to(device) for _ in range(ensemble_num)]
optimizers = [
    torch.optim.Adam(net.parameters(), lr=learning_rate, weight_decay=weight_decay)
    for net in models
]

In [None]:
# function for getting the ensemble result (voting)
def find_most_freq(l):
    """Return the majority-vote winner among the labels in ``l``.

    Ties go to the label that reaches the highest count first while scanning
    ``l`` from left to right. An empty ``l`` yields ``0``.
    """
    tally = {}
    winner, winner_votes = 0, 0
    for vote in l:
        votes_so_far = tally.get(vote, 0) + 1
        tally[vote] = votes_so_far
        # Strictly greater: a later label that merely ties does not take over.
        if votes_so_far > winner_votes:
            winner, winner_votes = vote, votes_so_far

    return winner

### Training

In [None]:
def _ensemble_correct(models, loader, device):
    """Return how many samples in ``loader`` the ensemble's majority vote gets right.

    Every model is switched to evaluation mode first, so dropout is disabled
    and batch norm uses its running statistics instead of updating them.
    Callers that continue training must call ``model.train()`` again.

    Args:
        models: Ensemble members; each one casts a vote per sample.
        loader: DataLoader yielding ``(inputs, labels)`` batches.
        device: Device the batches are moved to before inference.
    """
    for model in models:
        model.eval()

    correct = 0.0
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            # one list of per-model votes for every sample in the batch
            voting_list = [[] for _ in range(len(inputs))]
            for model in models:
                _, pred = torch.max(model(inputs), 1)  # index of the highest-scoring class
                for j, output in enumerate(pred):
                    voting_list[j].append(int(output))

            voted = torch.LongTensor(list(map(find_most_freq, voting_list)))
            correct += (voted.cpu() == labels.cpu()).sum().item()
    return correct


best_acc = 0.0
for epoch in range(num_epoch):
    # running totals over all ensemble members for this epoch
    train_acc = 0.0
    train_loss = 0.0
    val_acc = 0.0
    val_loss = 0.0

    # train every ensemble member for one full pass over the training data
    for index, model in enumerate(models):

        # training
        model.train() # set the model to training mode (also undoes the eval mode used below)
        for i, data in enumerate(train_loader):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)
            optimizers[index].zero_grad()
            outputs = model(inputs)
            batch_loss = criterion(outputs, labels)
            _, train_pred = torch.max(outputs, 1) # get the index of the class with the highest probability
            batch_loss.backward()
            optimizers[index].step()
            batch_correct = (train_pred.cpu() == labels.cpu()).sum().item()
            train_acc += batch_correct
            train_loss += batch_loss.item()
            if i % 100 == 0:
                print('[{:03d}/{:03d} | {:d}] correct: {:d}/{:d}'.format(epoch + 1, num_epoch, index, batch_correct, len(labels)))

    # validation
    if len(val_set) > 0:
        val_acc = _ensemble_correct(models, val_loader, device)

        print('[{:03d}/{:03d}] Val Acc: {:3.6f} best Acc: {:3.6f}'.format(
            epoch + 1, num_epoch, val_acc/len(val_set), best_acc / len(val_set)
        ))

        # if the model improves, save a checkpoint at this epoch
        if val_acc > best_acc:
            best_acc = val_acc
            for model_i, model in enumerate(models):
                torch.save(model.state_dict(), model_path.format(model_i))
            print('saving model with acc {:.3f}'.format(best_acc/len(val_set)))
    elif epoch % 5 == 0: # no validation split: check accuracy on the training data every 5 epochs
        # The helper puts the models in eval mode; previously this branch ran
        # with dropout active and batch-norm statistics still being updated.
        val_acc = _ensemble_correct(models, train_loader, device)

        print('[{:03d}/{:03d}] Val Acc: {:3.6f}'.format(
            epoch + 1, num_epoch, val_acc/len(train_set)
        ))

# if not validating, save the last epoch
if len(val_set) == 0:
    for model_i, model in enumerate(models):
        torch.save(model.state_dict(), model_path.format(model_i))
    print('saving model at last epoch')

### Test Data

In [None]:
# create testing dataset (no labels) and iterate it in its original order
test_set = TIMITDataset(test, None)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False)

# create model and load weights from checkpoint
models = [Classifier().to(device) for i in range(ensemble_num)]
for model_i, model in enumerate(models):
    # map_location remaps the saved tensors onto the current device, so
    # checkpoints written on a GPU also load on a CPU-only machine.
    model.load_state_dict(torch.load(model_path.format(model_i), map_location=device))

In [None]:
# Majority-vote class for every test sample, in test-set order.
predict = []

# inference only: put every ensemble member in evaluation mode
for model in models:
    model.eval()

with torch.no_grad():
    for inputs in test_loader:
        inputs = inputs.to(device)
        # one list of per-model votes for every sample in the batch
        voting_list = [[] for _ in range(len(inputs))]
        for model in models:
            _, val_pred = torch.max(model(inputs), 1)
            for votes, choice in zip(voting_list, val_pred.tolist()):
                votes.append(choice)

        voting_res = torch.LongTensor([find_most_freq(votes) for votes in voting_list])
        predict.extend(voting_res.cpu().numpy())

In [None]:
# Write the submission file: a header row, then one "index,class" row per prediction.
with open('pred.csv', 'w') as out_file:
    out_file.write('Id,Class\n')
    out_file.writelines(
        '{},{}\n'.format(sample_id, label) for sample_id, label in enumerate(predict)
    )

### Post-processing

In [None]:
import csv

# Read the submission back as rows of [Id, Class]; row 0 is the header.
# newline='' is what the csv module requires for files it reads or writes.
with open('pred.csv', newline='') as csv_file:
    l = list(csv.reader(csv_file))

# Smooth isolated predictions: a row whose two neighbours agree with each
# other but not with it takes the neighbours' class. Rows are updated in
# place from top to bottom, so a corrected row serves as the "previous"
# neighbour of the next one. The header never matches a class value, so the
# first data row is effectively left alone.
for i in range(1, len(l) - 1):
    prev_label, next_label = l[i - 1][1], l[i + 1][1]
    if prev_label == next_label and l[i][1] != prev_label:
        l[i][1] = prev_label

# Without newline='' the csv writer's "\r\n" terminator becomes "\r\r\n" on
# Windows, which shows up as a blank line after every row.
with open('pred.csv', 'w', newline='') as csv_file:
    csv.writer(csv_file).writerows(l)

## Reference

TA's code