# 設定

In [2]:
import math
import numpy as np
import pandas as pd
from scipy import stats

import os
import csv

# progress bar
from tqdm import tqdm

import torch 
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

from tabulate import tabulate
import time

# for plotting learning curve
from torch.utils.tensorboard import SummaryWriter

# %reload_ext tensorboard
# %tensorboard --logdir=./runs/

from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'torch'

In [5]:
def same_seed(seed): 
    '''Fixes random number generator seeds for reproducibility.'''
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

In [6]:
def train_valid_split(data_set, valid_ratio, seed):
    '''Split provided training data into training set and validation set'''
    valid_set_size = int(valid_ratio * len(data_set)) 
    train_set_size = len(data_set) - valid_set_size
    train_set, valid_set = random_split(data_set, [train_set_size, valid_set_size], generator=torch.Generator().manual_seed(seed))
    return np.array(train_set), np.array(valid_set)

In [21]:
# gpu
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# configuration
config = {
    'seed': 0,
    'select_all': True,
    'valid_ratio': 0.2,
    'n_epochs': 1000,
    'batch_size': 256,
    # 'batch_size': 32, 
    'learning_rate': 1e-5,
    'early_stop': 400,
    'save_path': './models/model.ckpt'
}

# create directory of saving models
if not os.path.isdir('./models'):
    os.mkdir('./models') 

# set seed for reproducibility
same_seed(config['seed'])

# 資料

In [8]:
def select_feat(train_data, valid_data, select_all=True):
    '''Selects useful features to perform regression'''
    y_train, y_valid = train_data[:,-1], valid_data[:,-1]
    raw_x_train, raw_x_valid, raw_x_test = train_data[:,:-1], valid_data[:,:-1], test_data

    if select_all:
        feat_idx = list(range(raw_x_train.shape[1]))
    else:
        # TODO: Select suitable feature columns.
        feat_idx = [0,1,2,3,4] 
        
    return raw_x_train[:,feat_idx], raw_x_valid[:,feat_idx], raw_x_test[:,feat_idx], y_train, y_valid

In [9]:
selected_columns = [
    # yes/no
    # 'sex',
    'isReject',
    # 'hasOtherComAccount',

    # categorical
    # 'bid',
    # 'source',
    'occupation',
    # 'company',
    # 'eduLevel',

    # numerical-like
    'expInvestment', 
    'yrsInvestment', 
    'frqInvestment',
    'srcCapital',
    'quotaCredit',
    'incomeYear',
    'totalWealth',
    
    'quota_now'
]

# 根據一些條件刪掉一些row
data = pd.read_csv('./ooa_data.csv')[selected_columns].dropna()
data = data[data['quota_now']>0]
data = data[data['quota_now']<=1000000]
data = data[data['isReject']==0]
# data = data[data['occupation']==20]
data = data.drop('isReject', axis=1)
data['quota_now'] /= 1e4

# 除去outlier
data = data[(np.abs(stats.zscore(data['quota_now'].values)) < 2)]

In [11]:
X = data.iloc[:, :-1].values
Y = data.iloc[:, -1].values
X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=config['valid_ratio'])

# TODO feature selection

In [12]:
class Quota_dataset(Dataset):
    def __init__(self, x, y):
        self.x = torch.FloatTensor(x)
        self.y = torch.FloatTensor(y)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

    def __len__(self):
        return len(self.x)

In [13]:
train_dataset = Quota_dataset(X_train, Y_train)
valid_dataset = Quota_dataset(X_valid, Y_valid)

train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)

table = [
    ['train_data', X_train.shape],
    ['valid_data', X_valid.shape],
    ['num features', X_train.shape[1]]
]
print(tabulate(table, tablefmt="fancy_grid"))

╒══════════════╤═════════════╕
│ train_data   │ (102516, 8) │
├──────────────┼─────────────┤
│ valid_data   │ (25630, 8)  │
├──────────────┼─────────────┤
│ num features │ 8           │
╘══════════════╧═════════════╛


# 模型

In [16]:
class My_Model(nn.Module):
    def __init__(self, input_dim):
        super(My_Model, self).__init__()
        # TODO modify model's structure
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.ReLU(),

            nn.Linear(16, 128),
            nn.ReLU(),

            nn.Linear(128, 512),
            nn.ReLU(),

            nn.Linear(512, 128),
            nn.ReLU(),
            
            nn.Linear(128, 1)
        )

    def forward(self, x):
        x = self.layers(x)
        x = x.squeeze(1)
        return x

# 訓練

In [22]:
def trainer(train_loader, valid_loader, model, config, device):

    criterion = nn.MSELoss(reduction='mean')

    # TODO L2 regularization
    # TODO different optimizer
    # TODO weight decay
    
    optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'], momentum=0.9) 

    now = time.strftime("%y%m%d-%H%M%S", time.localtime())
    writer = SummaryWriter(log_dir=f'./runs/{now}')

    n_epochs, best_loss, step, early_stop_count = config['n_epochs'], math.inf, 0, 0

    train_pbar = tqdm(range(n_epochs), position=0, leave=True)
    
    for epoch in train_pbar:

        # ---------------------------------------------------------------------------- #
        #                              ONE EPOCH TRAINING                              #
        # ---------------------------------------------------------------------------- #
        
        model.train()
        loss_record = []
        n_train = 0
        for x, y in train_loader:
            optimizer.zero_grad()
            x, y = x.to(device), y.to(device)
            pred = model(x)
            loss = criterion(pred, y)
            loss.backward()
            optimizer.step()
            step += 1
            loss_record.append(loss.detach().item())
            n_train += len(y)
        mean_train_loss = sum(loss_record)/n_train
        
        # ---------------------------------------------------------------------------- #
        #                             ONE EPOCH VALIDATION                             #
        # ---------------------------------------------------------------------------- #
        
        model.eval()
        loss_record = []
        n_valid = 0
        for x, y in valid_loader:
            x, y = x.to(device), y.to(device)
            with torch.no_grad():
                pred = model(x)
                loss = criterion(pred, y)
            loss_record.append(loss.item())
            n_valid += len(y)
        mean_valid_loss = sum(loss_record)/n_valid

        # ---------------------------------------------------------------------------- #
        #                               ONE EPOCH SUMMARY                              #
        # ---------------------------------------------------------------------------- #

        train_pbar.set_description(f'Epoch [{epoch+1}/{n_epochs}]')
        train_pbar.set_postfix({
            'TL': mean_train_loss,
            'VL': mean_valid_loss,
            'BL': best_loss,
        })
        writer.add_scalars("Loss", {'train': mean_train_loss, 'valid': mean_valid_loss}, step)

        # ---------------------------------------------------------------------------- #
        #                               ONE EPOCH SAVING                               #
        # ---------------------------------------------------------------------------- #

        if mean_valid_loss < best_loss:
            best_loss = mean_valid_loss
            torch.save(model.state_dict(), config['save_path'])
            early_stop_count = 0
        else: 
            early_stop_count += 1
        if early_stop_count >= config['early_stop']:
            print('Model is not improving, so we halt the training session.')
            return

In [23]:
model = My_Model(input_dim=X_train.shape[1]).to(device)

trainer(train_loader, valid_loader, model, config, device)

Epoch [970/1000]:  97%|█████████▋| 969/1000 [28:59<00:55,  1.80s/it, TL=4.97, VL=5.39, BL=5.23]

Model is not improving, so we halt the training session.





# 預測

In [None]:
def predict(test_loader, model, device):
    model.eval()
    preds = []
    for x in tqdm(test_loader):
        x = x.to(device)                        
        with torch.no_grad():                   
            pred = model(x)                     
            preds.append(pred.detach().cpu())   
    preds = torch.cat(preds, dim=0).numpy()  
    return preds

In [None]:
def save_pred(preds, file):
    ''' Save predictions to specified file '''
    with open(file, 'w') as fp:
        writer = csv.writer(fp)
        writer.writerow(['id', 'tested_positive'])
        for i, p in enumerate(preds):
            writer.writerow([i, p])

model = My_Model(input_dim=x_train.shape[1]).to(device)
model.load_state_dict(torch.load(config['save_path']))
preds = predict(test_loader, model, device) 
save_pred(preds, 'pred.csv')         

In [1]:
%reload_ext tensorboard
%tensorboard --logdir=./runs/