In [1]:
import math
import random
import os
from pathlib import Path 
import time
from tqdm import tqdm

from sklearn.model_selection import train_test_split

import data_reader
import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

### define model

In [2]:
class NNClassifier(nn.Module):
    def __init__(self, hidden_num=1, dropout_p=None,
                 input_dim=174, hidden_dim=128, class_num=2):
        super(NNClassifier, self).__init__()
        # loss
        self.loss = nn.CrossEntropyLoss()
        # hidden-hidden fcs
        self.hiddens = [nn.Linear(input_dim, hidden_dim) for _ in range(hidden_num-1)]
        # insert input-hidden fc
        self.hiddens.insert(0, nn.Linear(input_dim, hidden_dim))
        # dropout layers
        self.dropout_p = dropout_p
        if dropout_p is not None:
            self.drops = [nn.Dropout(p=dropout_p) for _ in range(hidden_num)]
        # output layer
        self.out = nn.Linear(hidden_dim, class_num)
        # dropout
        
        
    def forward(self, x):
        for i in range(len(self.hiddens)):
            x = F.relu(self.hiddens[i](x))
            if self.dropout_p is not None:
                x = self.drops[i](x)
        x = self.out(x)
        x = x.squeeze()
        val, idx = torch.max(x, dim=1)
        return x, idx
    
    def compute_loss(self, pred_vec, gold_vec):
        return self.loss(pred_vec, gold_vec)

### wrap-up

In [3]:
def make_a_try(X_train, X_test, Y_train, Y_test,
               hidden_num, dropout_p, lr, epoch_num,
               label_index, debug_mode=True):
    debug_report_seg = epoch_num // 10
    train_size, input_dim = X_train.shape
    model = NNClassifier(input_dim=input_dim, dropout_p=dropout_p)
    optimizer = optim.SGD(model.parameters(), lr=0.5)
    #optim.Adam(model.parameters())
    optimizer.zero_grad()
    model.train()
    start_train = time.time()
    for epoch in range(epoch_num):
        optimizer.zero_grad()
        inputs = torch.tensor(X_train).float()
        golds  = torch.tensor(Y_train[:,label_index]).long()
        pred_vals, pred_labels  = model(inputs)

        loss = model.compute_loss(pred_vals, golds)

        if debug_mode and epoch % debug_report_seg == 0:
            acc = golds.eq(pred_labels).sum().float() / train_size
            print("epoch {}, loss = {}, acc = {}".format(epoch, loss, acc))

        loss.backward()
        optimizer.step()

    acc = golds.eq(pred_labels).sum().float() / train_size
    print("training: loss = {}, acc = {}".format(loss, acc))

    model.eval()
    test_size, input_dim = X_test.shape
    inputs = torch.tensor(X_test).float()
    golds  = torch.tensor(Y_test[:,label_index]).long()
    pred_vals, pred_labels  = model(inputs)
    loss = model.compute_loss(pred_vals, golds)
    acc = golds.eq(pred_labels).sum().float() / test_size
    print("test: loss = {}, acc = {}".format(loss, acc))
    
    return model
    

### get data

In [4]:
import numpy as np

arr = data_reader.read()
arr = arr[(np.abs(arr[:,-2])>=0.3) & (np.abs(arr[:,-1])>=0.3)]
X = arr[:, :-2]
Y = arr[:, -2:]
X_train, X_test, Y_train, Y_test = \
train_test_split(X, Y, test_size=0.20)

Y_train = (Y_train >= 0).astype(int)
Y_test = (Y_test >= 0).astype(int)

print(len(X))

235


### 1 layer, no dropout

In [5]:
print("=== ch ===")
model_ch = make_a_try(X_train, X_test, Y_train, Y_test,
           hidden_num=1, dropout_p=None, lr=0.5, epoch_num=1000,
           label_index=0, debug_mode=False)
print("=== en ===")
model_en = make_a_try(X_train, X_test, Y_train, Y_test,
           hidden_num=1, dropout_p=None, lr=0.5, epoch_num=1000,
           label_index=1, debug_mode=False)

=== ch ===
training: loss = 0.33210134506225586, acc = 0.8776595592498779
test: loss = 0.7206650376319885, acc = 0.6808510422706604
=== en ===
training: loss = 0.31581735610961914, acc = 0.8723404407501221
test: loss = 0.5636088252067566, acc = 0.6382978558540344


### 3 layer, no dropout

In [6]:
print("=== ch ===")
model_ch = make_a_try(X_train, X_test, Y_train, Y_test,
           hidden_num=3, dropout_p=None, lr=0.5, epoch_num=1000,
           label_index=0, debug_mode=False)
print("=== en ===")
model_en = make_a_try(X_train, X_test, Y_train, Y_test,
           hidden_num=3, dropout_p=None, lr=0.5, epoch_num=1000,
           label_index=1, debug_mode=False)

=== ch ===
training: loss = 0.30260783433914185, acc = 0.9095744490623474
test: loss = 0.5958088040351868, acc = 0.6808510422706604
=== en ===
training: loss = 0.3044499456882477, acc = 0.914893627166748
test: loss = 0.7423840761184692, acc = 0.6170212626457214


### 1 layer, dropout = 0.1

In [7]:
print("=== ch ===")
model_ch = make_a_try(X_train, X_test, Y_train, Y_test,
           hidden_num=1, dropout_p=0.1, lr=0.5, epoch_num=1000,
           label_index=0, debug_mode=False)
print("=== en ===")
model_en = make_a_try(X_train, X_test, Y_train, Y_test,
           hidden_num=1, dropout_p=0.1, lr=0.5, epoch_num=1000,
           label_index=1, debug_mode=False)

=== ch ===
training: loss = 0.3738526403903961, acc = 0.8404255509376526
test: loss = 0.6228732466697693, acc = 0.6595744490623474
=== en ===
training: loss = 0.37097710371017456, acc = 0.8244680762290955
test: loss = 0.7804177403450012, acc = 0.5744680762290955


### 3 layer, dropout = 0.1

In [8]:
print("=== ch ===")
model_ch = make_a_try(X_train, X_test, Y_train, Y_test,
           hidden_num=3, dropout_p=0.1, lr=0.5, epoch_num=1000,
           label_index=0, debug_mode=False)
print("=== en ===")
model_en = make_a_try(X_train, X_test, Y_train, Y_test,
           hidden_num=3, dropout_p=0.1, lr=0.5, epoch_num=1000,
           label_index=1, debug_mode=False)

=== ch ===
training: loss = 0.3547871708869934, acc = 0.8617021441459656
test: loss = 0.746073842048645, acc = 0.5531914830207825
=== en ===
training: loss = 0.3603885769844055, acc = 0.8617021441459656
test: loss = 0.8338097929954529, acc = 0.6595744490623474


### 10 layer, dropout = 0.1

In [9]:
print("=== ch ===")
model_ch = make_a_try(X_train, X_test, Y_train, Y_test,
           hidden_num=10, dropout_p=0.1, lr=0.5, epoch_num=1000,
           label_index=0, debug_mode=False)
print("=== en ===")
model_en = make_a_try(X_train, X_test, Y_train, Y_test,
           hidden_num=10, dropout_p=0.1, lr=0.5, epoch_num=1000,
           label_index=1, debug_mode=False)

=== ch ===
training: loss = 0.37597331404685974, acc = 0.8404255509376526
test: loss = 0.8649161458015442, acc = 0.4893617033958435
=== en ===
training: loss = 0.433123379945755, acc = 0.792553186416626
test: loss = 0.6237226724624634, acc = 0.7021276354789734


----

## check "very different"

### get {id : (ch_label, en_label)}

In [10]:
# using 10 layer, dropout = 0.1

print("=== ch ===")
model_ch = make_a_try(X_train, X_test, Y_train, Y_test,
           hidden_num=10, dropout_p=0.1, lr=0.5, epoch_num=1000,
           label_index=0, debug_mode=False)
print("=== en ===")
model_en = make_a_try(X_train, X_test, Y_train, Y_test,
           hidden_num=10, dropout_p=0.1, lr=0.5, epoch_num=1000,
           label_index=1, debug_mode=False)

=== ch ===
training: loss = 0.37644511461257935, acc = 0.8563829660415649
test: loss = 0.5789474844932556, acc = 0.7021276354789734
=== en ===
training: loss = 0.40339311957359314, acc = 0.8457446694374084
test: loss = 0.649861216545105, acc = 0.7021276354789734


In [11]:
def get_pred(model, X_test):
    model.eval()
    test_size, input_dim = X_test.shape
    inputs = torch.tensor(X_test).float()
    pred_vals, pred_labels  = model(inputs)
    return pred_labels.detach().numpy().tolist()

In [17]:
arr = data_reader.read()
#arr = arr[(np.abs(arr[:,-2])>=0.3) & (np.abs(arr[:,-1])>=0.3)]
X = arr[:, :-2]
Y = arr[:, -2:]
X_train, X_test, Y_train, Y_test = \
train_test_split(X, Y, test_size=0.20)

Y_train = (Y_train >= 0).astype(int)
Y_test = (Y_test >= 0).astype(int)


pred_ch = get_pred(model_ch, X)
pred_en = get_pred(model_en, X)
very_diff_id_pred = set([i for i in range(len(pred_ch)) if pred_ch[i] != pred_en[i]])

### compare to "label_d_z"

In [18]:
import pandas as pd
df = pd.read_csv("../pol600withLabel.csv", encoding="utf-8", index_col=2)
label_d_z = df["label_d_z"]

In [19]:
very_diff_id_label = set([i for i in range(len(label_d_z)) if label_d_z.get(i) == 1])

In [20]:
p = len(very_diff_id_pred - very_diff_id_label) / len(very_diff_id_pred)
r = len(very_diff_id_label - very_diff_id_pred) / len(very_diff_id_label)
f = 2*p*r / (p+r)
print(p, r, f)

0.8333333333333334 0.625 0.7142857142857143


In [21]:
len(very_diff_id_pred)

198

------
## depricated, DO NOT USE

### train

In [174]:
train_size, input_dim = X_train.shape

one_layer_nn = NNClassifier(input_dim=input_dim)
optimizer = optim.SGD(one_layer_nn.parameters(), lr=0.5)
#optim.Adam(one_layer_nn.parameters())
optimizer.zero_grad()


#print("Training CH")

label_index = 0

one_layer_nn.train()
start_train = time.time()
for epoch in range(1000):
    
    optimizer.zero_grad()
    inputs = torch.tensor(X_train).float()
    golds  = torch.tensor(Y_train[:,0]).long()
    pred_vals, pred_labels  = one_layer_nn(inputs)
    
    loss = one_layer_nn.compute_loss(pred_vals, golds)

#     if epoch % 100 == 0:
#         acc = golds.eq(pred_labels).sum().float() / train_size
#         print("epoch {}, loss = {}, acc = {}".format(epoch, loss, acc))

    loss.backward()
    optimizer.step()

acc = golds.eq(pred_labels).sum().float() / train_size
print("after training: loss = {}, acc = {}".format(loss, acc))

#     if epoch == 900:
#         #print(golds)
#         #print(torch.cat((preds, preds.ge(0.0).float()), dim=1))
#         print(golds.eq(preds.ge(0.0).float()).sum().float() / train_size)
    
    

Training CH
after training: loss = 0.516764223575592, acc = 0.7416666746139526


### test

In [175]:
one_layer_nn.eval()

test_size, input_dim = X_test.shape

inputs = torch.tensor(X_test).float()
golds  = torch.tensor(Y_test[:,0]).long()
pred_vals, pred_labels  = one_layer_nn(inputs)

loss = one_layer_nn.compute_loss(pred_vals, golds)
acc = golds.eq(pred_labels).sum().float() / test_size

print(loss, acc)

tensor(0.6565, grad_fn=<NllLossBackward>) tensor(0.6083)


In [4]:
class DropOutNN(nn.Module):
    def __init__(self, input_dim=174, hidden_dim=128, output_dim=1):
        super(DropOutNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.drop = nn.Dropout(p=0.1)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.drop(x)
        x = self.fc2(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features        

In [2]:
class OneLayerNN(nn.Module):

    def __init__(self, input_dim=174, hidden_dim=128, output_dim=1):
        super(OneLayerNN, self).__init__()
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features