In [8]:
import math
import random
import os
from pathlib import Path 
import time
from tqdm import tqdm

from sklearn.model_selection import train_test_split

import data_reader
import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

### define model

In [9]:
class NNClassifier(nn.Module):
    def __init__(self, hidden_num=1, dropout_p=None,
                 input_dim=174, hidden_dim=128, class_num=2):
        super(NNClassifier, self).__init__()
        # loss
        self.loss = nn.CrossEntropyLoss()
        # hidden-hidden fcs
        self.hiddens = [nn.Linear(input_dim, hidden_dim) for _ in range(hidden_num-1)]
        # insert input-hidden fc
        self.hiddens.insert(0, nn.Linear(input_dim, hidden_dim))
        # dropout layers
        self.dropout_p = dropout_p
        if dropout_p is not None:
            self.drops = [nn.Dropout(p=dropout_p) for _ in range(hidden_num)]
        # output layer
        self.out = nn.Linear(hidden_dim, class_num)
        # dropout
        
        
    def forward(self, x):
        for i in range(len(self.hiddens)):
            x = F.relu(self.hiddens[i](x))
            if self.dropout_p is not None:
                x = self.drops[i](x)
        x = self.out(x)
        x = x.squeeze()
        val, idx = torch.max(x, dim=1)
        return x, idx
    
    def compute_loss(self, pred_vec, gold_vec):
        return self.loss(pred_vec, gold_vec)

### wrap-up

In [10]:
def make_a_try(X_train, X_test, Y_train, Y_test,
               hidden_num, dropout_p, lr, epoch_num,
               label_index, debug_mode=True):
    debug_report_seg = epoch_num // 10
    train_size, input_dim = X_train.shape
    model = NNClassifier(input_dim=input_dim, dropout_p=dropout_p)
    optimizer = optim.SGD(model.parameters(), lr=0.5)
    #optim.Adam(model.parameters())
    optimizer.zero_grad()
    model.train()
    start_train = time.time()
    for epoch in range(epoch_num):
        optimizer.zero_grad()
        inputs = torch.tensor(X_train).float()
        golds  = torch.tensor(Y_train[:,label_index]).long()
        pred_vals, pred_labels  = model(inputs)
        
        if debug_mode:
            print(golds)
            debug_mode=False

        loss = model.compute_loss(pred_vals, golds)

        if debug_mode and epoch % debug_report_seg == 0:
            acc = golds.eq(pred_labels).sum().float() / train_size
            print("epoch {}, loss = {}, acc = {}".format(epoch, loss, acc))

        loss.backward()
        optimizer.step()

    acc = golds.eq(pred_labels).sum().float() / train_size
    print("training: loss = {}, acc = {}".format(loss, acc))

    model.eval()
    test_size, input_dim = X_test.shape
    inputs = torch.tensor(X_test).float()
    golds  = torch.tensor(Y_test[:,label_index]).long()
    pred_vals, pred_labels  = model(inputs)
    loss = model.compute_loss(pred_vals, golds)
    acc = golds.eq(pred_labels).sum().float() / test_size
    print("test: loss = {}, acc = {}".format(loss, acc))
    
    return model
    

### get data

In [11]:
arr = data_reader.read()
X = arr[:, :-2]
Y = arr[:, -2:]
X_train, X_test, Y_train, Y_test = \
train_test_split(X, Y, test_size=0.20)

Y_train = (Y_train >= 0).astype(int)
Y_test = (Y_test >= 0).astype(int)

### 1 layer, no dropout

In [12]:
print("=== ch ===")
model_ch = make_a_try(X_train, X_test, Y_train, Y_test,
           hidden_num=1, dropout_p=None, lr=0.5, epoch_num=1000,
           label_index=0, debug_mode=True)
print("=== en ===")
model_en = make_a_try(X_train, X_test, Y_train, Y_test,
           hidden_num=1, dropout_p=None, lr=0.5, epoch_num=1000,
           label_index=1, debug_mode=False)

=== ch ===
tensor([1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
        0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0,
        1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
        1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,
        0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0,
        0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
        1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
        0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1,
        0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
        0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1,
        1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1,
        0, 0, 1, 1, 1, 1, 0, 

### 3 layer, no dropout

In [6]:
print("=== ch ===")
model_ch = make_a_try(X_train, X_test, Y_train, Y_test,
           hidden_num=3, dropout_p=None, lr=0.5, epoch_num=1000,
           label_index=0, debug_mode=False)
print("=== en ===")
model_en = make_a_try(X_train, X_test, Y_train, Y_test,
           hidden_num=3, dropout_p=None, lr=0.5, epoch_num=1000,
           label_index=1, debug_mode=False)

=== ch ===
training: loss = 0.48813948035240173, acc = 0.7583333253860474
test: loss = 0.6759177446365356, acc = 0.6333333253860474
=== en ===
training: loss = 0.5077177882194519, acc = 0.7458333373069763
test: loss = 0.7061546444892883, acc = 0.574999988079071


### 1 layer, dropout = 0.1

In [7]:
print("=== ch ===")
model_ch = make_a_try(X_train, X_test, Y_train, Y_test,
           hidden_num=1, dropout_p=0.1, lr=0.5, epoch_num=1000,
           label_index=0, debug_mode=False)
print("=== en ===")
model_en = make_a_try(X_train, X_test, Y_train, Y_test,
           hidden_num=1, dropout_p=0.1, lr=0.5, epoch_num=1000,
           label_index=1, debug_mode=False)

=== ch ===
training: loss = 0.516624927520752, acc = 0.7541666626930237
test: loss = 0.6964011788368225, acc = 0.6416666507720947
=== en ===
training: loss = 0.5272605419158936, acc = 0.7354166507720947
test: loss = 0.7110181450843811, acc = 0.5916666388511658


### 3 layer, dropout = 0.1

In [8]:
print("=== ch ===")
model_ch = make_a_try(X_train, X_test, Y_train, Y_test,
           hidden_num=3, dropout_p=0.1, lr=0.5, epoch_num=1000,
           label_index=0, debug_mode=False)
print("=== en ===")
model_en = make_a_try(X_train, X_test, Y_train, Y_test,
           hidden_num=3, dropout_p=0.1, lr=0.5, epoch_num=1000,
           label_index=1, debug_mode=False)

=== ch ===
training: loss = 0.5135113596916199, acc = 0.7645833492279053
test: loss = 0.7103562951087952, acc = 0.6499999761581421
=== en ===
training: loss = 0.5278018712997437, acc = 0.7437499761581421
test: loss = 0.6936649680137634, acc = 0.5833333134651184


### 10 layer, dropout = 0.1

In [9]:
print("=== ch ===")
model_ch = make_a_try(X_train, X_test, Y_train, Y_test,
           hidden_num=10, dropout_p=0.1, lr=0.5, epoch_num=1000,
           label_index=0, debug_mode=False)
print("=== en ===")
model_en = make_a_try(X_train, X_test, Y_train, Y_test,
           hidden_num=10, dropout_p=0.1, lr=0.5, epoch_num=1000,
           label_index=1, debug_mode=False)

=== ch ===
training: loss = 0.5167195200920105, acc = 0.737500011920929
test: loss = 0.6822354793548584, acc = 0.6166666746139526
=== en ===
training: loss = 0.5232189297676086, acc = 0.7583333253860474
test: loss = 0.7480103373527527, acc = 0.6083333492279053


----

## check "very different"

### get {id : (ch_label, en_label)}

In [10]:
# using 1 layer, dropout = 0.1

print("=== training ch ===")
model_ch = make_a_try(X_train, X_test, Y_train, Y_test,
           hidden_num=1, dropout_p=0.1, lr=0.5, epoch_num=1000,
           label_index=0, debug_mode=False)
print("=== training en ===")
model_en = make_a_try(X_train, X_test, Y_train, Y_test,
           hidden_num=1, dropout_p=0.1, lr=0.5, epoch_num=1000,
           label_index=1, debug_mode=False)

=== training ch ===
training: loss = 0.5242159962654114, acc = 0.7333333492279053
test: loss = 0.7151191234588623, acc = 0.6166666746139526
=== training en ===
training: loss = 0.5276638865470886, acc = 0.7541666626930237
test: loss = 0.6732989549636841, acc = 0.6083333492279053


In [11]:
def get_pred(model, X_test):
    model.eval()
    test_size, input_dim = X_test.shape
    inputs = torch.tensor(X_test).float()
    pred_vals, pred_labels  = model(inputs)
    return pred_labels.detach().numpy().tolist()

In [12]:
pred_ch = get_pred(model_ch, X)
pred_en = get_pred(model_en, X)
very_diff_id_pred = set([i for i in range(len(pred_ch)) if pred_ch[i] != pred_en[i]])

### compare to "label_d_z"

In [13]:
import pandas as pd
df = pd.read_csv("../pol600withLabel.csv", encoding="utf-8", index_col=2)
label_d_z = df["label_d_z"]

In [14]:
very_diff_id_label = set([i for i in range(len(label_d_z)) if label_d_z.get(i) == 1])

In [15]:
p = len(very_diff_id_pred - very_diff_id_label) / len(very_diff_id_pred)
r = len(very_diff_id_label - very_diff_id_pred) / len(very_diff_id_label)
f = 2*p*r / (p+r)
print(p, r, f)

0.8312236286919831 0.5454545454545454 0.6586792978545556


In [19]:
len(very_diff_id_pred)

237

------
## depricated, DO NOT USE

### train

In [16]:
train_size, input_dim = X_train.shape

one_layer_nn = NNClassifier(input_dim=input_dim)
optimizer = optim.SGD(one_layer_nn.parameters(), lr=0.5)
#optim.Adam(one_layer_nn.parameters())
optimizer.zero_grad()


#print("Training CH")

label_index = 0

one_layer_nn.train()
start_train = time.time()
for epoch in range(1000):
    
    optimizer.zero_grad()
    inputs = torch.tensor(X_train).float()
    golds  = torch.tensor(Y_train[:,0]).long()
    pred_vals, pred_labels  = one_layer_nn(inputs)
    
    loss = one_layer_nn.compute_loss(pred_vals, golds)

#     if epoch % 100 == 0:
#         acc = golds.eq(pred_labels).sum().float() / train_size
#         print("epoch {}, loss = {}, acc = {}".format(epoch, loss, acc))

    loss.backward()
    optimizer.step()

acc = golds.eq(pred_labels).sum().float() / train_size
print("after training: loss = {}, acc = {}".format(loss, acc))

#     if epoch == 900:
#         #print(golds)
#         #print(torch.cat((preds, preds.ge(0.0).float()), dim=1))
#         print(golds.eq(preds.ge(0.0).float()).sum().float() / train_size)
    
    

after training: loss = 0.5002903342247009, acc = 0.75


### test

In [175]:
one_layer_nn.eval()

test_size, input_dim = X_test.shape

inputs = torch.tensor(X_test).float()
golds  = torch.tensor(Y_test[:,0]).long()
pred_vals, pred_labels  = one_layer_nn(inputs)

loss = one_layer_nn.compute_loss(pred_vals, golds)
acc = golds.eq(pred_labels).sum().float() / test_size

print(loss, acc)

tensor(0.6565, grad_fn=<NllLossBackward>) tensor(0.6083)


In [4]:
class DropOutNN(nn.Module):
    def __init__(self, input_dim=174, hidden_dim=128, output_dim=1):
        super(DropOutNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.drop = nn.Dropout(p=0.1)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.drop(x)
        x = self.fc2(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features        

In [2]:
class OneLayerNN(nn.Module):

    def __init__(self, input_dim=174, hidden_dim=128, output_dim=1):
        super(OneLayerNN, self).__init__()
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features