In [32]:
import os
import errno
import torch
from torch import nn, optim
from os import path
import sys
import gzip
import urllib.request as request

import pandas as pd
import numpy as np
from torch.autograd import Variable

from sklearn.decomposition import PCA

In [2]:
# Workaround: the data files are looked up in `<cwd>/../resources`, i.e. the
# location depends on the current working directory of the kernel.
PREFIX_PATH = os.path.abspath(os.path.join(os.getcwd(), '..', 'resources'))

# Training set, test set and test-set labels, in that order.
train_path, test_path, target_test_path = (
    os.path.join(PREFIX_PATH, file_name)
    for file_name in (
        'sp1s_aa_train.txt',
        'sp1s_aa_test.txt',
        'labels_data_set_iv.txt',
    )
)

# Reading Data

In [3]:
# Training file: column 0 is used as the label, the remaining columns as the
# feature matrix.
all_train = np.loadtxt(train_path)
y_train = torch.from_numpy(all_train[:, 0]).long()
X_train = torch.from_numpy(all_train[:, 1:]).float()

# Test labels and test features are read from two separate files.
y_test = torch.from_numpy(np.loadtxt(target_test_path)).long()
X_test = torch.from_numpy(np.loadtxt(test_path)).float()

In [4]:
def train(model, loss, optimizer, x, y):
    """Run one optimisation step on a single mini-batch.

    Parameters
    ----------
    model : callable
        Module mapping the input batch ``x`` to predictions.
    loss : callable
        Criterion invoked as ``loss(predictions, y)``; must return a scalar
        tensor supporting ``backward()``.
    optimizer : object
        Optimizer exposing ``zero_grad()`` and ``step()`` for the model's
        parameters.
    x, y : torch.Tensor
        Input batch and the matching targets.

    Returns
    -------
    float
        The loss value of this batch as a plain Python number.
    """
    # Gradients accumulate across backward() calls, so clear the previous
    # step's gradients before computing new ones.
    optimizer.zero_grad()

    # Call the modules rather than `.forward` so registered hooks still run.
    fx = model(x)
    output = loss(fx, y)
    output.backward()
    optimizer.step()

    # The loss is a 0-dim tensor; indexing it (`output.data[0]`) raises on
    # PyTorch >= 0.4, whereas `.item()` extracts the scalar value.
    return output.item()

# Basic Analysis

In [30]:
# Percentage of labels equal to 1. The label sum is converted to a plain
# Python int first so that the division, `round` and string formatting all
# operate on Python numbers instead of a tensor.
n_positive = int(y_train.sum())
print('% of target=1: {}%'.format(round(n_positive * 100 / len(y_train), 2)))

% of target=1: 49.68%


The data set is balanced.

Let's check whether, for each feature, the mean is about the same for both targets. If a feature's mean differs substantially between the two targets `{0,1}`, then that feature is highly discriminative.

In [None]:
# NOTE(review): `y` is not defined at notebook scope in the visible cells (it
# only appears as a parameter of `train`), so this cell raises NameError on a
# fresh kernel. Presumably an unfinished per-class split of `y_train` — TODO
# confirm the intent or remove the cell.
pos0 = y

In [43]:
# Fit a PCA that keeps as many components as there are training rows, then
# show the fraction of total variance captured by the first three components.
n_train_rows = len(X_train)
pca = PCA(n_components=n_train_rows)
pca.fit_transform(X_train)  # projected data is discarded; only the fit is used
leading_variance_ratios = pca.explained_variance_ratio_[0:3]
leading_variance_ratios.sum()

0.8532811953885572

# Linear Model

In [13]:
# Fix the RNG state so the layer initialisation below is reproducible.
torch.manual_seed(123)


def linear_model(input_dim, output_dim):
    """Return a Sequential containing one bias-free Linear layer.

    The layer maps ``input_dim`` features to ``output_dim`` outputs and is
    registered under the name ``"linear"``.
    """
    linear_layer = torch.nn.Linear(input_dim, output_dim, bias=False)
    net = torch.nn.Sequential()
    net.add_module("linear", linear_layer)
    return net

n_examples, n_features = X_train.size()
n_classes = 2
model = linear_model(n_features, n_classes)
# NOTE(review): `size_average` is deprecated in recent PyTorch releases in
# favour of `reduction='mean'`; kept as-is so the cell still runs on the
# version the recorded output was produced with.
loss = torch.nn.CrossEntropyLoss(size_average=True)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
batch_size = 10

# Loop-invariant values, computed once instead of on every epoch.
num_batches = n_examples // batch_size  # a trailing partial batch is skipped
X_to_predict = Variable(X_test, requires_grad=False)
# Compare predictions against a NumPy view of the labels: mixing an ndarray
# with a torch tensor in `==` / `np.mean` is version-dependent.
y_test_np = y_test.numpy()

for i in range(100):
    # Sum of the per-batch losses over this epoch.
    cost = 0.
    for k in range(num_batches):
        start, end = k * batch_size, (k + 1) * batch_size
        cost += train(model, loss, optimizer, X_train[start:end], y_train[start:end])

    # Report every 9th epoch (0, 9, ..., 99); the test set is only scored on
    # those epochs because the predictions are not used anywhere else.
    if i % 9 == 0:
        predictions = model.forward(X_to_predict).data.numpy().argmax(axis=1)
        print("Epoch %d, cost = %f, acc = %.2f%%"
              % (i, cost / num_batches, 100. * np.mean(predictions == y_test_np)))

Epoch 0, cost = 10789.646023, acc = 49.00%
Epoch 9, cost = 10449.838133, acc = 66.00%
Epoch 18, cost = 7158.353319, acc = 70.00%
Epoch 27, cost = 5023.423771, acc = 70.00%
Epoch 36, cost = 2278.812549, acc = 70.00%
Epoch 45, cost = 8602.033896, acc = 56.00%
Epoch 54, cost = 2123.439958, acc = 73.00%
Epoch 63, cost = 3782.842390, acc = 71.00%
Epoch 72, cost = 1582.885766, acc = 71.00%
Epoch 81, cost = 692.542867, acc = 73.00%
Epoch 90, cost = 4523.643245, acc = 69.00%
Epoch 99, cost = 541.777342, acc = 71.00%


# Linear model with bias

In [14]:
# Fix the RNG state so the layer initialisation below is reproducible.
torch.manual_seed(123)


def linear_model(input_dim, output_dim):
    """Return a Sequential containing one Linear layer with a bias term.

    The layer maps ``input_dim`` features to ``output_dim`` outputs and is
    registered under the name ``"linear"``.
    """
    linear_layer = torch.nn.Linear(input_dim, output_dim, bias=True)
    net = torch.nn.Sequential()
    net.add_module("linear", linear_layer)
    return net

n_examples, n_features = X_train.size()
n_classes = 2
model = linear_model(n_features, n_classes)
# NOTE(review): `size_average` is deprecated in recent PyTorch releases in
# favour of `reduction='mean'`; kept as-is so the cell still runs on the
# version the recorded output was produced with.
loss = torch.nn.CrossEntropyLoss(size_average=True)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
batch_size = 10

# Loop-invariant values, computed once instead of on every epoch.
num_batches = n_examples // batch_size  # a trailing partial batch is skipped
X_to_predict = Variable(X_test, requires_grad=False)
# Compare predictions against a NumPy view of the labels: mixing an ndarray
# with a torch tensor in `==` / `np.mean` is version-dependent.
y_test_np = y_test.numpy()

for i in range(100):
    # Sum of the per-batch losses over this epoch.
    cost = 0.
    for k in range(num_batches):
        start, end = k * batch_size, (k + 1) * batch_size
        cost += train(model, loss, optimizer, X_train[start:end], y_train[start:end])

    # Report every 9th epoch (0, 9, ..., 99); the test set is only scored on
    # those epochs because the predictions are not used anywhere else.
    if i % 9 == 0:
        predictions = model.forward(X_to_predict).data.numpy().argmax(axis=1)
        print("Epoch %d, cost = %f, acc = %.2f%%"
              % (i, cost / num_batches, 100. * np.mean(predictions == y_test_np)))

Epoch 0, cost = 10790.132277, acc = 49.00%
Epoch 9, cost = 4044.321712, acc = 63.00%
Epoch 18, cost = 7549.645196, acc = 67.00%
Epoch 27, cost = 2149.728992, acc = 63.00%
Epoch 36, cost = 4044.727492, acc = 57.00%
Epoch 45, cost = 5866.386057, acc = 65.00%
Epoch 54, cost = 6233.332675, acc = 74.00%
Epoch 63, cost = 1143.681443, acc = 71.00%
Epoch 72, cost = 1107.908116, acc = 73.00%
Epoch 81, cost = 1321.056231, acc = 73.00%
Epoch 90, cost = 765.724606, acc = 71.00%
Epoch 99, cost = 1031.378899, acc = 73.00%


# Linear model with Dropout

In [None]:
# Fix the RNG state so initialisation and dropout masks are reproducible.
torch.manual_seed(123)

def linear_model(input_dim, output_dim, dropout_p=0.5):
    """Build a linear classifier with dropout applied to its inputs.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    output_dim : int
        Number of outputs (one score per class).
    dropout_p : float, optional
        Probability of zeroing each input feature while the model is in
        training mode. Defaults to 0.5.

    Returns
    -------
    torch.nn.Sequential
        A ``"dropout"`` layer followed by a ``"linear"`` layer with bias.
    """
    model = torch.nn.Sequential()
    # The original cell was a copy of the bias-only model and contained no
    # dropout layer at all, despite the section title.
    model.add_module("dropout", torch.nn.Dropout(p=dropout_p))
    model.add_module("linear",
                     torch.nn.Linear(input_dim, output_dim, bias=True))
    return model

n_examples, n_features = X_train.size()
n_classes = 2
model = linear_model(n_features, n_classes)
loss = torch.nn.CrossEntropyLoss(size_average=True)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
batch_size = 10

# Loop-invariant values, computed once instead of on every epoch.
num_batches = n_examples // batch_size  # a trailing partial batch is skipped
X_to_predict = Variable(X_test, requires_grad=False)
# Compare predictions against a NumPy view of the labels: mixing an ndarray
# with a torch tensor in `==` / `np.mean` is version-dependent.
y_test_np = y_test.numpy()

for i in range(100):
    # Dropout is only active in training mode; re-enable it every epoch
    # because the evaluation below switches the model to eval mode.
    model.train()
    cost = 0.
    for k in range(num_batches):
        start, end = k * batch_size, (k + 1) * batch_size
        cost += train(model, loss, optimizer, X_train[start:end], y_train[start:end])

    # Report every 9th epoch (0, 9, ..., 99).
    if i % 9 == 0:
        # Disable dropout so test predictions use every input feature.
        model.eval()
        predictions = model.forward(X_to_predict).data.numpy().argmax(axis=1)
        print("Epoch %d, cost = %f, acc = %.2f%%"
              % (i, cost / num_batches, 100. * np.mean(predictions == y_test_np)))

# Conclusion for now...