In [3]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix
from collections import defaultdict
import operator

In [4]:
def get_x(data_dir='basenp_novi', n_seqs=823, n_features=6438):
    """Load the per-sequence feature files as dense 0/1 matrices.

    Reads ``<data_dir>/1.x`` through ``<data_dir>/<n_seqs>.x``. Every line
    must hold exactly three tab-separated fields: a 1-based word position,
    a feature column index, and a third field that is ignored.

    Parameters
    ----------
    data_dir : str
        Directory containing the ``.x`` files.
    n_seqs : int
        Number of files to read; they are numbered from 1.
    n_features : int
        Number of feature columns in every returned matrix.

    Returns
    -------
    list of np.ndarray
        One float array per file with shape ``(n_words, n_features)``, where
        ``n_words`` is the largest word position found in the file (0 for an
        empty file). Cells named by the file are 1.0, all others 0.0.

    Raises
    ------
    ValueError
        If a line does not split into three fields or a field is not an int.
    IndexError
        If a feature index does not fit in ``n_features`` columns.
    """
    seqs = []
    for i in range(1, n_seqs + 1):
        rows, cols = [], []
        with open(data_dir + '/' + str(i) + '.x') as file:
            for line in file:
                word, feature, _ = line.split('\t')
                rows.append(int(word) - 1)  # file positions are 1-based
                cols.append(int(feature))
        # Size the matrix from the largest position seen, not the last line,
        # and allocate exactly that many rows so each result owns only the
        # memory it needs instead of being a view into an oversized buffer.
        n_words = max(rows) + 1 if rows else 0
        word_features = np.zeros([n_words, n_features])
        if rows:
            word_features[rows, cols] = 1.
        seqs.append(word_features)
    return seqs

def get_y(data_dir='basenp_novi', n_seqs=823):
    """Load the per-sequence label files.

    Reads ``<data_dir>/1.y`` through ``<data_dir>/<n_seqs>.y``; every line
    holds one integer label.

    Parameters
    ----------
    data_dir : str
        Directory containing the ``.y`` files.
    n_seqs : int
        Number of files to read; they are numbered from 1.

    Returns
    -------
    list of np.ndarray
        One 1-D array of labels per file, in file order.

    Raises
    ------
    ValueError
        If a line (including a blank one) cannot be parsed as an integer.
    """
    labels = []
    for i in range(1, n_seqs + 1):
        with open(data_dir + '/' + str(i) + '.y') as file:
            # int() ignores surrounding whitespace, so the line terminator
            # needs no manual stripping; a final line without a newline is
            # therefore parsed in full instead of losing its last digit.
            seq_labels = [int(line) for line in file]
        labels.append(np.array(seq_labels))
    return labels

In [5]:
def pad(x, y):
    """Zero-pad variable-length sequences to a common length and batch them.

    Parameters
    ----------
    x : list of 2-D arrays, each (seq_len, D)
    y : list of 1-D label arrays, paired with ``x`` by position

    Returns
    -------
    (padded_x, padded_y, seq_lens)
        ``padded_x`` is (n, maxlen, D), ``padded_y`` is (n, maxlen) and
        ``seq_lens`` holds each sequence's original length. ``maxlen`` is the
        longest first dimension found in ``x``; padding cells are 0.
    """
    lengths = [seq.shape[0] for seq in x]
    maxlen = np.max(lengths)
    D = x[0].shape[1]

    batch_x, batch_y, seq_lens = [], [], []
    for seq, labels in zip(x, y):
        n_rows = seq.shape[0]
        n_missing = maxlen - n_rows
        # Append all-zero rows / entries after the real data.
        batch_x.append(np.vstack((seq, np.zeros((n_missing, D)))))
        batch_y.append(np.hstack((labels, np.zeros(n_missing))))
        seq_lens.append(n_rows)

    return np.stack(batch_x), np.stack(batch_y), np.array(seq_lens)

def make_classification_data(x, y):
    """Flatten paired sequences into one sample-per-row dataset.

    The per-sequence arrays in ``x`` and ``y`` are paired by position and
    joined along their first axis. Label values are returned unchanged.

    Returns
    -------
    (x_new, y_new) : the concatenated features and the concatenated labels.
    """
    pairs = list(zip(x, y))
    x_new = np.concatenate([seq for seq, _ in pairs])
    y_new = np.concatenate([labels for _, labels in pairs])
    return x_new, y_new

## Classification data

In [6]:
# Load every sequence, then flatten each split into one row per word:
# the first 500 sequences are used for training, the remainder for testing.
x = get_x()
y = get_y()
x_tr, y_tr = make_classification_data(x[:500], y[:500])
x_te, y_te = make_classification_data(x[500:], y[500:])

In [7]:
# Shape of the flattened training feature matrix (rows, feature columns).
x_tr.shape

(11376, 6438)

In [8]:
from sklearn.linear_model import LogisticRegression

In [56]:
# L1-regularised logistic regression on the flattened per-word data.
# The solver is pinned to the one shown in the recorded repr below: the L1
# penalty is not supported by every solver, and the library default has
# changed between scikit-learn releases.
logreg = LogisticRegression(C=1., penalty='l1', solver='liblinear')
logreg.fit(x_tr, y_tr)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [57]:
# Predicted label for every row of the test feature matrix.
preds = logreg.predict(x_te)

In [58]:
# Test accuracy: the fraction of rows whose predicted label equals the true one.
np.mean(preds == y_te)

0.94586967675731148

In [99]:
# Persist the flattened train/test arrays for reuse outside this notebook.
# NOTE(review): assumes the data_class/ directory already exists — confirm,
# or create it before running this cell on a fresh checkout.
np.save('data_class/x_tr', x_tr)
np.save('data_class/x_te', x_te)
np.save('data_class/y_tr', y_tr)
np.save('data_class/y_te', y_te)

## Neural network (NN)

In [26]:
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import np_utils

In [31]:
# Two-layer feed-forward classifier: 6438 inputs -> 10 units -> 3-way softmax.
# NOTE(review): the first Dense layer is given no activation argument —
# confirm that a hidden layer without a non-linearity is intended.
model = Sequential([
    Dense(10, input_shape=(6438,)),
    Dense(3, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [32]:
# One-hot encode the integer labels for the categorical cross-entropy loss.
# The width is pinned to the 3 output units of the softmax layer so that both
# splits get the same number of columns even if one of them happens not to
# contain the highest label.
y_tr_oh = np_utils.to_categorical(y_tr, num_classes=3)
y_te_oh = np_utils.to_categorical(y_te, num_classes=3)

In [34]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 10)                64390     
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 33        
Total params: 64,423
Trainable params: 64,423
Non-trainable params: 0
_________________________________________________________________


In [35]:
# Train with mini-batches of 10 and evaluate on the test split after each epoch.
# The epoch count is stated explicitly: the recorded log below shows a 10-epoch
# run, but the library default for `epochs` differs between Keras releases.
# validation_data is passed as a tuple, the form documented by Keras.
model.fit(x_tr, y_tr_oh, batch_size=10, epochs=10,
          validation_data=(x_te, y_te_oh))

Train on 11376 samples, validate on 7796 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

KeyboardInterrupt: 

## Structured data

In [7]:
# Reload the raw per-sequence lists and split them without flattening:
# the first 500 sequences are for training, the rest for testing.
x, y = get_x(), get_y()
x_tr, x_te = x[:500], x[500:]
y_tr, y_te = y[:500], y[500:]

In [9]:
# Zero-pad the training sequences to a common length and stack them into
# batch arrays; seq_lens keeps each sequence's original length.
# Note: x_tr / y_tr are rebound here, so re-running this cell would pad the
# already-padded arrays rather than the original lists.
x_tr, y_tr, seq_lens = pad(x_tr, y_tr)

In [10]:
# Shape of the padded training batch: (sequences, padded length, features).
x_tr.shape

(500, 63, 6438)

In [None]:
# FIXME: unfinished cell. The original statement is kept below for reference
# but disabled, because an empty subscript is a syntax error and stops a
# Restart & Run All. x_te already holds the un-padded test sequences x[500:].
# TODO: decide how the structured test split should be built.
# x_te = x_tr[]