In [7]:
from models import *
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, accuracy_score
from scoop import futures
import argparse
import os
import idx2numpy
from itertools import chain
import pickle


In [10]:
BASELINE_EPOCHS = 40
BASELINE_BATCH_SIZE = 10
BASELINE_MOMENTUM = 0
OUTER_FOLDS = 5
INNER_FOLDS = 5
NETWORK = [3136, 50, 50, 10, 10]
NUM_SAMPLES = 20000
seed = 1

In [8]:
def make_labels(n):
    while True:
        y = random.sample(range(10), n)
        if sum(y) > 10:
            return y
        
def make_data(n=20000):
    np.random.seed(1)
    random.seed(1)
    
    D = idx2numpy.convert_from_file('/home/kshitij/DeepSaDe/datasets/mnist/train-images-idx3-ubyte')
    labels = idx2numpy.convert_from_file('/home/kshitij/DeepSaDe/datasets/mnist/train-labels-idx1-ubyte')
    indices_per_label = {l: list(np.where(labels == l)[0]) for l in np.unique(labels)}
    min_labels_per_image = 4
    max_labels_per_image = 4
    X = []
    y = [np.zeros(10) for i in range(len(labels[:n]))]
    for i in range(len(labels[:n])):
        y_i = make_labels(random.randint(min_labels_per_image, max_labels_per_image))
        y[i][y_i] = 1
        X_i = [D[random.sample(indices_per_label[l], 1)][0].flatten() for l in y_i]
        X.append(list(chain(*X_i)))
        if i % 1000 == 0:
            print('generated {} instances'.format(i))
    X = np.array(X, dtype=np.float32)
    X = X/255
    y = np.array(y)
    return X, y
        

In [11]:
X, y = make_data(n=NUM_SAMPLES)
    
# setting seed
random.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
    
# number of folds for cross-validation
kf = KFold(n_splits=OUTER_FOLDS, shuffle=True, random_state=random.randint(0, 1000))
split_data = list(kf.split(X))

generated 0 instances
generated 1000 instances
generated 2000 instances
generated 3000 instances
generated 4000 instances
generated 5000 instances
generated 6000 instances
generated 7000 instances
generated 8000 instances
generated 9000 instances
generated 10000 instances
generated 11000 instances
generated 12000 instances
generated 13000 instances
generated 14000 instances
generated 15000 instances
generated 16000 instances
generated 17000 instances
generated 18000 instances
generated 19000 instances


In [58]:
model = torch.load('{}/deepsade_model_fold_{}.pt'.format(seed, 4))
model

FFNeuralNetTorch(
  (Layers): ModuleList(
    (0): Linear(in_features=3136, out_features=50, bias=True)
    (1): Linear(in_features=50, out_features=50, bias=True)
    (2): Linear(in_features=50, out_features=10, bias=True)
    (3): Linear(in_features=10, out_features=10, bias=True)
  )
)

In [110]:
import torch.nn.functional as F

def get_embedding(model, X, last = 1):
    out = X
    for i in range(len(model.Layers) - last):
        out = model.Layers[i](out)
        out = F.relu(out)
    return out.detach().numpy()

In [108]:
for train_index, test_index in split_data:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    sys.exit(0)

SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [177]:
tsne_tranforms = {}
for l in [4, 3, 2, 1]:
    X_l = get_embedding(model, torch.tensor(X_test), last = l)
    X_embedded = TSNE(n_components=2, init='random', perplexity=25).fit_transform(X_l)
    X_embedded = pd.DataFrame(X_embedded)
    X_embedded.columns = ['dim1', 'dim2']
    tsne_tranforms[l] = X_embedded

In [178]:
import altair as alt
charts = []
for l in tsne_tranforms:
    c = alt.Chart(tsne_tranforms[l]).mark_point().encode(
        x='dim1',
        y='dim2'
    )
    charts.append(c)
    
big_chart(charts[0] | charts[1] | charts[2] | charts[3], fontsize=25)

In [None]:
y_pred = predict_multilabel(model, X_test)
y_pred_sum = [sum([i if v[i] == 1 else 0 for i in range(len(v))]) for v in y_pred]
y_pred_bucket = []
for v in y_pred_sum:
    if v < 10:
        y_pred_bucket.append('< 10')
    elif v < 20:
        y_pred_bucket.append('10 <= sum < 20')
    elif v < 30:
        y_pred_bucket.append('20 <= sum < 30')
    else:
        y_pred_bucket.append('30 <= sum')

In [181]:
X_embedded['label'] = y_pred_bucket

In [183]:
c = alt.Chart(X_embedded).mark_point().encode(
        x='dim1',
        y='dim2',
        color='label'
    )
c

In [158]:
model_baseline = torch.load('{}/baseline_model_fold_{}.pt'.format(seed, 4))
model_baseline

MNISTBaselineModel(
  (Layers): ModuleList(
    (0): Linear(in_features=3136, out_features=50, bias=True)
    (1): Linear(in_features=50, out_features=50, bias=True)
    (2): Linear(in_features=50, out_features=10, bias=True)
    (3): Linear(in_features=10, out_features=10, bias=True)
  )
)

In [164]:
tsne_tranforms = {}
for l in [4, 3, 2, 1]:
    X_l = get_embedding(model_baseline, torch.tensor(X_test), last = l)
    X_embedded = TSNE(n_components=2, init='random', perplexity=25).fit_transform(X_l)
    X_embedded = pd.DataFrame(X_embedded)
    X_embedded.columns = ['dim1', 'dim2']
    tsne_tranforms[l] = X_embedded

In [165]:
import altair as alt
charts = []
for l in tsne_tranforms:
    c = alt.Chart(tsne_tranforms[l]).mark_point().encode(
        x='dim1',
        y='dim2'
    )
    charts.append(c)
    
big_chart(charts[0] | charts[1] | charts[2] | charts[3], fontsize=25)

In [175]:
def predict_multilabel(model, X):
    X_torch = np.array(X)
    X_torch = X_torch.astype(np.float32)
    X_torch = torch.tensor(X_torch)
    score = model.forward(X_torch).detach().numpy()
    pred = []
    for p in score:
        pred.append([1 if j > 0 else 0 for j in p])
    return np.array(pred)



In [114]:
X_l = get_embedding(model, torch.tensor(X_test), last = 4)
X_l.shape

(4000, 3136)

In [102]:
from sklearn.manifold import TSNE

In [115]:
X_embedded = TSNE(n_components=2, init='random', perplexity=25).fit_transform(X_l)

In [116]:
X_embedded = pd.DataFrame(X_embedded)
X_embedded.columns = ['dim1', 'dim2']
X_embedded

Unnamed: 0,dim1,dim2
0,34.119358,32.100727
1,32.249310,44.908756
2,-44.126369,16.697044
3,39.508949,39.544830
4,-26.197268,-33.180717
...,...,...
3995,-1.715528,36.257961
3996,38.549072,-13.811404
3997,38.479565,26.258112
3998,-6.110721,58.840244


In [117]:
def get_sum_labels(y):
    return [sum([i if v[i] == 1 else 0 for i in range(len(v))]) for v in y]

y_test_sum = get_sum_labels(y_test)

In [118]:
y_test_sum

[17,
 14,
 25,
 16,
 24,
 20,
 19,
 20,
 18,
 24,
 20,
 19,
 25,
 16,
 22,
 22,
 18,
 16,
 20,
 24,
 19,
 22,
 16,
 25,
 19,
 21,
 22,
 16,
 20,
 21,
 24,
 20,
 26,
 25,
 18,
 23,
 13,
 17,
 26,
 18,
 22,
 14,
 21,
 22,
 24,
 15,
 15,
 14,
 15,
 22,
 20,
 30,
 17,
 11,
 21,
 24,
 21,
 15,
 25,
 16,
 17,
 16,
 13,
 14,
 20,
 16,
 21,
 25,
 13,
 21,
 22,
 16,
 19,
 17,
 19,
 20,
 12,
 21,
 17,
 17,
 12,
 26,
 12,
 18,
 18,
 11,
 15,
 19,
 18,
 24,
 27,
 18,
 17,
 14,
 12,
 12,
 20,
 12,
 18,
 18,
 21,
 17,
 16,
 12,
 21,
 18,
 17,
 18,
 17,
 15,
 21,
 16,
 17,
 20,
 15,
 25,
 16,
 14,
 15,
 18,
 20,
 23,
 24,
 25,
 14,
 23,
 15,
 14,
 23,
 20,
 17,
 22,
 20,
 16,
 20,
 12,
 21,
 23,
 11,
 15,
 17,
 17,
 20,
 28,
 12,
 11,
 19,
 17,
 13,
 24,
 15,
 23,
 17,
 15,
 21,
 15,
 16,
 26,
 19,
 19,
 14,
 17,
 14,
 22,
 13,
 17,
 25,
 19,
 15,
 24,
 14,
 21,
 17,
 26,
 19,
 15,
 13,
 18,
 23,
 14,
 24,
 22,
 16,
 14,
 14,
 25,
 27,
 13,
 23,
 17,
 19,
 12,
 21,
 20,
 21,
 19,
 16,
 22,
 21,
 26,


In [119]:
def big_chart(chart, fontsize = 20): 
    return chart.configure_axis(
        grid = True, 
    labelFontSize = fontsize,
    titleFontSize = fontsize
).configure_title(
    fontSize = fontsize
    ).configure_legend(
titleFontSize=fontsize,
labelFontSize=fontsize
).configure_view(
    strokeWidth=0
)

def small_chart(chart, fontsize=None): 
    return big_chart(chart.properties(width=150,
                             height=150
                            ), fontsize)

In [120]:
import altair as alt

c1 = alt.Chart(X_embedded).mark_point().encode(
    x='dim1',
    y='dim2'
)
big_chart(c1, fontsize=25)

In [121]:
c1 = alt.Chart(X_embedded).mark_point().encode(
    x='dim1',
    y='dim2'
)
big_chart(c1, fontsize=25)

In [122]:
c1 = alt.Chart(X_embedded).mark_point().encode(
    x='dim1',
    y='dim2'
)
big_chart(c1, fontsize=25)