## Multilayer Perceptron Training in Chunks

### Using tf.keras

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.optimizers import SGD

# Fully connected classifier: 5000 input features -> two 2048-unit ReLU
# hidden layers -> 1000-way softmax output.
# Dropout(0.5) after each hidden layer is currently disabled.
model = Sequential([
    Dense(2048, activation='relu', input_dim=5000),
    Dense(2048, activation='relu'),
    Dense(1000, activation='softmax'),
])

# SGD with Nesterov momentum; targets are expected to be one-hot encoded
# because the loss is categorical cross-entropy.
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(
    optimizer=sgd,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
import math
import pandas as pd

# `size` is presumably the number of rows in the training CSV (TODO confirm);
# `chunk_size` is how many rows are read from the CSV per chunk.
size, chunk_size = 1281167, 60000

# Ceiling division: number of chunks needed to cover `size` rows.
num_iters = -(-size // chunk_size)

# CSV files used for training and for the final evaluation.
test_file = "f-selected/shuf_imagenet_200_vgg19_mrmr_5000_val.csv"
train_file = "f-selected/shuf_imagenet_200_vgg19_mrmr_5000_train.csv"

In [None]:
# Read only the header row of the training CSV to discover the label column.
# The context manager guarantees the handle is closed as soon as the header
# has been read.
with open("f-selected/shuf_imagenet_200_vgg19_mrmr_5000_train.csv", 'r') as f:
    header = f.readline()

# The label column is the last comma-separated name in the header row.
label = header.strip('\n').split(',')[-1]

# Class names as strings "0" .. "999"; the fixed list keeps the class
# encoding identical for every chunk.
labels = [str(i) for i in range(1000)]

In [None]:
# TRAIN
from sklearn.model_selection import train_test_split

def _features_and_one_hot(frame):
    """Split a DataFrame into a feature matrix and one-hot encoded targets.

    Every column except `label` is treated as a feature.  The label column
    is converted to integer-valued strings and encoded against the fixed
    `labels` list, so the target matrix always has ``len(labels)`` columns
    even when `frame` does not contain every class.

    Returns:
        (X, y): feature values and float32 one-hot targets.
    """
    features = frame.loc[:, frame.columns != label].values
    as_text = frame[label].apply(lambda x: str(int(x)))
    # pd.Categorical replaces Series.astype('category', categories=...),
    # which current pandas versions no longer accept.
    encoded = pd.Categorical(as_text, categories=labels)
    # Cast explicitly: the dtype get_dummies returns differs between pandas
    # versions (uint8 vs bool).
    targets = pd.get_dummies(encoded).values.astype('float32')
    return features, targets


for epoch in range(10000): # epochs
    # Re-open the CSV every epoch so the chunks are streamed from the start.
    train = pd.read_csv(train_file, iterator=True, chunksize=chunk_size)

    # zip() stops at whichever ends first: the configured chunk count or the
    # file itself, so a file shorter than `size` rows no longer raises
    # StopIteration from get_chunk().
    for iter_, chunk in zip(range(num_iters), train):
        # Every 10th chunk, hold out 10% of its rows for a validation check.
        validate = (iter_ % 10) == 0

        if validate:
            train_chunk, val_chunk = train_test_split(chunk, test_size=0.1)
        else:
            train_chunk = chunk

        X_train, y_train = _features_and_one_hot(train_chunk)

        # One gradient update using the whole chunk as a single batch.
        model.train_on_batch(X_train, y_train)

        if validate:
            X_val, y_val = _features_and_one_hot(val_chunk)

            # evaluate() returns [loss, accuracy] for a model compiled with
            # metrics=['accuracy']; report only the accuracy under "Val acc".
            _, acc = model.evaluate(X_val, y_val)
            print("Iter: {}\tEpoch: {}\tVal acc: {}".format(iter_+1, epoch+1, acc))

In [None]:
# EVALUATION

# Load the whole evaluation CSV into memory.
test = pd.read_csv(test_file)

# Build the feature mask from the test frame's own columns instead of
# relying on a `chunk` variable left over from the training loop.
test_X = test.loc[:, test.columns != label].values

# Encode labels against the fixed `labels` list so the one-hot matrix has
# one column per class even if some classes are absent from the test set.
# pd.Categorical replaces Series.astype('category', categories=...), which
# current pandas versions no longer accept.
test_label_col = pd.Categorical(
    test[label].apply(lambda x: str(int(x))), categories=labels
)
# Cast explicitly: the dtype get_dummies returns differs between pandas
# versions (uint8 vs bool).
test_y = pd.get_dummies(test_label_col).values.astype('float32')

In [None]:
# Evaluate the trained Keras model on the test features/targets built above.
# As the last expression of the cell its return value is displayed; for a
# model compiled with metrics=['accuracy'] that is the loss followed by the
# accuracy.
model.evaluate(test_X, test_y)

### Using scikit-learn

In [None]:
import math
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier

# NOTE: these assignments rebind `size`, `chunk_size`, `num_iters`,
# `train_file`, `test_file` and `labels` if the tf.keras cells were run
# earlier in the same session.
# `size` is presumably the number of rows in the training CSV (TODO confirm).
size = 1281167
chunk_size = 50000

train_file = "f-selected/shuf_imagenet_200_vgg19_mrmr_5000_train.csv"
test_file = "f-selected/shuf_imagenet_200_vgg19_mrmr_5000_val.csv"

epochs = 10
hidden_units = 2048
# Number of chunks needed to cover `size` rows (computed once).
num_iters = math.ceil(size / chunk_size)

# Class names as strings "0" .. "999", passed to partial_fit as `classes`.
labels = [str(i) for i in range(1000)]

# scikit-learn MLP with two hidden layers of `hidden_units` units each.
# It is trained incrementally with partial_fit in the training cell.
mlp = MLPClassifier(
    hidden_layer_sizes=(hidden_units,) * 2,
    learning_rate_init=0.005,
    max_iter=1,
    shuffle=True,
    verbose=True,
    early_stopping=False,
)

In [None]:
# Incremental training: stream the training CSV in chunks and call
# partial_fit once per chunk, for `epochs` passes over the file.
# `label` (name of the target column) is taken from the header-reading cell.
for epoch in range(epochs):

    print("Epoch {}".format(epoch))
    # Re-open the CSV every epoch so the chunks are streamed from the start.
    train = pd.read_csv(train_file, iterator=True, chunksize=chunk_size)

    # zip() stops at whichever ends first: the configured chunk count or the
    # file itself, so a file shorter than `size` rows no longer raises
    # StopIteration from get_chunk().
    for _, chunk in zip(range(num_iters), train):

        X_train = chunk.loc[:, chunk.columns != label].values
        # Labels as integer-valued strings, matching the entries of `labels`.
        y_train = chunk[label].apply(lambda x: str(int(x))).values

        # `classes` lists every possible label so chunks that miss some
        # classes can still be fitted.
        mlp = mlp.partial_fit(X_train, y_train, classes=labels)

In [None]:
# Predict
# Load the whole evaluation CSV into memory.
test = pd.read_csv(test_file)

# Build the feature mask from the test frame's own columns instead of
# relying on a `chunk` variable left over from the training loop.
test_X = test.loc[:, test.columns != label].values
# Labels as integer-valued strings, the same encoding used for training.
test_y = test[label].apply(lambda x: str(int(x))).values

# Predicted class label for every test row.
pred_test = mlp.predict(test_X)

In [None]:
# Accuracy
from sklearn.metrics import accuracy_score

# Compare the true test labels with the predictions made in the previous cell.
accuracy = accuracy_score(y_true=test_y, y_pred=pred_test)

print(
    "Test accuracy {}".format(accuracy)
)

In [None]:
# Top-5 accuracy
n = 5
probs = mlp.predict_proba(test_X)
# argsort is ascending, so the last n columns are the n most probable classes.
best_n = np.argsort(probs, axis=1)[:, -n:]

# Translate column indices into class labels for all rows in one indexing
# step, then count rows whose true label is among their top-n candidates.
top_labels = mlp.classes_[best_n].tolist()
tp = sum(1 for candidates, y in zip(top_labels, test_y) if y in candidates)

# Variable name (including its spelling) kept unchanged in case other cells
# refer to it.
top5_accuracty = float(tp / len(test_y))

print("Test Top-5 accuracy {}".format(top5_accuracty))

In [None]:
# Model dump: serialize the fitted scikit-learn MLP to
# 'sklearn-mlp-v1.joblib' in the current working directory.
from joblib import dump
dump(mlp, 'sklearn-mlp-v1.joblib')

In [None]:
# Model load: restore the MLP written by the dump cell and rebind `mlp` to it.
# WARNING: joblib.load unpickles the file and can execute arbitrary code;
# only load files from a trusted source.
from joblib import load
mlp = load('sklearn-mlp-v1.joblib')