In [None]:
import pickle
import numpy as np
import gc
import pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.regularizers import L1L2
import pickle
import numpy as np
import gc

One-Time Processing

In [None]:
def _identity_tokenizer(doc):
    """Return the document unchanged so the vectorizer counts its items as-is."""
    return doc


# Encoder for the LABEL column; also used to decode predictions at inference time.
label_binarizer = LabelBinarizer()
# A named function is used instead of a lambda so the fitted vectorizer can be
# pickled, and token_pattern=None silences the "token_pattern will not be used"
# warning scikit-learn raises whenever a custom tokenizer is supplied.
# NOTE(review): presumably each FEATURE entry is already a sequence of tokens — confirm.
count_vectorizer = CountVectorizer(
    tokenizer=_identity_tokenizer,
    lowercase=False,
    token_pattern=None,
)

In [None]:
# Read the preprocessed training frame and pull out the raw feature / label arrays.
data = pd.read_pickle('/content/gdrive/MyDrive/Data/dataset/data_processed_lite.pkl')
data_cleaned, label = data['FEATURE'].values, data['LABEL'].values

In [None]:
# Fit both encoders and encode in one pass.
# Both names are rebound here: from raw column values to their encoded form.
label = label_binarizer.fit_transform(label)
data_cleaned = count_vectorizer.fit_transform(data_cleaned)

In [None]:
# Persist the encoded features and labels so training can start from disk.
sparse.save_npz("datanpz.npz", data_cleaned)
# The context manager guarantees the pickle file is flushed and closed.
with open("label_csr.pkl", "wb") as label_file:
    pickle.dump(label, label_file)

Training Starts Here

In [None]:
# Train/validation split: held-out fraction and the seed given to train_test_split.
test_ratio, random_st = 0.2, 40
# Optimisation: epoch count, Adam learning rate, and mini-batch size.
ep, lr, batch = 1, 0.01, 128
# Dropout rate used by the hidden layers (can be different for different layers).
dropout = 0.5

In [None]:
# Reload the encoded feature matrix and label array written by the one-time step.
your_matrix_back = sparse.load_npz("datanpz.npz")
# pickle.load can execute arbitrary code: only open files this notebook produced.
# The context manager closes the file handle once the labels are loaded.
with open("label_csr.pkl", "rb") as label_file:
    csr_dict = pickle.load(label_file)

In [None]:
# Densify only the first 5000 rows, with the matching label rows.
head_rows = slice(0, 5000)
data_cleaned = your_matrix_back[head_rows].toarray()
label = csr_dict[head_rows]

In [None]:
# Hold out `test_ratio` of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    data_cleaned,
    label,
    test_size=test_ratio,
    random_state=random_st,
)

In [None]:
# Free RAM: drop the pre-split arrays, then ask the collector to reclaim them.
del data_cleaned, label
gc.collect()

In [None]:
# Network input/output widths are read off the trailing axis of the training arrays.
input_size, output_class = X_train.shape[-1], Y_train.shape[-1]

In [None]:
# Sample Keras NN model: two 4096-unit ReLU hidden layers, each followed by
# batch normalisation and dropout, ending in a softmax over `output_class` units.
model = keras.Sequential([
    keras.Input(shape=(input_size, )),
    # The regulariser is taken from the same `tensorflow.keras` namespace as the
    # layers, so the model never mixes objects from two different Keras packages.
    layers.Dense(
        4096,
        kernel_regularizer=keras.regularizers.L1L2(l1=0.0, l2=0.1),
        activation='relu',
    ),
    layers.BatchNormalization(),
    layers.Dropout(dropout),
    layers.Dense(4096, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(dropout),
    layers.Dense(output_class, activation='softmax')
])

In [None]:
# Categorical cross-entropy with Adam at the configured learning rate; accuracy is tracked.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=lr),
    loss=keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy'],
)

Training in chunks, as the entire dataset is too big to be loaded into RAM

In [None]:
# Stream the sparse training matrix through the model one dense chunk at a time.
chunk_rows = 3000
total_rows = your_matrix_back.shape[0]
for j in range(ep):
    # Chunk boundaries are derived from the matrix size, so every row is visited
    # once per epoch and no empty slice is ever handed to train_test_split.
    for s in range(0, total_rows, chunk_rows):
        e = min(s + chunk_rows, total_rows)
        if e - s < 2:
            # A single row cannot be split into non-empty train and validation parts.
            continue
        data_cleaned = your_matrix_back[s:e].toarray()
        label = csr_dict[s:e]
        X_train, X_test, Y_train, Y_test = train_test_split(
            data_cleaned, label, test_size=test_ratio, random_state=random_st
        )
        # The outer loop already counts epochs; epochs=1 keeps the total number of
        # passes over each chunk at `ep` instead of ep * ep.
        model.fit(
            X_train,
            Y_train,
            batch_size=batch,
            epochs=1,
            validation_data=(X_test, Y_test),
            shuffle=True,
        )
        # Release the dense chunk before the next one is materialised.
        del data_cleaned
        del label
        gc.collect()

In [None]:
# Write the trained model to disk under the name "my_model".
# NOTE(review): newer Keras releases may require a ".keras"/".h5" suffix — confirm against the installed version.
model.save(filepath="my_model")

In [None]:
# Rebind `model` to the copy stored on disk under "my_model".
model = keras.models.load_model(filepath="my_model")

In [None]:
# Test-set pickles from Drive: the IDs for the submission and the features to classify.
ID = pd.read_pickle('/content/gdrive/MyDrive/Data/dataset/ID_test_pickle')
test_data = pd.read_pickle('/content/gdrive/MyDrive/Data/dataset/data_test_pickle')

Predicting classes of the test data in chunks, for the same reason as above

In [None]:
# Classify the test set chunk by chunk and collect the decoded labels in `res`.
# NOTE: relies on `count_vectorizer` and `label_binarizer` having been fitted
# earlier in this kernel session; they are not reloaded from disk here.
res = []
pred_chunk_rows = 2000
n_test_rows = test_data.shape[0]
# Chunk starts are derived from the data size, so every test row gets exactly one
# prediction (keeping `res` aligned with the IDs) and no empty batch is predicted.
for i, s in enumerate(range(0, n_test_rows, pred_chunk_rows)):
    print(i)  # progress: index of the chunk being processed
    e = min(n_test_rows, s + pred_chunk_rows)
    temp = test_data[s:e]
    temp = count_vectorizer.transform(temp.values)
    temp = temp.toarray()
    output = model.predict(temp)
    # Map the network outputs back to the original label values.
    output = label_binarizer.inverse_transform(output)
    res.extend(output)
    # Release the dense chunk before the next one is materialised.
    del temp
    del output
    gc.collect()

In [None]:
# Pair each ID with its predicted label, fix the column order, and write the CSV without the index.
submission = pd.DataFrame(
    {'PRODUCT_ID': ID, 'BROWSE_NODE_ID': res}
)[['PRODUCT_ID', 'BROWSE_NODE_ID']]
submission.to_csv("submission.csv", index=False)