In [6]:
%matplotlib inline

import pandas as pd
import numpy as np
from scipy import stats, integrate
import matplotlib.pyplot as plt
import seaborn as sns

import keras as ks
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping

from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler

from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

import xgboost as xgb
import lightgbm as lgb
from catboost import Pool, CatBoostClassifier, CatBoost

from pandas_summary import DataFrameSummary

from tqdm import tqdm

In [7]:
def load_arr(name, header=None):
    nm = 'input/' + name + '.csv'
    arr = pd.read_csv(nm, header=header).values
    print(f'Loaded "{nm}" with shape: {arr.shape}')
    return arr

y = load_arr('y')
    
merged_with_cat_as_dummy = load_arr('merged_with_cat_as_dummy')

merged_with_cat_as_int = load_arr('merged_with_cat_as_int')

merged_with_cat_as_str = load_arr('merged_with_cat_as_str')

latent_features = load_arr('ED.layer_6', header='infer')

submission = pd.DataFrame(columns=['item_id', 'category_class'])
submission['item_id'] = pd.read_csv('input/test.csv')['item_id']
print('submission.shape:', submission.shape)


Loaded "input/y.csv" with shape: (700, 1)
Loaded "input/merged_with_cat_as_dummy.csv" with shape: (1000, 219)
Loaded "input/merged_with_cat_as_int.csv" with shape: (1000, 9)
Loaded "input/merged_with_cat_as_str.csv" with shape: (1000, 9)
Loaded "input/ED.layer_6.csv" with shape: (1000, 32)
submission.shape: (300, 2)


In [8]:
def create_submission_file(pred_probs, subm_file_prefix):
    y_pred_classes = []
    for pred_prob in pred_probs:
        y_pred_classes.append(np.argmax(pred_prob))

    submission['category_class'] = y_pred_classes
    print(X.shape, submission.shape)
    print(submission.head())
    
    file_name = subm_file_prefix + '.submission.csv'

    submission.to_csv(file_name, index=False)
    print(f'Created "{file_name}"')

# The best Boosting model 
We will use catboost.


In [9]:
X = merged_with_cat_as_int[y.shape[0]:]

submission['category_class'] = None
print('X.shape:', X.shape, 'submission.shape:', submission.shape)
print(submission.head())

X.shape: (300, 9) submission.shape: (300, 2)
   item_id category_class
0     6000           None
1     5532           None
2     6797           None
3     3325           None
4     5447           None


In [10]:
cat_col_idxs = [4, 5, 6, 7, 8]

num_ensembles = 5
pred_probs = []
for i in range(num_ensembles):
    model = CatBoostClassifier()
    model.load_model(f'models/catboost_{i}')

    if i == 0:
        pred_probs = model.predict_proba(X)
    else:
        pred_probs += model.predict_proba(X)
    print(i, end=' ')
pred_probs /= num_ensembles
    
create_submission_file(pred_probs, 'catboost')

0 1 2 3 4 (300, 9) (300, 2)
   item_id  category_class
0     6000               0
1     5532               0
2     6797               3
3     3325               1
4     5447               1
Created "catboost.submission.csv"


## Adding the Latent Features from the Autoencoder NN

In [11]:
print(merged_with_cat_as_int[y.shape[0]:].shape, latent_features[y.shape[0]:].shape)
X = np.concatenate([merged_with_cat_as_int[y.shape[0]:], latent_features[y.shape[0]:]], axis=1)

submission['category_class'] = None
print('X.shape:', X.shape, 'submission.shape:', submission.shape)
print(submission.head())

(300, 9) (300, 32)
X.shape: (300, 41) submission.shape: (300, 2)
   item_id category_class
0     6000           None
1     5532           None
2     6797           None
3     3325           None
4     5447           None


In [12]:
cat_col_idxs = [4, 5, 6, 7, 8]

num_ensembles = 5
pred_probs = []
for i in range(num_ensembles):
    model = CatBoostClassifier()
    model.load_model(f'models/catboost_{i}.latent_features')

    if i == 0:
        pred_probs = model.predict_proba(X)
    else:
        pred_probs += model.predict_proba(X)
    print(i, end=' ')
pred_probs /= num_ensembles

create_submission_file(pred_probs, 'catboost.latent_features')

0 1 2 3 4 (300, 41) (300, 2)
   item_id  category_class
0     6000               0
1     5532               0
2     6797               3
3     3325               1
4     5447               2
Created "catboost.latent_features.submission.csv"


# The NN Model

In [13]:
X = merged_with_cat_as_dummy[y.shape[0]:]

submission['category_class'] = None
print('X.shape:', X.shape, 'submission.shape:', submission.shape)
print(submission.head())

X.shape: (300, 219) submission.shape: (300, 2)
   item_id category_class
0     6000           None
1     5532           None
2     6797           None
3     3325           None
4     5447           None


In [14]:
model = ks.models.load_model('models/keras.model')
pred_probs = model.predict(X)

create_submission_file(pred_probs, 'keras')

(300, 219) (300, 2)
   item_id  category_class
0     6000               1
1     5532               0
2     6797               3
3     3325               1
4     5447               1
Created "keras.submission.csv"


## The NN Model with the Latent Features from the Autoencoder NN

In [15]:
X = np.concatenate([merged_with_cat_as_dummy[y.shape[0]:], latent_features[y.shape[0]:]], axis=1)

submission['category_class'] = None
print('X.shape:', X.shape, 'submission.shape:', submission.shape)
print(submission.head())

X.shape: (300, 251) submission.shape: (300, 2)
   item_id category_class
0     6000           None
1     5532           None
2     6797           None
3     3325           None
4     5447           None


In [16]:
model = ks.models.load_model('models/keras.latent_features.model')
pred_probs = model.predict(X)

create_submission_file(pred_probs, 'keras.latent_features')

(300, 251) (300, 2)
   item_id  category_class
0     6000               1
1     5532               0
2     6797               3
3     3325               1
4     5447               1
Created "keras.latent_features.submission.csv"
