# Model selection - SMOTE version

In [1]:
# import some libraries
%matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import KFold

# Loading data

In [2]:
#load training and testing data

raw_train = pd.read_csv("uc_data_train.csv", encoding='UTF-8')
raw_train = raw_train.loc[:, ~raw_train.columns.str.contains('Unnamed')]
raw_test = pd.read_csv("uc_data_test.csv", encoding='UTF-8')
raw_test = raw_test.loc[:, ~raw_test.columns.str.contains('Unnamed')]

#check the number of customer and features
print(raw_train.shape)
print(raw_test.shape)

(80001, 14)
(19999, 13)


In [3]:
#print top 5 from raw_train data
print(raw_train.head())

   customer_id  size_womenswear  total_number_of_orders  return_rate  \
0            1               16                      28         0.12   
1            2               20                     118         0.61   
2            3                8                     125         0.04   
3            4               12                     109         0.42   
4            5               16                      37         0.26   

   first_order_channel socioeconomic_status  size_corsetry_cup  \
0                    1                    C                  7   
1                    2                    H                  3   
2                    2                    F                  3   
3                    1                    F                  5   
4                    1                    I                  4   

   size_corsetry_briefs  size_footware  days_since_first_order    brand  \
0                    16              7                    4964  Brand 1   
1                   

# Feature extraction

In [5]:
#select some columns as features
features = ['total_number_of_orders', 'return_rate', 'socioeconomic_desc',
           'size_corsetry_cup', 'size_corsetry_briefs', 'size_footware', 'days_since_first_order', 
            'brand', 'age_in_years']
categorical = ['socioeconomic_desc', 'brand']

#divide dataset into data and label
#training data
train_data = raw_train[features]
train_data[categorical] = train_data[categorical].apply(LabelEncoder().fit_transform)
#print(train_data.head())
train_label = raw_train.loc[:, ['size_womenswear']].values.ravel()
print(train_data.shape)
print(train_label.shape)

#testing data
test_data = raw_test[features]
test_data[categorical] = test_data[categorical].apply(LabelEncoder().fit_transform)
print(test_data.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


(80001, 9)
(80001,)
(19999, 9)


In [6]:
#scale all the features
train_data = StandardScaler().fit_transform(train_data.values)
test_data = StandardScaler().fit_transform(test_data.values)

In [7]:
#apply SMOTE
s_train_data, s_train_label = SMOTE(random_state=2).fit_sample(train_data, train_label)
print(s_train_data.shape)

(198600, 9)


In [8]:
#fit into classifier
models  =  []
models.append(('RF', RF()))
models.append(('LR', LogisticRegression(solver='lbfgs', multi_class='multinomial')))
models.append(('GradientBoost', GradientBoostingClassifier()))
models.append(('Bagging', BaggingClassifier()))
models.append(('MLP', MLPClassifier()))
scoring = 'f1_macro'

results = []
names = []
###measure accuracy with k-fold
print("Averaged macro f-measure on training data with Cross-validation:")
for name, model in models:

    #split training data into 10 fold, and calculate the score by accuracy 
    cv_results = model_selection.cross_val_score(model, s_train_data, s_train_label, cv = KFold(n_splits=10, 
                random_state = None), scoring = scoring)
    results.append(cv_results)
    names.append(name)

    #print the result, its mean and standard deviation
    msg = "%s: %.3f (%.3f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)


Averaged macro f-measure on training data with Cross-validation:


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


RF: 0.448 (0.191)


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


LR: 0.220 (0.153)


KeyboardInterrupt: 