In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

In [40]:
predictor_variables = [x for x in train.columns if x not in ['connection_id','target','cat_4','cat_6','cat_17','cat_18']]
target = train['target']

In [12]:
train_data_in_float = train[predictor_variables].astype(float)
test_data_in_float = test[predictor_variables].astype(float)

In [13]:
scaler = StandardScaler()

In [14]:
train_std = scaler.fit_transform(train_data_in_float)
test_std = scaler.fit_transform(test_data_in_float)

In [15]:
from sklearn.decomposition import PCA as sklearnPCA
sklearn_pca = sklearnPCA(n_components=16)

In [16]:
sklearn_pca.fit(train_std)

PCA(copy=True, iterated_power='auto', n_components=16, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [19]:
Y_sklearn = sklearn_pca.transform(train_std)
test_sklearn = sklearn_pca.transform(test_std)

In [21]:
Y_sklearn.shape

(169307, 16)

In [22]:
test_sklearn.shape

(91166, 16)

In [37]:
import tensorflow as tf
import keras

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Convolution2D, Flatten, MaxPooling2D, Reshape, InputLayer, ZeroPadding2D

In [25]:
# Reshape data
train_x_temp = Y_sklearn.reshape(-1, 4, 4, 1)
test_x_temp = test_sklearn.reshape(-1, 4, 4, 1)

In [26]:
train_x_temp.shape

(169307, 4, 4, 1)

In [58]:
# define vars
input_reshape = (4, 4, 1)

conv_num_filters = 5
conv_filter_size = 2

pool_size = (2, 2)

hidden_num_units = 50
output_num_units = 3

epochs = 5
batch_size = 100

In [59]:
# Create Model
model = Sequential([
 InputLayer(input_shape=input_reshape),
        
 ZeroPadding2D(padding=(4, 4)),

 Convolution2D(25, 2, 2, activation='relu'),
 MaxPooling2D(pool_size=pool_size),

 Convolution2D(25, 2, 2, activation='relu'),
 MaxPooling2D(pool_size=pool_size),

 Convolution2D(25, 2, 2, activation='relu'),

 Flatten(),

 Dense(output_dim=hidden_num_units, activation='relu'),

 Dense(output_dim=output_num_units, input_dim=hidden_num_units, activation='softmax'),
])

In [60]:
# Compile Model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [61]:
trained_model_conv = model.fit(train_x_temp, train_y, nb_epoch=epochs, batch_size=batch_size, validation_split=0.2)

Train on 135445 samples, validate on 33862 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [47]:
train_y = keras.utils.np_utils.to_categorical(target.values)

In [48]:
train_y.shape

(169307, 3)

In [49]:
train_y

array([[ 0.,  0.,  1.],
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.],
       ..., 
       [ 0.,  1.,  0.],
       [ 1.,  0.,  0.],
       [ 0.,  1.,  0.]])

In [13]:
col_names = train.columns[19:42].tolist()

In [19]:
for col in col_names:
    print (np.unique(train[col]))

[1 2 3]
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
 26 27 28 29 30 31 32 33 34 35 36 37 38 40 41 42 43 44 45 46 47 48 49 50 51
 52 53 54 55 56 57 58 59 60 61 62 63 65 69]
[ 1  2  3  4  5  6  7  8  9 10 11]
[0 1]
[0 1 3]
[0 2]
[ 0  1  2  3  4  5  6 10 13 14 18 19 20 21 22 24 28 30]
[0 1]
[0 1]
[  0   1   2   3   4   6   8  21 254 378 435]
[0 1]
[0 1 2]
[  0   1   2   4   5   6   9 289 421 480]
[ 0  1  2  3  4  5 13 14 22 25]
[0 1 2]
[0 1 2 3 4 5]
[0]
[0 1]
[0 1]
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 12

In [3]:
train_cont_col_names = train.columns[1:19].tolist()
test_cont_col_names = test.columns[1:19].tolist()

In [12]:
train_list = train['cont_1']
for x in train_cont_col_names:
    train_list = np.add(train_list, train[x])

In [15]:
train['total_cont'] = train_list

In [16]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169307 entries, 0 to 169306
Data columns (total 44 columns):
connection_id    169307 non-null object
cont_1           169307 non-null int64
cont_2           169307 non-null int64
cont_3           169307 non-null int64
cont_4           169307 non-null float64
cont_5           169307 non-null float64
cont_6           169307 non-null float64
cont_7           169307 non-null float64
cont_8           169307 non-null float64
cont_9           169307 non-null float64
cont_10          169307 non-null float64
cont_11          169307 non-null float64
cont_12          169307 non-null float64
cont_13          169307 non-null float64
cont_14          169307 non-null float64
cont_15          169307 non-null float64
cont_16          169307 non-null float64
cont_17          169307 non-null float64
cont_18          169307 non-null float64
cat_1            169307 non-null int64
cat_2            169307 non-null int64
cat_3            169307 non-null int64


In [34]:
train_cont_col_names.append('connection_id')
train_cont_col_names.append('target')
train_cont_col_names.append('cat_4')
train_cont_col_names.append('cat_6')
train_cont_col_names.append('cat_11')
train_cont_col_names.append('cat_12')
train_cont_col_names.append('cat_15')
train_cont_col_names.append('cat_17')
train_cont_col_names.append('cat_18')

In [35]:
predictor_variables = [x for x in train.columns if x not in train_cont_col_names]
target = train['target']

In [41]:
X_train, X_test, y_train, y_test = train_test_split(train[predictor_variables], target, test_size=0.2, stratify=target,random_state=2017)

In [42]:
# Import LightGBM and sklearn LightGBM
import lightgbm as lgb
from lightgbm.sklearn import LGBMClassifier

In [43]:
lgbclassifier = LGBMClassifier(random_state=2017, n_jobs=4)

lgbclassifier.fit(X_train,y_train)
pred_lgb = lgbclassifier.predict(X_test)
acc_lgb = accuracy_score(y_test,pred_lgb)
print acc_lgb
# 0.780757190952

# lgbclassifier.fit(train[predictor_variables],target)
# pred3 = lgbclassifier.predict(test[predictor_variables])  # 0.64736

0.780727659323


In [45]:
# Import LightGBM and sklearn LightGBM
import lightgbm as lgb
from lightgbm.sklearn import LGBMClassifier

def gridSearchCV_lgb_clf(params):
    gsearch = GridSearchCV(estimator=LGBMClassifier(learning_rate=0.05,
                   n_estimators=100,
                   max_depth=10,
                   num_leaves=32,
                   max_bin=264,
                   subsample=0.6,
                   colsample_bytree=0.8,
                   random_state=2017), param_grid=params,
                       scoring='accuracy',
                       iid=False,
                       cv=5)
    
    gsearch.fit(train[predictor_variables], target)
    print gsearch.best_params_
    print gsearch.best_score_

In [None]:
param_test = {
    'n_estimators':range(10,101,10)
}
gridSearchCV_lgb_clf(param_test)

In [7]:
for col in train_cont_col_names:
   print "{} min: {} and max: {}".format(col, np.min(train[col]), np.max(train[col]))

cont_1 min: 0 and max: 42596
cont_2 min: 0 and max: 11396904
cont_3 min: 0 and max: 11730594
cont_4 min: 0.0 and max: 1.0
cont_5 min: 0.0 and max: 1.0
cont_6 min: 0.0 and max: 1.0
cont_7 min: 0.0 and max: 1.0
cont_8 min: 0.0 and max: 1.0
cont_9 min: 0.0 and max: 1.0
cont_10 min: 0.0 and max: 1.0
cont_11 min: 0.0 and max: 1.0
cont_12 min: 0.0 and max: 1.0
cont_13 min: 0.0 and max: 1.0
cont_14 min: 0.0 and max: 1.0
cont_15 min: 0.0 and max: 1.0
cont_16 min: 0.0 and max: 1.0
cont_17 min: 0.0 and max: 1.0
cont_18 min: 0.0 and max: 1.0


In [26]:
np.unique(train['cont_4'])

array([ 0.  ,  0.01,  0.02,  0.03,  0.04,  0.05,  0.06,  0.07,  0.08,
        0.09,  0.1 ,  0.11,  0.12,  0.13,  0.14,  0.15,  0.16,  0.17,
        0.18,  0.19,  0.2 ,  0.21,  0.22,  0.23,  0.24,  0.25,  0.26,
        0.27,  0.28,  0.31,  0.32,  0.33,  0.34,  0.35,  0.36,  0.38,
        0.39,  0.4 ,  0.44,  0.46,  0.5 ,  0.52,  0.53,  0.55,  0.56,
        0.58,  0.59,  0.61,  0.63,  0.64,  0.65,  0.66,  0.68,  0.7 ,
        0.71,  0.73,  0.74,  0.77,  0.78,  0.79,  0.8 ,  0.82,  0.83,
        0.85,  0.86,  0.87,  0.88,  0.93,  0.95,  0.96,  0.97,  0.98,
        0.99,  1.  ])

In [28]:
def parse_num(x):
    if ((x >= 0.0) & (x <=0.1)):
        return 1
    elif ((x > 0.1) & (x <= 0.2)):
        return 2
    elif ((x > 0.2) & (x <= 0.3)):
        return 3
    elif ((x > 0.3) & (x <= 0.4)):
        return 4
    elif ((x > 0.4) & (x <= 0.5)):
        return 5
    elif ((x > 0.5) & (x <= 0.6)):
        return 6
    elif ((x > 0.6) & (x <= 0.7)):
        return 7
    elif ((x > 0.7) & (x <= 0.8)):
        return 8
    elif ((x > 0.8) & (x <= 0.9)):
        return 9
    elif (x > 0.9):
        return 10

In [29]:
parsed_cont_4 = train['cont_4'].apply(parse_num)

In [31]:
np.unique(parsed_cont_4)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [35]:
cols_to_be_parsed = train.columns[4:19].tolist()

In [36]:
for cols in cols_to_be_parsed:
    train[cols] = train[cols].apply(parse_num)

In [37]:
train.head()

Unnamed: 0,connection_id,cont_1,cont_2,cont_3,cont_4,cont_5,cont_6,cont_7,cont_8,cont_9,...,cat_15,cat_16,cat_17,cat_18,cat_19,cat_20,cat_21,cat_22,cat_23,target
0,cxcon_1,0,1032,0,1,1,1,1,10,1,...,0,0,0,0,0,511,511,255,255,2
1,cxcon_4,0,520,0,1,1,1,1,10,1,...,0,0,0,0,0,511,511,255,255,0
2,cxcon_7,0,1032,0,1,1,1,1,10,1,...,0,0,0,0,0,511,511,255,255,0
3,cxcon_10,0,1032,0,1,1,1,1,10,1,...,0,0,0,0,0,511,511,255,255,0
4,cxcon_13,0,1032,0,1,1,1,1,10,1,...,0,0,0,0,0,511,511,255,255,2


In [38]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169307 entries, 0 to 169306
Data columns (total 43 columns):
connection_id    169307 non-null object
cont_1           169307 non-null int64
cont_2           169307 non-null int64
cont_3           169307 non-null int64
cont_4           169307 non-null int64
cont_5           169307 non-null int64
cont_6           169307 non-null int64
cont_7           169307 non-null int64
cont_8           169307 non-null int64
cont_9           169307 non-null int64
cont_10          169307 non-null int64
cont_11          169307 non-null int64
cont_12          169307 non-null int64
cont_13          169307 non-null int64
cont_14          169307 non-null int64
cont_15          169307 non-null int64
cont_16          169307 non-null int64
cont_17          169307 non-null int64
cont_18          169307 non-null int64
cat_1            169307 non-null int64
cat_2            169307 non-null int64
cat_3            169307 non-null int64
cat_4            169307 non-nu