# 1. Data Cleaning

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

# One seed value shared by every random number generator seeded below.
SEED = 123

# Seed NumPy's global generator (scikit-learn falls back to it when no
# `random_state` is passed).
np.random.seed(SEED)
# TensorFlow maintains its own generator; seeding NumPy alone leaves it
# unseeded, so model training would differ from run to run.
# NOTE(review): some GPU ops may still be non-deterministic — confirm if
# bit-exact reproducibility is required.
tf.random.set_seed(SEED)

## Load Data

In [None]:
# Read the training CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='Midterm Exam/data/train.csv')
df.head(n=5)

Unnamed: 0,id,target,car_01_cat,car_02_cat,car_03_cat,car_04_cat,car_05,car_06,car_07_cat,car_08_cat,...,idx_14,idx_15_bin,idx_16_bin,idx_17,idx_18,idx_19,idx_20,loc_01,loc_02,loc_03
0,C000000,0,4,-1,0,12,3.316625,0.37081,1,1,...,7,0,0,0,3,7,1,0.6,0.7,0.878564
1,C000001,0,0,1,0,39,2.44949,0.374433,1,1,...,11,0,1,0,6,8,1,0.6,0.6,0.981708
2,C000002,0,0,-1,2,100,3.464102,0.423084,1,1,...,11,0,0,0,1,8,3,0.2,0.9,0.495606
3,C000003,0,9,1,0,93,3.741657,0.316228,0,0,...,7,0,0,3,3,8,4,1.8,0.9,2.066549
4,C000004,0,0,0,1,104,3.741657,0.430116,1,0,...,6,0,0,1,1,7,4,0.0,0.4,0.740355


## Drop the Identifier Column

In [None]:
# Remove the `id` column, then preview the result.
# Note: `df` is rebound here, so running this cell a second time raises a
# KeyError because `id` is already gone.
df = df.drop(columns='id')
df.head(n=5)

Unnamed: 0,target,car_01_cat,car_02_cat,car_03_cat,car_04_cat,car_05,car_06,car_07_cat,car_08_cat,car_09,...,idx_14,idx_15_bin,idx_16_bin,idx_17,idx_18,idx_19,idx_20,loc_01,loc_02,loc_03
0,0,4,-1,0,12,3.316625,0.37081,1,1,0.754792,...,7,0,0,0,3,7,1,0.6,0.7,0.878564
1,0,0,1,0,39,2.44949,0.374433,1,1,0.61478,...,11,0,1,0,6,8,1,0.6,0.6,0.981708
2,0,0,-1,2,100,3.464102,0.423084,1,1,0.815665,...,11,0,0,0,1,8,3,0.2,0.9,0.495606
3,0,9,1,0,93,3.741657,0.316228,0,0,1.068114,...,7,0,0,3,3,8,4,1.8,0.9,2.066549
4,0,0,0,1,104,3.741657,0.430116,1,0,1.509768,...,6,0,0,1,1,7,4,0.0,0.4,0.740355


## Split into INPUT and TARGET variables

In [None]:
# Features are every column except `target`; the label is the `target` column.
df_input = df.drop(columns='target')
df_target = df['target']

## Split into TRAIN and TEST sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows, stratified on the target so that both partitions
# keep the class proportions of the full data set.
# `random_state` is pinned so the split no longer depends on how much of the
# global NumPy random stream has been consumed: re-running this cell (or
# adding cells above it) reproduces exactly the same partition.
train_input, test_input, train_target, test_target = train_test_split(
    df_input,
    df_target,
    test_size=0.3,
    stratify=df_target,
    random_state=123,
)

## Data Normalization: Min-Max Scaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training partition only, then apply that same fitted
# scaler to both partitions.
# Caveat: `train_input` / `test_input` are rebound to the scaled arrays, so
# re-running this cell would refit the scaler on already-scaled data.
scaler = MinMaxScaler().fit(train_input)

train_input, test_input = (
    scaler.transform(train_input),
    scaler.transform(test_input),
)

# 2. Modeling

## Model Definition

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.metrics import AUC

# Fully connected binary classifier: two ReLU hidden layers (8 and 4 units)
# followed by a single sigmoid output unit. The input width is taken from the
# number of columns in the scaled training matrix.
model = Sequential([
    Dense(8, activation='relu', input_shape=(train_input.shape[1], )),
    Dense(4, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss with RMSprop; accuracy and AUC are tracked.
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['acc', AUC()],
)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 8)                 464       
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 36        
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 5         
Total params: 505
Trainable params: 505
Non-trainable params: 0
_________________________________________________________________


## Model Fitting

In [None]:
# Train for 20 epochs in mini-batches of 32; the held-out split is passed as
# validation data.
model.fit(
    x=train_input,
    y=train_target,
    batch_size=32,
    epochs=20,
    validation_data=(test_input, test_target),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x2b4da1fadc0>

# 3. Model Evaluations

## Test Sample Loading and Evaluating

In [None]:
# Load the labelled test sample and drop its `id` column in one step.
df_test = pd.read_csv('Midterm Exam/data/test_sample.csv').drop(columns='id')

# Separate the label from the features, mirroring the training preparation.
df_test_input = df_test.drop(columns='target')
df_test_target = df_test['target']

df_test_input.head(n=5)

Unnamed: 0,car_01_cat,car_02_cat,car_03_cat,car_04_cat,car_05,car_06,car_07_cat,car_08_cat,car_09,car_10,...,idx_14,idx_15_bin,idx_16_bin,idx_17,idx_18,idx_19,idx_20,loc_01,loc_02,loc_03
0,0,1,0,22,3.464102,0.310322,1,1,0.776608,3,...,11,0,0,1,1,9,3,0.3,0.7,0.917878
1,1,0,0,31,3.162278,0.385487,1,1,0.773609,3,...,9,0,1,0,3,4,1,1.4,0.8,1.387894
2,11,-1,0,7,2.236068,0.412916,1,1,0.683271,3,...,11,0,0,2,2,12,5,0.2,0.6,0.562917
3,0,-1,0,22,3.605551,0.374566,1,1,0.783481,3,...,10,0,0,0,2,6,4,0.1,0.9,0.406202
4,3,1,8,104,3.464102,0.539444,1,1,1.280802,2,...,9,0,0,3,2,7,3,0.0,0.3,0.784219


In [None]:
# Apply the scaler fitted on the training partition (no refitting here).
# Caveat: the name is rebound to the scaled array, so running this cell twice
# would scale the data twice.
df_test_input = scaler.transform(X=df_test_input)

In [None]:
# Summary statistics of the scaled test features, one row per feature.
pd.DataFrame(data=df_test_input).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,35700.0,0.387937,0.323792,0.0,0.058824,0.411765,0.647059,1.0
1,35700.0,0.422801,0.422409,0.0,0.0,0.5,1.0,1.0
2,35700.0,0.081967,0.242369,0.0,0.0,0.0,0.0,1.0
3,35700.0,0.596229,0.319499,0.0,0.300971,0.621359,0.893204,1.0
4,35700.0,0.819094,0.195691,0.0,0.755929,0.886405,0.963624,1.0
5,35700.0,0.779737,0.218867,0.0,0.814238,0.836461,0.853852,0.97263
6,35700.0,0.91556,0.187325,0.5,1.0,1.0,1.0,1.0
7,35700.0,0.830644,0.375071,0.0,1.0,1.0,1.0,1.0
8,35700.0,0.162352,0.065099,0.023456,0.121113,0.148577,0.188771,0.848599
9,35700.0,0.837374,0.207699,0.25,0.75,1.0,1.0,1.0


## Prediction and Evaluation

In [None]:
from sklearn.metrics import roc_auc_score

# Model outputs for the scaled test sample (one sigmoid output per row).
pred = model.predict(x=df_test_input)
pred

array([[0.0475443 ],
       [0.0405857 ],
       [0.03791597],
       ...,
       [0.04487786],
       [0.04613823],
       [0.03584486]], dtype=float32)

In [None]:
# ROC AUC of the predicted scores against the true labels of the test sample.
roc_auc_score(y_true=df_test_target, y_score=pred)

0.5951047294275492

## Full Test Set Loading and Prediction

In [None]:
# Read the full test set and preview its first five rows.
df_test_total = pd.read_csv(filepath_or_buffer='Midterm Exam/data/test_full.csv')
df_test_total.head(n=5)

Unnamed: 0,id,car_01_cat,car_02_cat,car_03_cat,car_04_cat,car_05,car_06,car_07_cat,car_08_cat,car_09,...,idx_14,idx_15_bin,idx_16_bin,idx_17,idx_18,idx_19,idx_20,loc_01,loc_02,loc_03
0,T000000,11,0,0,53,3.162278,0.357071,1,1,0.824621,...,11,1,0,0,4,6,2,0.7,0.9,1.147279
1,T000001,0,0,0,39,2.828427,-1.0,1,1,0.660922,...,11,0,0,2,3,6,3,0.2,0.5,0.67082
2,T000002,0,0,0,22,3.316625,0.374566,1,1,0.752269,...,8,0,0,3,4,3,3,0.9,0.9,1.477117
3,T000003,11,-1,0,104,0.0,0.378814,1,1,0.470273,...,9,1,0,2,2,8,2,0.0,0.0,0.571183
4,T000004,0,0,0,78,3.741657,0.328786,1,0,0.895316,...,9,0,0,1,2,6,3,0.3,0.9,0.75705


In [None]:
# Drop the `id` column and scale with the scaler fitted on the training
# partition, then score every row of the full test set.
df_test_total_input = scaler.transform(df_test_total.drop(columns='id'))

# Note: this rebinds `pred`, replacing the test-sample predictions.
pred = model.predict(x=df_test_total_input)
pred

array([[0.03700715],
       [0.03806365],
       [0.02600452],
       ...,
       [0.01717088],
       [0.03483874],
       [0.03789285]], dtype=float32)