## Customer Transaction Prediction

이 프로젝트의 목표는 고객들의 히스토리 데이터를 분석하여, 고객의 거래 된 금액에 관계없이, 향후 어떤 고객이 특정 거래를 할 것인지 파악할 수 있도록 하는것이 목표입니다. `target`이 0이면 거래를 할 수 없다는 것을 의미하고, 1이면 거래를 할 수 있다는 것을 의미합니다.

 - Target의 분포를 살펴보면 0과 1의 비율이 맞지 않습니다. 즉 imbalance 데이터 입니다. 이런 데이터를 다루는 기법중의 하나는 두 데이터의 크기를 1:1로 맞추는 것입니다. 두 카테고리의 데이터 수가 맞도록 샘플링 하세요.
 - 우리의 목표는 주어진 데이터를 활용하여 즉 `var_0` ~ `var_99`를 이용하여 `target`을 예측하는 것입니다. 분류기중에서는 우리가 배웠던 naive bayes를 활용합니다.

## 1. Import Dataset

In [2]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [121]:
transaction = pd.read_csv('train.csv')
transaction.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [122]:
transaction.shape

(200000, 202)

## 2. Exploratory Data Analysis

* Check if there are any missing values
* Observe distribution of each features in negative and positive dataset

In [123]:
transaction.isnull().values.any()

False

In [124]:
print("Number of False values : " + str(sum(transaction.target == 0)))
print("Number of True values : " + str(sum(transaction.target == 1)))

Number of False values : 179902
Number of True values : 20098


In [125]:
#describing data where target ==0
transaction[transaction.target == 0].head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [126]:
#describing data where target ==1
transaction[transaction.target == 1].head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
13,train_13,1,16.3699,1.5934,16.7395,7.333,12.145,5.9004,4.8222,20.9729,...,7.4002,7.4031,4.3989,4.0978,17.3638,-1.3022,9.6846,9.0419,15.6064,-10.8529
29,train_29,1,5.3301,-2.6064,13.1913,3.1193,6.6483,-6.5659,5.9064,15.2341,...,5.9215,7.9676,2.3405,1.1482,23.2168,-2.0105,3.76,9.4513,17.4105,-14.6897
63,train_63,1,7.7072,0.0183,9.9974,8.3524,9.2886,-13.3627,6.0425,10.1108,...,1.5832,5.0039,3.8814,7.4241,21.4844,-0.8297,-3.0468,7.579,15.7685,5.4769
65,train_65,1,10.5358,-2.5439,8.7394,6.7548,14.4099,-3.8724,5.1584,15.8381,...,4.6648,6.4227,3.4025,-4.0882,14.1174,-0.2472,5.3847,8.6949,15.134,3.8449
71,train_71,1,6.7547,2.5973,14.2141,8.3514,7.4942,-1.3055,4.2336,15.0243,...,1.4677,3.5935,2.0013,1.5777,18.282,-4.3408,6.8869,9.3567,18.9013,13.3447


There are no missing values, and the overall distribution of each features in both neg/pos dataset look normal

## 3. Data Preprocessing

* Remove ID_code, and separate target values
* We have an imbalance of data, which means we have to manually distribute each positive and negative data so their proportion becomes 1:1
* There are **20098** True values
* There are **179902** False values

Solution : We will sample 20,000 from each positive and negative dataset (very Naive)

In [127]:
#drop ID_code column (it is unnecessary)
transaction.drop('ID_code', axis=1, inplace=True)

In [128]:
#preview
transaction.head()

Unnamed: 0,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,-4.92,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,3.1468,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,-4.9193,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,-5.8609,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,6.2654,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [129]:
#sample 20,000 data from positive and negative data
# neg_tran 
pos_train = transaction[transaction.target == 1].sample(20000)
neg_train = transaction[transaction.target == 0].sample(20000)

In [130]:
print(neg_tran.shape)
print(pos_tran.shape)

(20000, 202)
(20000, 202)


In [133]:
#combine positive and negative dataset
train_x = pd.concat([pos_train, neg_train], axis=0)

In [134]:
train_x.head()

Unnamed: 0,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
101725,1,9.9711,0.8502,14.1119,2.8107,9.2741,-10.6025,4.7811,12.5217,-0.5046,...,10.9389,6.5263,2.2801,1.5844,20.4949,0.4472,3.6463,6.7398,20.5159,-10.1031
151394,1,12.8135,-2.7276,12.3384,4.2318,11.3298,1.4456,4.2775,15.0577,3.8282,...,7.4779,9.2331,-0.7113,2.9214,16.5525,0.0095,-0.5272,9.4357,12.7172,2.1021
8467,1,7.2776,4.0098,12.4579,9.321,10.8479,-2.6865,5.5857,14.8405,-0.3773,...,2.7245,12.2529,2.3945,1.6155,25.8199,0.6721,9.8138,10.3855,18.0504,11.1453
144392,1,16.5156,-4.4128,9.3478,6.7104,10.5278,8.2418,5.1777,13.5222,-0.8127,...,0.243,4.64,2.1277,3.1163,18.6902,-2.1615,-0.4428,8.8983,13.8591,-9.6902
141138,1,5.6468,-2.4979,10.9132,5.9008,11.0756,9.5846,7.2754,17.9092,1.1877,...,-1.0542,6.7948,2.4716,5.8313,14.3246,0.1338,1.4132,9.9377,18.0817,-0.9618


In [135]:
print("Size of train data : " + str(train_x.shape))

Size of train data : (40000, 201)


In [136]:
train_y = train_x.pop('target')

In [137]:
train_y.head()

101725    1
151394    1
8467      1
144392    1
141138    1
Name: target, dtype: int64

## 4. Split Training/Test Data

In [138]:
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(train_x, train_y, test_size=0.3, random_state=42)

In [142]:
print("train x size : " + str(train_x.shape))
print("train y size : " + str(train_y.shape))
print("test x size : " + str(test_x.shape))
print("test y size : " + str(test_y.shape))

train x size : (28000, 200)
train y size : (28000,)
test x size : (12000, 200)
test y size : (12000,)


## 5. Train Data

In [143]:
from sklearn.preprocessing import StandardScaler

In [144]:
scaler = StandardScaler()

# scale training data
scaler = scaler.fit(train_x)
scaled_train_x = scaler.transform(train_x)

# scale test data
scaled_test_x = scaler.transform(test_x)

In [145]:
#double check shape
scaled_train_x.shape

(28000, 200)

In [146]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

gnb_model = gnb.fit(scaled_train_x, train_y)

In [151]:
#test accuracy from train dataset
train_pred = gnb_model.predict(scaled_train_x)

In [152]:
(train_pred == train_y).sum() / len(train_y)

0.8096428571428571

## 6. Test Model

In [153]:
pred = gnb_model.predict(scaled_test_x)

In [155]:
(pred == test_y).sum() / len(test_y)

0.8074166666666667