In [265]:
import pandas as pd
import random
import datetime

# Import cc data.

In [266]:
# Load the credit-card transactions and parse `pos_dt` into datetime values
# in the same chained expression.
cc = pd.read_csv('data_pack/cc.csv').assign(
    pos_dt=lambda frame: pd.to_datetime(frame['pos_dt'])
)
cc.head()

Unnamed: 0,cc_no,pos_dt,cc_txn_amt
0,37069,2018-05-10,5000
1,37069,2018-06-04,12000
2,37069,2018-04-03,5000
3,37069,2018-04-22,1600
4,37069,2018-01-21,5000


## Add ISO week-of-year column (stored as `week_of_month`). [ cc data ]

In [267]:
# Tag every transaction with its ISO-8601 week number.
# NOTE: despite the column name, this is the week of the *year* (the same value
# strftime("%V") yields), not a week within the month. The name is kept because
# later cells select on `week_of_month`.
# The vectorized accessor replaces a per-row Python strftime loop; the cast keeps
# the plain int64 dtype that assigning a list of Python ints produced.
cc['week_of_month'] = cc['pos_dt'].dt.isocalendar().week.astype('int64')
cc.head(10)

Unnamed: 0,cc_no,pos_dt,cc_txn_amt,week_of_month
0,37069,2018-05-10,5000,19
1,37069,2018-06-04,12000,23
2,37069,2018-04-03,5000,14
3,37069,2018-04-22,1600,16
4,37069,2018-01-21,5000,3
5,37201,2018-04-13,400,15
6,37201,2018-04-19,300,16
7,37201,2018-01-15,600,3
8,37201,2018-06-13,1500,24
9,37201,2018-01-22,1300,4


## Create a new dataframe grouped by cc_no and week_of_month.

In [268]:
# Total spend per card per week.
# Only `cc_txn_amt` is summed: a bare `.sum()` would also try to add up the
# datetime `pos_dt` column, which raises TypeError on pandas >= 2.0 (older
# versions silently dropped it). Selecting the column keeps the result the same
# three columns: cc_no, week_of_month, cc_txn_amt.
cc_wom = (
    cc.groupby(['cc_no', 'week_of_month'])['cc_txn_amt']
    .sum()
    .reset_index()
)
cc_wom.head()

Unnamed: 0,cc_no,week_of_month,cc_txn_amt
0,2,10,800
1,2,11,3800
2,2,17,18700
3,2,19,800
4,2,20,800


In [269]:
# Distinct week numbers present in the card data, in ascending order.
sorted(cc_wom['week_of_month'].drop_duplicates().tolist())

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26]

# Import Kplus Data

In [270]:
# Load the K PLUS weekly transaction summary.
# The path uses a forward slash like the other loads in this notebook: the
# previous 'data_pack\kplus.csv' relied on the invalid escape sequence `\k`
# and only resolved on Windows.
kplus = pd.read_csv('data_pack/kplus.csv')
kplus['sunday'] = pd.to_datetime(kplus['sunday'])  # parse the week-marker date as datetime
kplus.head(10)

Unnamed: 0,id,sunday,kp_txn_count,kp_txn_amt
0,14802,2018-01-14,2,2400
1,14802,2018-04-01,9,33900
2,14802,2018-05-27,7,6100
3,14802,2018-01-28,3,8500
4,14802,2018-03-25,2,4000
5,14802,2018-05-06,12,35300
6,14802,2018-03-11,1,4500
7,14802,2018-01-07,6,20800
8,14802,2018-06-03,11,42900
9,14802,2018-02-04,11,49800


## Add ISO week-of-year column (stored as `week_of_month`). [ kplus data ]

In [271]:
# Tag every K PLUS row with the ISO-8601 week number of its `sunday` date.
# NOTE: despite the column name, this is the week of the *year* (the same value
# strftime("%V") yields), not a week within the month.
# NOTE(review): ISO weeks run Monday..Sunday, so each `sunday` date is labelled
# with the week that ends on it. Confirm that matches how kplus.csv defines its
# weekly buckets, otherwise these weeks are shifted relative to the card data.
# The vectorized accessor replaces a per-row Python strftime loop; the cast keeps
# the plain int64 dtype that assigning a list of Python ints produced.
kplus['week_of_month'] = kplus['sunday'].dt.isocalendar().week.astype('int64')
kplus.head(10)

Unnamed: 0,id,sunday,kp_txn_count,kp_txn_amt,week_of_month
0,14802,2018-01-14,2,2400,2
1,14802,2018-04-01,9,33900,13
2,14802,2018-05-27,7,6100,21
3,14802,2018-01-28,3,8500,4
4,14802,2018-03-25,2,4000,12
5,14802,2018-05-06,12,35300,18
6,14802,2018-03-11,1,4500,10
7,14802,2018-01-07,6,20800,1
8,14802,2018-06-03,11,42900,22
9,14802,2018-02-04,11,49800,5


In [272]:
# Distinct week numbers present in the K PLUS data, in ascending order.
sorted(kplus['week_of_month'].drop_duplicates().tolist())

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25]

# Import demographics data.

In [273]:
# Load demographics and impute missing values.
demo = pd.read_csv('data_pack/demographics.csv')

# Occupation codes used as the candidate pool for imputing missing `ocp_cd`.
OCP_CODES = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 11.0, 12.0, 13.0]

# A dedicated seeded generator makes the imputed value identical on every
# Restart & Run All; the unseeded module-level random.choice changed the
# features (and therefore the model) from run to run.
# NOTE(review): a single code is drawn once and written into *every* missing
# `ocp_cd`. Confirm that is intended rather than the column mode or a per-row draw.
ocp_fill_rng = random.Random(42)
demo = demo.fillna(value={'ocp_cd': ocp_fill_rng.choice(OCP_CODES)})

# Fill whatever gaps remain in the other columns from the nearest row by index position.
demo = demo.interpolate(method='nearest', limit_direction='forward')
demo.head(10)

Unnamed: 0,id,cc_no,gender,ocp_cd,age
0,1,1,2,9.0,5
1,1,98397,2,9.0,5
2,2,2,2,3.0,4
3,2,9740,2,3.0,4
4,3,3,2,1.0,3
5,4,4,2,3.0,5
6,4,86813,2,3.0,5
7,5,5,2,9.0,4
8,6,6,1,3.0,3
9,6,91379,1,3.0,3


In [274]:
# Distinct occupation codes present after imputation, in ascending order.
sorted(demo['ocp_cd'].drop_duplicates().tolist())

[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 11.0, 12.0, 13.0]

In [275]:
# Demographics ordered by, and indexed on, card number so that per-card
# series can be attached by index alignment.
demo_cc = demo.sort_values(by='cc_no').set_index('cc_no')

In [276]:
# Preview the per-card, per-week totals again before they are attached to demo_cc.
cc_wom.head()

Unnamed: 0,cc_no,week_of_month,cc_txn_amt
0,2,10,800
1,2,11,3800
2,2,17,18700
3,2,19,800
4,2,20,800


In [277]:
# Add one column per week number 1..26 holding each card's `cc_txn_amt` for that week.
# The assignment aligns on the cc_no index, so cards with no row for a week get NaN.
for week_no in range(1, 27):
    weekly_amt = (
        cc_wom.loc[cc_wom['week_of_month'] == week_no]
        .set_index('cc_no')['cc_txn_amt']
    )
    demo_cc['cc_week_' + str(week_no)] = weekly_amt

In [278]:
# Collapse the per-card rows to one row per customer id by averaging every column.
# NOTE(review): for customers holding several cards this averages the weekly
# amounts across cards instead of adding them up. Confirm that is intended.
demo_f = demo_cc.groupby(by='id').mean()
demo_f.sample(n=10)

Unnamed: 0_level_0,gender,ocp_cd,age,cc_week_1,cc_week_2,cc_week_3,cc_week_4,cc_week_5,cc_week_6,cc_week_7,...,cc_week_17,cc_week_18,cc_week_19,cc_week_20,cc_week_21,cc_week_22,cc_week_23,cc_week_24,cc_week_25,cc_week_26
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
46687,2,3.0,3,,,,,,,,...,,,,,,,,,,
40308,1,9.0,3,,,,,300.0,,,...,1200.0,1200.0,1000.0,1400.0,,3600.0,1300.0,,700.0,
59671,1,9.0,3,600.0,1800.0,1000.0,,1700.0,400.0,1400.0,...,100.0,,1800.0,500.0,500.0,500.0,400.0,1400.0,500.0,
21459,2,9.0,4,,,,,,,,...,,,,,,,,,,
46422,1,3.0,4,3000.0,,,,,3000.0,,...,,,2000.0,400.0,,400.0,2500.0,,500.0,
23568,2,9.0,3,1300.0,,300.0,300.0,1400.0,700.0,800.0,...,1650.0,1500.0,1300.0,,1800.0,1400.0,1100.0,2300.0,300.0,1800.0
36405,1,9.0,4,10000.0,5000.0,,,,,,...,,,1100.0,,,,800.0,,,
27164,1,3.0,3,200.0,,5400.0,4600.0,35400.0,9850.0,6750.0,...,,3200.0,11500.0,2100.0,5900.0,5300.0,3300.0,1800.0,1800.0,
46942,1,3.0,3,,,,,,,,...,,,,,,,32400.0,,,
56154,2,9.0,5,,,,,,,,...,,,,,,,,,,


In [279]:
# Add one column per week number 1..25 holding each customer's `kp_txn_amt` for that week.
# The assignment aligns on the id index, so customers with no row for a week get NaN.
for week_no in range(1, 26):
    weekly_kp_amt = (
        kplus.loc[kplus['week_of_month'] == week_no]
        .set_index('id')['kp_txn_amt']
    )
    demo_f['kp_week_' + str(week_no)] = weekly_kp_amt

### Extract more features [ one-hot ocp_cd ]

In [288]:
# Downcast the occupation code to the smallest integer dtype that can hold it,
# so the one-hot column names come out as ocp_9 rather than ocp_9.0.
demo_f['ocp_cd'] = pd.to_numeric(demo_f['ocp_cd'], downcast='integer')

In [290]:
# One indicator column (0/1 integers) per occupation code, named ocp_<code>.
onehot_ocp = pd.get_dummies(data=demo_f['ocp_cd'], prefix='ocp', dtype=int)

In [291]:
# Display the one-hot occupation indicators (one row per customer id).
onehot_ocp

Unnamed: 0_level_0,ocp_1,ocp_2,ocp_3,ocp_4,ocp_5,ocp_6,ocp_7,ocp_8,ocp_9,ocp_11,ocp_12,ocp_13
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,0,0,0,0,0,0,0,1,0,0,0
2,0,0,1,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
64996,0,0,1,0,0,0,0,0,0,0,0,0
64997,0,0,1,0,0,0,0,0,0,0,0,0
64998,0,0,0,0,0,0,0,0,1,0,0,0
64999,0,0,1,0,0,0,0,0,0,0,0,0


In [292]:
# Place the occupation indicator columns next to the existing customer features,
# matching rows on the shared id index.
demo_onehot = pd.concat(objs=(demo_f, onehot_ocp), axis='columns')

In [293]:
# Display the combined feature table (weekly amounts plus occupation indicators).
demo_onehot

Unnamed: 0_level_0,gender,ocp_cd,age,cc_week_1,cc_week_2,cc_week_3,cc_week_4,cc_week_5,cc_week_6,cc_week_7,...,ocp_3,ocp_4,ocp_5,ocp_6,ocp_7,ocp_8,ocp_9,ocp_11,ocp_12,ocp_13
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2,9,5,,,4700.0,,,,20000.0,...,0,0,0,0,0,0,1,0,0,0
2,2,3,4,1600.0,,,3500.0,2600.0,800.0,,...,1,0,0,0,0,0,0,0,0,0
3,2,1,3,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
4,2,3,5,,,,,,,,...,1,0,0,0,0,0,0,0,0,0
5,2,9,4,,,5000.0,2000.0,,5000.0,,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64996,2,3,2,,,,,,,,...,1,0,0,0,0,0,0,0,0,0
64997,2,3,2,,,,,,,,...,1,0,0,0,0,0,0,0,0,0
64998,1,9,3,,,,,,,,...,0,0,0,0,0,0,1,0,0,0
64999,2,3,3,,900.0,,1000.0,,,,...,1,0,0,0,0,0,0,0,0,0


### Fill na with zero

In [294]:
# Every remaining NaN becomes 0, so a week with no recorded amount counts as zero.
demo_f = demo_onehot.fillna(value=0)

### Drop ocp_cd

In [295]:
# Remove the raw occupation code now that it is represented by the ocp_* indicators.
# Rebinding instead of dropping in place, together with errors='ignore', keeps the
# cell safe to re-run: the in-place drop raised KeyError on a second execution
# because the column was already gone.
demo_f = demo_f.drop(columns='ocp_cd', errors='ignore')

In [296]:
# Preview the first rows of the final feature table.
demo_f.head()

Unnamed: 0_level_0,gender,age,cc_week_1,cc_week_2,cc_week_3,cc_week_4,cc_week_5,cc_week_6,cc_week_7,cc_week_8,...,ocp_3,ocp_4,ocp_5,ocp_6,ocp_7,ocp_8,ocp_9,ocp_11,ocp_12,ocp_13
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2,5,0.0,0.0,4700.0,0.0,0.0,0.0,20000.0,0.0,...,0,0,0,0,0,0,1,0,0,0
2,2,4,1600.0,0.0,0.0,3500.0,2600.0,800.0,0.0,3800.0,...,1,0,0,0,0,0,0,0,0,0
3,2,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
5,2,4,0.0,0.0,5000.0,2000.0,0.0,5000.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0


In [297]:
# Preview the last rows of the final feature table.
demo_f.tail()

Unnamed: 0_level_0,gender,age,cc_week_1,cc_week_2,cc_week_3,cc_week_4,cc_week_5,cc_week_6,cc_week_7,cc_week_8,...,ocp_3,ocp_4,ocp_5,ocp_6,ocp_7,ocp_8,ocp_9,ocp_11,ocp_12,ocp_13
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
64996,2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
64997,2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
64998,1,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
64999,2,3,0.0,900.0,0.0,1000.0,0.0,0.0,0.0,2400.0,...,1,0,0,0,0,0,0,0,0,0
65000,2,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0


----------

# Prepare data for train and test.

In [298]:
# Load the training table; its `income` column is used as the regression target.
train = pd.read_csv(filepath_or_buffer='data_pack/train.csv')

In [299]:
# Hold-out split by fixed cut-offs. All `.loc` slices are label-based and inclusive.
# demo_f is indexed by customer id, while train.income is sliced with labels one
# lower than the matching id cut-offs.
# NOTE(review): features and targets are paired purely by position. This assumes
# row k of train.csv belongs to customer id k + 1 and that no id is missing from
# demo_f. Verify against train.csv.
last_fit_id = 45000
last_holdout_id = 50000

X_train = demo_f.loc[0:last_fit_id]
X_test = demo_f.loc[last_fit_id + 1:last_holdout_id]

y_train = train.income.loc[0:last_fit_id - 1]
y_test = train.income.loc[last_fit_id:last_holdout_id]

In [300]:
# Display the training target to check its length against X_train.
y_train

0         20000
1        106000
2         29000
3         61000
4         18000
          ...  
44995     21000
44996     19000
44997     22000
44998     25000
44999     22000
Name: income, Length: 45000, dtype: int64

In [301]:
# Feature rows for customer ids from 50001 onwards; these are scored for the submission.
demo_test = demo_f.loc[50001:, :]

----------

# TPOT AutoML model.

In [None]:
from tpot import TPOTRegressor

# TPOT's pipeline search is stochastic. Fixing random_state makes the selected
# pipeline and its score repeatable across Restart & Run All; it was previously
# unseeded. n_jobs=-1 uses every available core.
tpot = TPOTRegressor(n_jobs=-1, random_state=42)

tpot.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the held-out rows and print the result.
holdout_score = tpot.score(X_test, y_test)
print(holdout_score)

In [None]:
# tpot.score(demo_train, train.income)

In [None]:
# Load the submission template and attach the predicted income for each row.
# NOTE(review): predictions are attached by position. This assumes test.csv rows
# are in the same order as the ids in demo_test. Verify.
test = pd.read_csv('data_pack/test.csv').assign(income=tpot.predict(demo_test))

In [None]:
# Preview the submission table with the predicted `income` column.
test.head()

In [None]:
# test.to_csv('tpot02.csv', index=False)