In [1]:
import pandas as pd
import datetime

# Import cc data.

In [3]:
cc = pd.read_csv('data_pack/cc.csv')
cc['pos_dt'] = pd.to_datetime(cc['pos_dt']) # convert to datetime formatted.
cc.head()

Unnamed: 0,cc_no,pos_dt,cc_txn_amt
0,37069,2018-05-10,5000
1,37069,2018-06-04,12000
2,37069,2018-04-03,5000
3,37069,2018-04-22,1600
4,37069,2018-01-21,5000


## Add column week of month. [ cc data ]

In [4]:
week_of_month = [int(i.strftime("%V")) for i in cc['pos_dt']]
cc['week_of_month'] = week_of_month
cc.head(10)

Unnamed: 0,cc_no,pos_dt,cc_txn_amt,week_of_month
0,37069,2018-05-10,5000,19
1,37069,2018-06-04,12000,23
2,37069,2018-04-03,5000,14
3,37069,2018-04-22,1600,16
4,37069,2018-01-21,5000,3
5,37201,2018-04-13,400,15
6,37201,2018-04-19,300,16
7,37201,2018-01-15,600,3
8,37201,2018-06-13,1500,24
9,37201,2018-01-22,1300,4


## Create new dataframe that group by cc_no and week_of_month.

In [5]:
cc_wom = cc.groupby(['cc_no', 'week_of_month']).sum()
cc_wom = cc_wom.reset_index()
cc_wom.head()

Unnamed: 0,cc_no,week_of_month,cc_txn_amt
0,2,10,800
1,2,11,3800
2,2,17,18700
3,2,19,800
4,2,20,800


In [6]:
sorted(cc_wom.week_of_month.value_counts().index) # number of week

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26]

# Import Kplus Data

In [7]:
kplus = pd.read_csv('data_pack\kplus.csv')
kplus['sunday'] = pd.to_datetime(kplus['sunday']) # convert to datetime formatted.
kplus.head(10)

Unnamed: 0,id,sunday,kp_txn_count,kp_txn_amt
0,14802,2018-01-14,2,2400
1,14802,2018-04-01,9,33900
2,14802,2018-05-27,7,6100
3,14802,2018-01-28,3,8500
4,14802,2018-03-25,2,4000
5,14802,2018-05-06,12,35300
6,14802,2018-03-11,1,4500
7,14802,2018-01-07,6,20800
8,14802,2018-06-03,11,42900
9,14802,2018-02-04,11,49800


## Add column week of month. [ kplus data ]

In [8]:
kplus_week_of_month = [int(i.strftime("%V")) for i in kplus['sunday']]
kplus['week_of_month'] = kplus_week_of_month # add column week_of_month.
kplus.head(10)

Unnamed: 0,id,sunday,kp_txn_count,kp_txn_amt,week_of_month
0,14802,2018-01-14,2,2400,2
1,14802,2018-04-01,9,33900,13
2,14802,2018-05-27,7,6100,21
3,14802,2018-01-28,3,8500,4
4,14802,2018-03-25,2,4000,12
5,14802,2018-05-06,12,35300,18
6,14802,2018-03-11,1,4500,10
7,14802,2018-01-07,6,20800,1
8,14802,2018-06-03,11,42900,22
9,14802,2018-02-04,11,49800,5


In [9]:
sorted(kplus.week_of_month.value_counts().index) # number of week

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25]

# Import demographics data.

In [10]:
import random

In [11]:
demo = pd.read_csv('data_pack/demographics.csv')
demo = demo.fillna(value={'ocp_cd':random.choice([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 11.0, 12.0, 13.0])})
demo = demo.interpolate(method ='nearest', limit_direction ='forward')
demo.head(10)

Unnamed: 0,id,cc_no,gender,ocp_cd,age
0,1,1,2,9.0,5
1,1,98397,2,9.0,5
2,2,2,2,3.0,4
3,2,9740,2,3.0,4
4,3,3,2,1.0,3
5,4,4,2,3.0,5
6,4,86813,2,3.0,5
7,5,5,2,9.0,4
8,6,6,1,3.0,3
9,6,91379,1,3.0,3


In [12]:
sorted(demo.ocp_cd.value_counts().index)

[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 11.0, 12.0, 13.0]

In [13]:
demo_cc = demo.sort_values('cc_no')
demo_cc = demo_cc.set_index('cc_no')

In [14]:
cc_wom.head()

Unnamed: 0,cc_no,week_of_month,cc_txn_amt
0,2,10,800
1,2,11,3800
2,2,17,18700
3,2,19,800
4,2,20,800


In [15]:
for i in range(1,27):
    tmp = cc_wom[cc_wom['week_of_month'] == i]
    tmp = tmp.set_index('cc_no')
    demo_cc['cc_week_'+str(i)] = tmp.cc_txn_amt

In [16]:
demo_f = demo_cc.groupby('id').mean()
demo_f.sample(10)

Unnamed: 0_level_0,gender,ocp_cd,age,cc_week_1,cc_week_2,cc_week_3,cc_week_4,cc_week_5,cc_week_6,cc_week_7,...,cc_week_17,cc_week_18,cc_week_19,cc_week_20,cc_week_21,cc_week_22,cc_week_23,cc_week_24,cc_week_25,cc_week_26
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20053,2,3.0,3,,,,,,,,...,,,,,,,,,,
1250,2,3.0,3,5350.0,4100.0,2050.0,3200.0,600.0,2500.0,1100.0,...,1600.0,2950.0,2500.0,3500.0,1800.0,5850.0,5950.0,7200.0,6400.0,3950.0
11176,1,3.0,3,,11200.0,,,3000.0,,,...,,,,,,,,,,
29909,1,3.0,3,,,1600.0,1300.0,500.0,500.0,2000.0,...,2600.0,3100.0,,,,1000.0,4400.0,2500.0,500.0,2800.0
24457,2,3.0,4,,,,,,,,...,,,,,,,,,,
7513,2,3.0,4,4500.0,,,,,14500.0,,...,,,,4000.0,,,,,3500.0,
56303,1,3.0,4,1900.0,11000.0,,17200.0,,,24300.0,...,9700.0,200.0,,,11500.0,100.0,,,,13000.0
55705,1,3.0,3,900.0,,2900.0,1000.0,900.0,800.0,,...,,2800.0,1100.0,800.0,1200.0,2100.0,,1800.0,1100.0,
50539,2,3.0,3,,79800.0,42000.0,23700.0,1300.0,,1200.0,...,2900.0,700.0,,2300.0,,12000.0,,,,1100.0
55752,2,3.0,4,,,,,1100.0,3100.0,28600.0,...,38400.0,4900.0,6000.0,14700.0,10000.0,,2000.0,,,2500.0


In [17]:
for i in range(1,26):
    tmp = kplus[kplus['week_of_month'] == i]
    tmp = tmp.set_index('id')
    demo_f['kp_week_'+str(i)] = tmp.kp_txn_amt

In [18]:
# demo_f = demo_f.fillna(0)
demo_f.head(10)
# demo_f = demo_f.interpolate(method ='linear', limit_direction='both')
# demo_f.head(10)

Unnamed: 0_level_0,gender,ocp_cd,age,cc_week_1,cc_week_2,cc_week_3,cc_week_4,cc_week_5,cc_week_6,cc_week_7,...,kp_week_16,kp_week_17,kp_week_18,kp_week_19,kp_week_20,kp_week_21,kp_week_22,kp_week_23,kp_week_24,kp_week_25
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2,9.0,5,,,4700.0,,,,20000.0,...,,,,,,,1500.0,,2000.0,
2,2,3.0,4,1600.0,,,3500.0,2600.0,800.0,,...,4200.0,26900.0,20600.0,1500.0,1100.0,1700.0,27700.0,1600.0,1600.0,40300.0
3,2,1.0,3,,,,,,,,...,5000.0,17400.0,300.0,10200.0,1900.0,4000.0,19900.0,100.0,200.0,2500.0
4,2,3.0,5,,,,,,,,...,678400.0,217700.0,430300.0,87900.0,158400.0,52400.0,290400.0,500200.0,225200.0,394800.0
5,2,9.0,4,,,5000.0,2000.0,,5000.0,,...,,,,,,,,,,
6,1,3.0,3,,,,,,,,...,,,,,,,12200.0,200.0,1000.0,
7,1,3.0,5,850.0,,600.0,,,,1450.0,...,9700.0,6400.0,71300.0,16800.0,14600.0,4300.0,91400.0,16900.0,9200.0,37900.0
8,2,9.0,3,,,4900.0,,,,2000.0,...,24000.0,25900.0,54400.0,34300.0,18600.0,30400.0,31000.0,32200.0,22700.0,27700.0
9,1,9.0,4,,,,2000.0,,10000.0,,...,,,2300.0,100.0,,900.0,300.0,,200.0,
10,1,3.0,3,,,,,,,,...,500.0,,500.0,2500.0,2000.0,8000.0,,30000.0,500.0,9900.0


# Extract more feture

In [19]:
demo_f.head()

Unnamed: 0_level_0,gender,ocp_cd,age,cc_week_1,cc_week_2,cc_week_3,cc_week_4,cc_week_5,cc_week_6,cc_week_7,...,kp_week_16,kp_week_17,kp_week_18,kp_week_19,kp_week_20,kp_week_21,kp_week_22,kp_week_23,kp_week_24,kp_week_25
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2,9.0,5,,,4700.0,,,,20000.0,...,,,,,,,1500.0,,2000.0,
2,2,3.0,4,1600.0,,,3500.0,2600.0,800.0,,...,4200.0,26900.0,20600.0,1500.0,1100.0,1700.0,27700.0,1600.0,1600.0,40300.0
3,2,1.0,3,,,,,,,,...,5000.0,17400.0,300.0,10200.0,1900.0,4000.0,19900.0,100.0,200.0,2500.0
4,2,3.0,5,,,,,,,,...,678400.0,217700.0,430300.0,87900.0,158400.0,52400.0,290400.0,500200.0,225200.0,394800.0
5,2,9.0,4,,,5000.0,2000.0,,5000.0,,...,,,,,,,,,,


In [20]:
ocp_1 = [1 if i == 1 else 0 for i in demo_f.ocp_cd]
ocp_2 = [1 if i == 2 else 0 for i in demo_f.ocp_cd]
ocp_3 = [1 if i == 3 else 0 for i in demo_f.ocp_cd]
ocp_4 = [1 if i == 4 else 0 for i in demo_f.ocp_cd]
ocp_5 = [1 if i == 5 else 0 for i in demo_f.ocp_cd]
ocp_6 = [1 if i == 6 else 0 for i in demo_f.ocp_cd]
ocp_7 = [1 if i == 7 else 0 for i in demo_f.ocp_cd]
ocp_8 = [1 if i == 8 else 0 for i in demo_f.ocp_cd]
ocp_9 = [1 if i == 9 else 0 for i in demo_f.ocp_cd]
ocp_11 = [1 if i == 11 else 0 for i in demo_f.ocp_cd]
ocp_12 = [1 if i == 12 else 0 for i in demo_f.ocp_cd]
ocp_13 = [1 if i == 13 else 0 for i in demo_f.ocp_cd]

In [21]:
demo_f.drop('ocp_cd', inplace=True, axis=1)

In [22]:
demo_f['ocp_1'] = ocp_1
demo_f['ocp_2'] = ocp_2
demo_f['ocp_3'] = ocp_3
demo_f['ocp_4'] = ocp_4
demo_f['ocp_5'] = ocp_5
demo_f['ocp_6'] = ocp_6
demo_f['ocp_7'] = ocp_7
demo_f['ocp_8'] = ocp_8
demo_f['ocp_9'] = ocp_9
demo_f['ocp_11'] = ocp_11
demo_f['ocp_12'] = ocp_12
demo_f['ocp_13'] = ocp_13

In [23]:
demo_f.head()

Unnamed: 0_level_0,gender,age,cc_week_1,cc_week_2,cc_week_3,cc_week_4,cc_week_5,cc_week_6,cc_week_7,cc_week_8,...,ocp_3,ocp_4,ocp_5,ocp_6,ocp_7,ocp_8,ocp_9,ocp_11,ocp_12,ocp_13
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2,5,,,4700.0,,,,20000.0,,...,0,0,0,0,0,0,1,0,0,0
2,2,4,1600.0,,,3500.0,2600.0,800.0,,3800.0,...,1,0,0,0,0,0,0,0,0,0
3,2,3,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
4,2,5,,,,,,,,,...,1,0,0,0,0,0,0,0,0,0
5,2,4,,,5000.0,2000.0,,5000.0,,,...,0,0,0,0,0,0,1,0,0,0


# Fill NaN with median of each job

In [24]:
demo_f.columns

Index(['gender', 'age', 'cc_week_1', 'cc_week_2', 'cc_week_3', 'cc_week_4',
       'cc_week_5', 'cc_week_6', 'cc_week_7', 'cc_week_8', 'cc_week_9',
       'cc_week_10', 'cc_week_11', 'cc_week_12', 'cc_week_13', 'cc_week_14',
       'cc_week_15', 'cc_week_16', 'cc_week_17', 'cc_week_18', 'cc_week_19',
       'cc_week_20', 'cc_week_21', 'cc_week_22', 'cc_week_23', 'cc_week_24',
       'cc_week_25', 'cc_week_26', 'kp_week_1', 'kp_week_2', 'kp_week_3',
       'kp_week_4', 'kp_week_5', 'kp_week_6', 'kp_week_7', 'kp_week_8',
       'kp_week_9', 'kp_week_10', 'kp_week_11', 'kp_week_12', 'kp_week_13',
       'kp_week_14', 'kp_week_15', 'kp_week_16', 'kp_week_17', 'kp_week_18',
       'kp_week_19', 'kp_week_20', 'kp_week_21', 'kp_week_22', 'kp_week_23',
       'kp_week_24', 'kp_week_25', 'ocp_1', 'ocp_2', 'ocp_3', 'ocp_4', 'ocp_5',
       'ocp_6', 'ocp_7', 'ocp_8', 'ocp_9', 'ocp_11', 'ocp_12', 'ocp_13'],
      dtype='object')

In [25]:
demo_f

Unnamed: 0_level_0,gender,age,cc_week_1,cc_week_2,cc_week_3,cc_week_4,cc_week_5,cc_week_6,cc_week_7,cc_week_8,...,ocp_3,ocp_4,ocp_5,ocp_6,ocp_7,ocp_8,ocp_9,ocp_11,ocp_12,ocp_13
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2,5,,,4700.0,,,,20000.0,,...,0,0,0,0,0,0,1,0,0,0
2,2,4,1600.0,,,3500.0,2600.0,800.0,,3800.0,...,1,0,0,0,0,0,0,0,0,0
3,2,3,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
4,2,5,,,,,,,,,...,1,0,0,0,0,0,0,0,0,0
5,2,4,,,5000.0,2000.0,,5000.0,,,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64996,2,2,,,,,,,,,...,1,0,0,0,0,0,0,0,0,0
64997,2,2,,,,,,,,,...,1,0,0,0,0,0,0,0,0,0
64998,1,3,,,,,,,,,...,0,0,0,0,0,0,1,0,0,0
64999,2,3,,900.0,,1000.0,,,,2400.0,...,1,0,0,0,0,0,0,0,0,0


In [26]:
cc_week = ['cc_week_1', 'cc_week_2', 'cc_week_3', 'cc_week_4',
       'cc_week_5', 'cc_week_6', 'cc_week_7', 'cc_week_8', 'cc_week_9',
       'cc_week_10', 'cc_week_11', 'cc_week_12', 'cc_week_13', 'cc_week_14',
       'cc_week_15', 'cc_week_16', 'cc_week_17', 'cc_week_18', 'cc_week_19',
       'cc_week_20', 'cc_week_21', 'cc_week_22', 'cc_week_23', 'cc_week_24',
       'cc_week_25', 'cc_week_26']

lst_ocp = ['ocp_1', 'ocp_2', 'ocp_3', 'ocp_4', 'ocp_5',
       'ocp_6', 'ocp_7', 'ocp_8', 'ocp_9', 'ocp_11', 'ocp_12', 'ocp_13']

In [28]:
kp_week = ['kp_week_1', 'kp_week_2', 'kp_week_3',
       'kp_week_4', 'kp_week_5', 'kp_week_6', 'kp_week_7', 'kp_week_8',
       'kp_week_9', 'kp_week_10', 'kp_week_11', 'kp_week_12', 'kp_week_13',
       'kp_week_14', 'kp_week_15', 'kp_week_16', 'kp_week_17', 'kp_week_18',
       'kp_week_19', 'kp_week_20', 'kp_week_21', 'kp_week_22', 'kp_week_23',
       'kp_week_24', 'kp_week_25']

In [27]:
for i in cc_week:
    for j in lst_ocp:
        demo_f[i] = demo_f.groupby([j])[i].apply(lambda x: x.fillna(x.median()))

In [31]:
for i in kp_week:
    for j in lst_ocp:
        demo_f[i] = demo_f.groupby([j])[i].apply(lambda x: x.fillna(x.median()))

In [32]:
# demo_f.isna().sum()

In [33]:
demo_f.head()

Unnamed: 0_level_0,gender,age,cc_week_1,cc_week_2,cc_week_3,cc_week_4,cc_week_5,cc_week_6,cc_week_7,cc_week_8,...,ocp_3,ocp_4,ocp_5,ocp_6,ocp_7,ocp_8,ocp_9,ocp_11,ocp_12,ocp_13
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2,5,2200.0,2000.0,4700.0,2000.0,2100.0,2000.0,20000.0,1900.0,...,0,0,0,0,0,0,1,0,0,0
2,2,4,1600.0,2000.0,1900.0,3500.0,2600.0,800.0,2000.0,3800.0,...,1,0,0,0,0,0,0,0,0,0
3,2,3,2100.0,1900.0,2000.0,1800.0,2050.0,2200.0,2100.0,1900.0,...,0,0,0,0,0,0,0,0,0,0
4,2,5,2200.0,2000.0,1900.0,2000.0,2100.0,2000.0,2000.0,1900.0,...,1,0,0,0,0,0,0,0,0,0
5,2,4,2200.0,2000.0,5000.0,2000.0,2100.0,5000.0,2000.0,1900.0,...,0,0,0,0,0,0,1,0,0,0


----------

# Grid search

In [59]:
from sklearn.model_selection import GridSearchCV

# Create model Random forest.

In [47]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

In [48]:
demo_train = demo_f.loc[1:50000]
demo_test = demo_f.loc[50001:]

In [49]:
train = pd.read_csv('data_pack/train.csv')

In [50]:
regr = RandomForestRegressor(max_depth=100, n_estimators = 100, random_state = 0, n_jobs= 2)
regr.fit(demo_train, train.income)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=100,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
                      oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [51]:
regr.score(demo_train, train.income)

0.8622297416917784

In [52]:
print(regr.feature_importances_)

[1.28838869e-02 4.95449135e-02 1.42140753e-02 1.48983657e-02
 1.31518977e-02 1.34225133e-02 1.48027922e-02 1.53154940e-02
 1.53627178e-02 1.50160121e-02 1.57212570e-02 1.78262011e-02
 1.58267214e-02 1.50945638e-02 2.38375673e-02 1.78183026e-02
 1.75963089e-02 1.68550085e-02 1.58508540e-02 1.45929366e-02
 1.54649826e-02 1.59792101e-02 1.52808158e-02 1.69856743e-02
 1.86706992e-02 1.79359831e-02 2.54261129e-02 1.74863701e-02
 1.58475332e-02 1.13953128e-02 1.13712115e-02 1.53565036e-02
 1.67903224e-02 1.14265787e-02 1.22993526e-02 1.19640623e-02
 1.77344180e-02 1.19058650e-02 1.18852916e-02 1.18078432e-02
 1.77857644e-02 1.36096567e-02 1.25054109e-02 1.35287282e-02
 2.79395361e-02 2.21628926e-02 1.55403389e-02 1.47428194e-02
 2.91238422e-02 8.00289832e-02 2.22854459e-02 1.63162193e-02
 1.94283448e-02 5.40802495e-04 1.48557385e-03 2.50447349e-03
 2.65234705e-02 1.25051161e-03 9.20342371e-05 2.08650302e-03
 7.02753917e-05 3.67966586e-03 3.93754050e-04 1.73015028e-03
 1.99826953e-03]


In [53]:
print(regr.predict(demo_test))

[29120.         24900.         36180.         ... 40550.
 32970.         22529.81795209]


In [55]:
test = pd.read_csv('data_pack/test.csv')
test['income'] = regr.predict(demo_test)

In [57]:
test.head()

Unnamed: 0,id,income
0,50001,29120.0
1,50002,24900.0
2,50003,36180.0
3,50004,34120.0
4,50005,43380.0


In [58]:
test.tail()

Unnamed: 0,id,income
14995,64996,21790.0
14996,64997,40140.0
14997,64998,40550.0
14998,64999,32970.0
14999,65000,22529.817952


In [56]:
# test.to_csv('test_indy_06.csv', index=False)

# TPOD Auto model.

In [60]:
from tpot import TPOTClassifier
from tpot import TPOTRegressor

tpot = TPOTClassifier(generations=5,verbosity=2, n_jobs=15)

tpot.fit(demo_train, train.income)



HBox(children=(IntProgress(value=0, description='Optimization Progress', max=600, style=ProgressStyle(descript…



TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.


RuntimeError: There was an error in the TPOT optimization process. This could be because the data was not formatted properly, or because data for a regression problem was provided to the TPOTClassifier object. Please make sure you passed the data to TPOT correctly.