In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import xgboost as xgb




In [2]:
# Member profiles: both date columns are parsed, small integer dtypes keep memory down.
members = pd.read_csv(
    "/mnt/volume-sgp1-02/members.csv",
    parse_dates=['registration_init_time', 'expiration_date'],
    dtype={'city': np.int8, 'bd': np.int16, 'registered_via': np.int8},
)
# Churn labels, one is_churn flag per msno.
train = pd.read_csv(
    '/mnt/volume-sgp1-02/train.csv',
    dtype={'is_churn': np.int8},
)
# Transaction log: both date columns are parsed, numeric columns use compact integer dtypes.
transactions = pd.read_csv(
    '/mnt/volume-sgp1-02/transactions.csv',
    parse_dates=['transaction_date', 'membership_expire_date'],
    dtype={
        'payment_method_id': np.int8,
        'payment_plan_days': np.int16,
        'plan_list_price': np.int16,
        'actual_amount_paid': np.int16,
        'is_auto_renew': np.int8,
        'is_cancel': np.int8,
    },
)

In [3]:
# Load the sample submission file. Later cells index it by msno, drop its
# is_churn column, join the features onto it and fill in the predictions.
test = pd.read_csv("/mnt/volume-sgp1-02/sample_submission_zero.csv")

In [4]:
# Attach member profile columns to every labelled user; the left join keeps
# every row of `train` even when no matching member record exists.
df_train = train.merge(members, how='left', on="msno")
df_train.head()

Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,registration_init_time,expiration_date
0,waLDQMmcOu2jLDaV1ddDkgCrB/jl6sD66Xzs0Vqax1Y=,1,18.0,36.0,female,9.0,2005-04-06,2017-09-07
1,QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ=,1,10.0,38.0,male,9.0,2005-04-07,2017-03-21
2,fGwBva6hikQmTJzrbz/2Ezjm5Cth5jZUNvXigKK2AFA=,1,11.0,27.0,female,9.0,2005-10-16,2017-02-03
3,mT5V8rEpa+8wuqi6x0DoVd3H5icMKkE9Prt49UlmK+4=,1,13.0,23.0,female,9.0,2005-11-02,2017-09-26
4,XaPhtGLk/5UvvOYHcONTwsnH97P4eGECeq+BARGItRw=,1,3.0,27.0,male,9.0,2005-12-28,2017-09-27


In [5]:
# Preview the first rows of the raw transaction log.
transactions.head()

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
0,YyO+tlZtAXYXoZhNr3Vg3+dfVQvrBVGO8j1mfqe4ZHc=,41,30,129,129,1,2015-09-30,2015-11-01,0
1,AZtu6Wl0gPojrEQYB8Q3vBSmE2wnZ3hi1FbK1rQQ0A4=,41,30,149,149,1,2015-09-30,2015-10-31,0
2,UkDFI97Qb6+s2LWcijVVv4rMAsORbVDT2wNXF0aVbns=,41,30,129,129,1,2015-09-30,2016-04-27,0
3,M1C56ijxozNaGD0t2h68PnH2xtx5iO5iR2MVYQB6nBI=,39,30,149,149,1,2015-09-30,2015-11-28,0
4,yvj6zyBUaqdbUQSrKsrZ+xNDVM62knauSZJzakS9OW4=,39,30,149,149,1,2015-09-30,2015-11-21,0


In [6]:
# Per-transaction gap between the plan's list price and the amount actually paid
# (positive when less than the list price was paid, negative when more was paid).
transactions["paid_diff"] = transactions["plan_list_price"].sub(transactions["actual_amount_paid"])
transactions.head(10)

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,paid_diff
0,YyO+tlZtAXYXoZhNr3Vg3+dfVQvrBVGO8j1mfqe4ZHc=,41,30,129,129,1,2015-09-30,2015-11-01,0,0
1,AZtu6Wl0gPojrEQYB8Q3vBSmE2wnZ3hi1FbK1rQQ0A4=,41,30,149,149,1,2015-09-30,2015-10-31,0,0
2,UkDFI97Qb6+s2LWcijVVv4rMAsORbVDT2wNXF0aVbns=,41,30,129,129,1,2015-09-30,2016-04-27,0,0
3,M1C56ijxozNaGD0t2h68PnH2xtx5iO5iR2MVYQB6nBI=,39,30,149,149,1,2015-09-30,2015-11-28,0,0
4,yvj6zyBUaqdbUQSrKsrZ+xNDVM62knauSZJzakS9OW4=,39,30,149,149,1,2015-09-30,2015-11-21,0,0
5,KN7I82kjY0Tn76Ny95ncqBUVbO7b8AXrOPqZutKpxIM=,21,30,149,149,1,2015-09-30,2015-11-07,0,0
6,m5ptKif9BjdUghHXXomSezy5ohJiHm85PE13f/3kQaw=,39,30,149,149,1,2015-09-30,2015-11-28,0,0
7,uQxbyACsPOEkTIrv9jZgoGXelGBW81ZsSZKy9fhj5Z8=,39,30,149,149,1,2015-09-30,2015-11-25,0,0
8,LUPRfoE2r3WwVWhYO/TqQhjrL/qP6CO+/ORUlr7yNc0=,39,30,149,149,1,2015-09-30,2015-12-22,0,0
9,pMVjPLgVknaJYm9L0xUwxA2jmhpK7gfhGnsmmwX1mrE=,39,30,149,149,1,2015-09-30,2015-11-18,0,0


In [7]:
# Collapse the transaction log to one row per user (msno):
#   payment_method_id, is_auto_renew   -> most frequent value (first entry of value_counts)
#   msno                               -> number of transactions
#   payment_plan_days, plan_list_price,
#   actual_amount_paid, paid_diff      -> sum
#   transaction_date                   -> earliest
#   membership_expire_date, is_cancel  -> max
transactions_per_user = transactions.groupby(["msno"]).agg({
    "payment_method_id": lambda x: x.value_counts().index[0],
    "msno": 'count',
    "payment_plan_days": "sum",
    "plan_list_price": "sum",
    "actual_amount_paid": "sum",
    "is_auto_renew": lambda x: x.value_counts().index[0],
    "transaction_date": "min",
    "membership_expire_date": "max",
    "is_cancel": "max",
    "paid_diff": "sum",
})
transactions_per_user.head()

Unnamed: 0_level_0,is_cancel,payment_method_id,membership_expire_date,actual_amount_paid,transaction_date,is_auto_renew,plan_list_price,paid_diff,payment_plan_days,msno
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
+++FOrTS7ab3tIgIh8eWwX4FqRv8w/FoiOuyXsFvphY=,0,35,2016-09-14,0,2016-09-09,0,0,0,7,1
+++IZseRRiQS9aaSkH6cMYU6bGDcxUieAi/tH67sC5s=,0,38,2017-01-04,1788,2015-11-21,0,1788,0,410,1
+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=,0,41,2017-03-15,396,2016-11-16,1,396,0,120,4
+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,0,39,2017-03-19,2831,2015-01-31,1,2682,-149,543,19
+++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=,0,41,2017-03-26,3874,2015-01-26,1,3725,-149,750,26


In [10]:
# Subscription span in whole days: latest membership expiry minus earliest transaction date.
transactions_per_user["duration"] = (
    transactions_per_user["membership_expire_date"] - transactions_per_user["transaction_date"]
).dt.days
transactions_per_user.head()

Unnamed: 0_level_0,is_cancel,payment_method_id,membership_expire_date,actual_amount_paid,transaction_date,is_auto_renew,plan_list_price,paid_diff,payment_plan_days,msno,duration
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
+++FOrTS7ab3tIgIh8eWwX4FqRv8w/FoiOuyXsFvphY=,0,35,2016-09-14,0,2016-09-09,0,0,0,7,1,5
+++IZseRRiQS9aaSkH6cMYU6bGDcxUieAi/tH67sC5s=,0,38,2017-01-04,1788,2015-11-21,0,1788,0,410,1,410
+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=,0,41,2017-03-15,396,2016-11-16,1,396,0,120,4,119
+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,0,39,2017-03-19,2831,2015-01-31,1,2682,-149,543,19,778
+++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=,0,41,2017-03-26,3874,2015-01-26,1,3725,-149,750,26,790


In [11]:
# Copy the per-user transaction count (the "msno": 'count' aggregate) under a
# clearer name; the original "msno" column is deleted in a later cell.
transactions_per_user["msno_count"]=transactions_per_user["msno"]

In [12]:
# One-hot encode each user's most frequent payment method as method_<id> columns,
# replacing the original payment_method_id column.
payment_method_dummies = pd.get_dummies(transactions_per_user["payment_method_id"], prefix="method")
transactions_per_user = (
    transactions_per_user
    .drop(["payment_method_id"], axis=1)
    .join(payment_method_dummies)
)
transactions_per_user.head()

Unnamed: 0_level_0,is_cancel,membership_expire_date,actual_amount_paid,transaction_date,is_auto_renew,plan_list_price,paid_diff,payment_plan_days,msno,duration,...,method_32,method_33,method_34,method_35,method_36,method_37,method_38,method_39,method_40,method_41
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
+++FOrTS7ab3tIgIh8eWwX4FqRv8w/FoiOuyXsFvphY=,0,2016-09-14,0,2016-09-09,0,0,0,7,1,5,...,0,0,0,1,0,0,0,0,0,0
+++IZseRRiQS9aaSkH6cMYU6bGDcxUieAi/tH67sC5s=,0,2017-01-04,1788,2015-11-21,0,1788,0,410,1,410,...,0,0,0,0,0,0,1,0,0,0
+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=,0,2017-03-15,396,2016-11-16,1,396,0,120,4,119,...,0,0,0,0,0,0,0,0,0,1
+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,0,2017-03-19,2831,2015-01-31,1,2682,-149,543,19,778,...,0,0,0,0,0,0,0,1,0,0
+++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=,0,2017-03-26,3874,2015-01-26,1,3725,-149,750,26,790,...,0,0,0,0,0,0,0,0,0,1


In [13]:
# Remove the raw count column (already copied to msno_count) and the two date
# columns (already summarised by duration).
transactions_per_user.drop(
    ["msno", "transaction_date", "membership_expire_date"],
    axis=1,
    inplace=True,
)
transactions_per_user.head()

Unnamed: 0_level_0,is_cancel,actual_amount_paid,is_auto_renew,plan_list_price,paid_diff,payment_plan_days,duration,msno_count,method_1,method_2,...,method_32,method_33,method_34,method_35,method_36,method_37,method_38,method_39,method_40,method_41
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
+++FOrTS7ab3tIgIh8eWwX4FqRv8w/FoiOuyXsFvphY=,0,0,0,0,0,7,5,1,0,0,...,0,0,0,1,0,0,0,0,0,0
+++IZseRRiQS9aaSkH6cMYU6bGDcxUieAi/tH67sC5s=,0,1788,0,1788,0,410,410,1,0,0,...,0,0,0,0,0,0,1,0,0,0
+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=,0,396,1,396,0,120,119,4,0,0,...,0,0,0,0,0,0,0,0,0,1
+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,0,2831,1,2682,-149,543,778,19,0,0,...,0,0,0,0,0,0,0,1,0,0
+++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=,0,3874,1,3725,-149,750,790,26,0,0,...,0,0,0,0,0,0,0,0,0,1


In [14]:
# Index the training frame by msno so it can be joined with the per-user summary.
df_train = df_train.set_index(["msno"])

In [15]:
# Attach the per-user transaction summary to each training row (index-on-index left join).
df_train = df_train.join(transactions_per_user, how="left")
df_train.head()

Unnamed: 0_level_0,is_churn,city,bd,gender,registered_via,registration_init_time,expiration_date,is_cancel,actual_amount_paid,is_auto_renew,...,method_32,method_33,method_34,method_35,method_36,method_37,method_38,method_39,method_40,method_41
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
waLDQMmcOu2jLDaV1ddDkgCrB/jl6sD66Xzs0Vqax1Y=,1,18.0,36.0,female,9.0,2005-04-06,2017-09-07,0,149,0,...,0,0,0,0,0,0,1,0,0,0
QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ=,1,10.0,38.0,male,9.0,2005-04-07,2017-03-21,1,3458,1,...,0,0,0,0,0,0,0,1,0,0
fGwBva6hikQmTJzrbz/2Ezjm5Cth5jZUNvXigKK2AFA=,1,11.0,27.0,female,9.0,2005-10-16,2017-02-03,1,1492,1,...,0,0,0,0,0,0,0,1,0,0
mT5V8rEpa+8wuqi6x0DoVd3H5icMKkE9Prt49UlmK+4=,1,13.0,23.0,female,9.0,2005-11-02,2017-09-26,0,1788,0,...,0,0,0,0,0,0,1,0,0,0
XaPhtGLk/5UvvOYHcONTwsnH97P4eGECeq+BARGItRw=,1,3.0,27.0,male,9.0,2005-12-28,2017-09-27,0,3576,0,...,0,0,0,0,0,0,1,0,0,0


In [16]:
# Gender has a lot of nulls, so it is left out for now; the registration and
# expiration dates are not needed either. All remaining nulls are filled with 0.
df_train.drop(["gender", "expiration_date", "registration_init_time"], axis=1, inplace=True)
df_train.fillna(0, inplace=True)

In [17]:
# The raw transaction log and label frame are no longer needed; drop the
# references and force a collection to free RAM.
del transactions, train

import gc
gc.collect()

560

In [18]:
# Inspect the first 20 rows of the joined training frame after null filling.
df_train.head(20)

Unnamed: 0_level_0,is_churn,city,bd,registered_via,is_cancel,actual_amount_paid,is_auto_renew,plan_list_price,paid_diff,payment_plan_days,...,method_32,method_33,method_34,method_35,method_36,method_37,method_38,method_39,method_40,method_41
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
waLDQMmcOu2jLDaV1ddDkgCrB/jl6sD66Xzs0Vqax1Y=,1,18.0,36.0,9.0,0,149,0,149,0,37,...,0,0,0,0,0,0,1,0,0,0
QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ=,1,10.0,38.0,9.0,1,3458,1,3309,-149,663,...,0,0,0,0,0,0,0,1,0,0
fGwBva6hikQmTJzrbz/2Ezjm5Cth5jZUNvXigKK2AFA=,1,11.0,27.0,9.0,1,1492,1,1492,0,300,...,0,0,0,0,0,0,0,1,0,0
mT5V8rEpa+8wuqi6x0DoVd3H5icMKkE9Prt49UlmK+4=,1,13.0,23.0,9.0,0,1788,0,1788,0,820,...,0,0,0,0,0,0,1,0,0,0
XaPhtGLk/5UvvOYHcONTwsnH97P4eGECeq+BARGItRw=,1,3.0,27.0,9.0,0,3576,0,3576,0,785,...,0,0,0,0,0,0,1,0,0,0
GBy8qSz16X5iYWD+3CMxv/Hm6OPSrXBYtmbnlRtknW0=,1,6.0,23.0,9.0,1,3278,1,2384,-894,480,...,0,1,0,0,0,0,0,0,0,0
lYLh7TdkWpIoQs3i3o6mIjLH8/IEgMWP9r7OpsLX0Vo=,1,13.0,29.0,9.0,0,1770,0,1770,0,330,...,0,0,0,0,1,0,0,0,0,0
T0FF6lumjKcqEO0O+tUH2ytc+Kb9EkeaLzcVUiTr1aE=,1,11.0,22.0,9.0,1,3371,1,3222,-149,633,...,0,0,0,0,0,0,0,0,1,0
Nb1ZGEmagQeba5E+nQj8VlQoWl+8SFmLZu+Y8ytIamw=,1,18.0,22.0,9.0,0,3833,0,3683,-150,720,...,0,0,0,0,1,0,0,0,0,0
MkuWz0Nq6/Oq5fKqRddWL7oh2SLUSRe3/g+XmAWqW1Q=,1,11.0,30.0,9.0,0,596,0,596,0,127,...,0,0,0,0,0,0,1,0,0,0


In [19]:
# One-hot encode registered_via as registered_id_<value> columns, replacing the
# original column.
register_via_dummies = pd.get_dummies(df_train["registered_via"], prefix="registered_id")
df_train = (
    df_train
    .drop(["registered_via"], axis=1)
    .join(register_via_dummies)
)
df_train.head()

Unnamed: 0_level_0,is_churn,city,bd,is_cancel,actual_amount_paid,is_auto_renew,plan_list_price,paid_diff,payment_plan_days,duration,...,method_38,method_39,method_40,method_41,registered_id_0.0,registered_id_3.0,registered_id_4.0,registered_id_7.0,registered_id_9.0,registered_id_13.0
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
waLDQMmcOu2jLDaV1ddDkgCrB/jl6sD66Xzs0Vqax1Y=,1,18.0,36.0,0,149,0,149,0,37,98,...,1,0,0,0,0,0,0,0,1,0
QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ=,1,10.0,38.0,1,3458,1,3309,-149,663,780,...,0,1,0,0,0,0,0,0,1,0
fGwBva6hikQmTJzrbz/2Ezjm5Cth5jZUNvXigKK2AFA=,1,11.0,27.0,1,1492,1,1492,0,300,367,...,0,1,0,0,0,0,0,0,1,0
mT5V8rEpa+8wuqi6x0DoVd3H5icMKkE9Prt49UlmK+4=,1,13.0,23.0,0,1788,0,1788,0,820,413,...,1,0,0,0,0,0,0,0,1,0
XaPhtGLk/5UvvOYHcONTwsnH97P4eGECeq+BARGItRw=,1,3.0,27.0,0,3576,0,3576,0,785,787,...,1,0,0,0,0,0,0,0,1,0


In [20]:
# Utility function to report best scores
from operator import itemgetter

def report_scores(grid_scores, n_top=10):
    """Print the ``n_top`` best-scoring entries of a hyper-parameter search.

    Parameters
    ----------
    grid_scores : iterable
        Entries that are indexable (position 1 is used as the sort key) and
        expose ``mean_validation_score``, ``cv_validation_scores`` and
        ``parameters`` attributes.
    n_top : int, default 10
        Maximum number of entries to print.

    Returns
    -------
    None
        The report is written to standard output.
    """
    # The score is reported as a *negative* log loss, so larger (closer to
    # zero) is better. Sort descending so that rank 1 is the best model;
    # the previous ascending sort listed the worst models first.
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for rank, score in enumerate(top_scores, start=1):
        print("Model with rank: {0}".format(rank))
        print("Neg Log Loss score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

In [21]:
# Pull the label column out of the feature frame and keep it as its own Series.
df_class = df_train.pop("is_churn")

In [22]:
# Re-attach the label as the last column so the train/test split in the next
# cell carries is_churn along with the features.
df_train["is_churn"]= df_class

In [23]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; prefer model_selection and fall back only on old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# 50/50 hold-out split of the labelled frame (is_churn is still a column here);
# the fixed random_state keeps the split repeatable.
temp_train, temp_test = train_test_split(df_train, train_size=0.5, random_state=35)

In [24]:
# Separate the hold-out labels from the hold-out features.
y_test = temp_test.pop("is_churn")

In [25]:
# Remove the label again so df_train holds features only (the label lives in df_class).
del df_train["is_churn"]

In [28]:
# Confirm the feature frame no longer contains is_churn.
df_train.head()

Unnamed: 0_level_0,city,bd,is_cancel,actual_amount_paid,is_auto_renew,plan_list_price,paid_diff,payment_plan_days,duration,msno_count,...,method_38,method_39,method_40,method_41,registered_id_0.0,registered_id_3.0,registered_id_4.0,registered_id_7.0,registered_id_9.0,registered_id_13.0
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
waLDQMmcOu2jLDaV1ddDkgCrB/jl6sD66Xzs0Vqax1Y=,18.0,36.0,0,149,0,149,0,37,98,2,...,1,0,0,0,0,0,0,0,1,0
QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ=,10.0,38.0,1,3458,1,3309,-149,663,780,23,...,0,1,0,0,0,0,0,0,1,0
fGwBva6hikQmTJzrbz/2Ezjm5Cth5jZUNvXigKK2AFA=,11.0,27.0,1,1492,1,1492,0,300,367,10,...,0,1,0,0,0,0,0,0,1,0
mT5V8rEpa+8wuqi6x0DoVd3H5icMKkE9Prt49UlmK+4=,13.0,23.0,0,1788,0,1788,0,820,413,2,...,1,0,0,0,0,0,0,0,1,0
XaPhtGLk/5UvvOYHcONTwsnH97P4eGECeq+BARGItRw=,3.0,27.0,0,3576,0,3576,0,785,787,8,...,1,0,0,0,0,0,0,0,1,0


In [31]:
# Save the per-user transaction summary (indexed by msno) to a CSV in the working directory.
transactions_per_user.to_csv("transactions_summary_2.csv")

In [32]:
# Drop the in-memory summary to free RAM and force a collection.
# NOTE(review): a later cell joins `test` with transactions_per_user; after this
# deletion that name no longer exists unless the summary is reloaded first.
del transactions_per_user

import gc
gc.collect()

1461

In [33]:
# Preview the label Series (is_churn indexed by msno).
df_class.head()

msno
waLDQMmcOu2jLDaV1ddDkgCrB/jl6sD66Xzs0Vqax1Y=    1
QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ=    1
fGwBva6hikQmTJzrbz/2Ezjm5Cth5jZUNvXigKK2AFA=    1
mT5V8rEpa+8wuqi6x0DoVd3H5icMKkE9Prt49UlmK+4=    1
XaPhtGLk/5UvvOYHcONTwsnH97P4eGECeq+BARGItRw=    1
Name: is_churn, dtype: int8

In [35]:
# Persist the feature frame and the label Series as CSV files in the working directory.
df_train.to_csv("df_train.csv")
df_class.to_csv("df_class.csv")


In [36]:
from sklearn.neural_network import MLPClassifier

# Three hidden layers trained with L-BFGS. learning_rate_init and shuffle are only
# consulted by the 'sgd'/'adam' solvers, so they have no effect with 'lbfgs'; they
# are kept so the configuration is unchanged if the solver is switched later.
# random_state makes the weight initialisation repeatable between runs.
# (Original author's note on this line: "span = 20", "best 1".)
mlpc = MLPClassifier(hidden_layer_sizes=(100, 200, 100), activation='relu', solver='lbfgs',
                     alpha=0.005, learning_rate_init=0.001, shuffle=False, random_state=35)

# Fit on the training half only. temp_test is a subset of df_train, so fitting
# on the full df_train (as before) scored the model on rows it had already
# seen and made the report below meaningless.
holdout_fit_features = temp_train.drop("is_churn", axis=1)
holdout_fit_labels = temp_train["is_churn"]
mlpc.fit(holdout_fit_features, holdout_fit_labels)
predicted_class = mlpc.predict(temp_test)

#get the confusion matrix of the prediction
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn import metrics

# The report previously interpolated `logreg`, which is never defined in this
# notebook; the classifier being evaluated is `mlpc`.
print("Classification report for classifier %s:\n%s\n"
      % (mlpc, metrics.classification_report(y_test, predicted_class)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_test, predicted_class))



MemoryError: 

In [30]:
# Force a garbage collection pass to reclaim memory; the cell output is the
# value returned by gc.collect().
import gc
gc.collect()


307

In [None]:
# Persist the best estimator of a hyper-parameter search to 'multilayer1.pkl'.
# NOTE(review): `random_search` is not defined in any cell of this notebook as
# shown — confirm where it is created before running this cell.
from sklearn.externals import joblib
joblib.dump(random_search.best_estimator_, 'multilayer1.pkl')

In [None]:
# Print the top-scoring parameter settings of the search.
# NOTE(review): depends on `random_search`, which no visible cell defines — confirm.
report_scores(random_search.grid_scores_)

In [None]:
# Index the submission frame by msno so features can be joined onto it.
test = test.set_index(["msno"])

In [None]:
# Drop the is_churn column that came with the sample submission file; it is
# recreated from the model's predictions in a later cell.
del test["is_churn"]
# transactions_per_user was deleted earlier to free RAM, so referencing it here
# raised NameError. Reload the summary that was written to
# transactions_summary_2.csv (indexed by msno) before joining.
transactions_per_user = pd.read_csv("transactions_summary_2.csv", index_col="msno")
test = test.join(transactions_per_user)

In [None]:
# Index the member profiles by msno for the index-on-index join below.
members = members.set_index(["msno"])

In [None]:
# Attach member profile columns to each test user (left join on the msno index).
test = test.join(members, how="left")

In [None]:
# Remove the same three columns that were removed from the training frame.
test.drop(["registration_init_time", "expiration_date", "gender"], axis=1, inplace=True)
test.head()

In [None]:
# Keep only the listed feature columns, in this order.
# NOTE(review): this list keeps the raw registered_via column and omits the
# method_* and registered_id_* dummy columns that df_train contains — confirm
# that the estimator used for prediction was trained on exactly this layout.
test= test[["city","bd","registered_via","is_auto_renew","plan_list_price","paid_diff","actual_amount_paid","is_cancel","payment_plan_days","duration","msno_count"]]
test.head()

In [None]:
# Fill missing feature values with 0 (as was done for the training frame), then
# store column 1 of predict_proba's output in a new "predict" column.
# NOTE(review): `random_search` is not defined in any visible cell — confirm
# which fitted estimator is meant here.
test.fillna(0,inplace=True)
test["predict"]=random_search.best_estimator_.predict_proba(test)[:,1]


In [None]:
# Build the two-column submission frame (msno, is_churn) with a plain 0..n-1 index.
test["msno"] = test.index
# Use bracket access for the column: attribute access (test.predict) would
# silently resolve to something else if DataFrame ever gained a `predict` attribute.
test["is_churn"] = test["predict"]
# The previous version also evaluated test.head(40) mid-cell (result discarded)
# and built an empty frame that was immediately overwritten; both were dead code.
submission = test[["msno", "is_churn"]]
submission.index = range(len(submission))
submission

In [None]:
# Write the submission to CSV without the integer index, leaving only msno and is_churn.
submission.to_csv("submission3.csv",index=False)