In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier

In [2]:
cl_train = pd.read_csv("client_train.csv",low_memory=False)
inv_train = pd.read_csv("train_invoice.csv",low_memory=False)
cl_test = pd.read_csv("client_test.csv",low_memory=False)
inv_test = pd.read_csv("test_invoice.csv",low_memory=False)

In [3]:
def featengg(cl, inv):
    cl['client_catg'] = cl['client_catg'].astype('category')
    cl['disrict'] = cl['disrict'].astype('category')
    cl['region'] = cl['region'].astype('category')
    cl['region_group'] = cl['region'].apply(lambda x: 100 if x<100 else 300 if x>300 else 200)
    cl['creation_date'] = pd.to_datetime(cl['creation_date'])
    inv['counter_type'] = inv['counter_type'].map({"ELEC":1,"GAZ":0})
    inv['invoice_date'] = pd.to_datetime(inv['invoice_date'], dayfirst=True)
    inv['invoice_month'] = inv['invoice_date'].dt.month
    inv['invoice_year'] = inv['invoice_date'].dt.year
    inv['index_diff'] = inv['new_index'] - inv['old_index']
    return cl, inv

In [4]:
cl_train, inv_train = featengg(cl_train, inv_train)
cl_test, inv_test = featengg(cl_test, inv_test)

  cl['creation_date'] = pd.to_datetime(cl['creation_date'])
  inv['invoice_date'] = pd.to_datetime(inv['invoice_date'], dayfirst=True)
  cl['creation_date'] = pd.to_datetime(cl['creation_date'])
  inv['invoice_date'] = pd.to_datetime(inv['invoice_date'], dayfirst=True)


In [5]:
def agg_feature(invoice, client_df, agg_stat):
    full_df = pd.merge(client_df, invoice, on='client_id', how='left')
    return full_df

In [6]:
agg_stat_columns = ['tarif_type','counter_number','counter_statue','counter_code','reading_remarque','consommation_level_1','consommation_level_2',
'consommation_level_3', 'consommation_level_4','old_index','new_index','months_number','counter_type', 'invoice_month','invoice_year','index_diff']

train = agg_feature(inv_train, cl_train, agg_stat_columns)
test = agg_feature(inv_test, cl_test, agg_stat_columns)

In [7]:
train

Unnamed: 0,disrict,client_id,client_catg,region,creation_date,target,region_group,invoice_date,tarif_type,counter_number,...,consommation_level_2,consommation_level_3,consommation_level_4,old_index,new_index,months_number,counter_type,invoice_month,invoice_year,index_diff
0,60.0,train_Client_0,11.0,101.0,1994-12-31,0.0,200,2014-03-24,11,1335667,...,0,0,0,14302,14384,4,1,3,2014,82
1,60.0,train_Client_0,11.0,101.0,1994-12-31,0.0,200,2013-03-29,11,1335667,...,184,0,0,12294,13678,4,1,3,2013,1384
2,60.0,train_Client_0,11.0,101.0,1994-12-31,0.0,200,2015-03-23,11,1335667,...,0,0,0,14624,14747,4,1,3,2015,123
3,60.0,train_Client_0,11.0,101.0,1994-12-31,0.0,200,2015-07-13,11,1335667,...,0,0,0,14747,14849,4,1,7,2015,102
4,60.0,train_Client_0,11.0,101.0,1994-12-31,0.0,200,2016-11-17,11,1335667,...,0,0,0,15066,15638,12,1,11,2016,572
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3279690,63.0,train_Client_67499,11.0,313.0,2010-09-15,0.0,300,2018-06-25,11,6301121,...,0,0,0,143382,143784,4,1,6,2018,402
3279691,63.0,train_Client_67499,11.0,313.0,2010-09-15,0.0,300,2017-02-28,11,6301121,...,0,0,0,142871,143042,4,1,2,2017,171
3279692,63.0,train_Client_67499,11.0,313.0,2010-09-15,0.0,300,2016-01-11,11,6301121,...,0,0,0,142820,142871,2,1,1,2016,51
3279693,63.0,train_Client_67499,11.0,313.0,2010-09-15,0.0,300,2015-03-11,11,6301121,...,0,0,0,141762,142200,4,1,3,2015,438


In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3279695 entries, 0 to 3279694
Data columns (total 25 columns):
 #   Column                Dtype         
---  ------                -----         
 0   disrict               category      
 1   client_id             object        
 2   client_catg           category      
 3   region                category      
 4   creation_date         datetime64[ns]
 5   target                float64       
 6   region_group          int64         
 7   invoice_date          datetime64[ns]
 8   tarif_type            int64         
 9   counter_number        int64         
 10  counter_statue        object        
 11  counter_code          int64         
 12  reading_remarque      int64         
 13  counter_coefficient   int64         
 14  consommation_level_1  int64         
 15  consommation_level_2  int64         
 16  consommation_level_3  int64         
 17  consommation_level_4  int64         
 18  old_index             int64         
 19  

In [9]:
X=train.drop(['client_id', 'creation_date', 'target', 'invoice_date', 'counter_statue'], axis=1)
y=train['target']

In [10]:
test

Unnamed: 0,disrict,client_id,client_catg,region,creation_date,region_group,invoice_date,tarif_type,counter_number,counter_statue,...,consommation_level_2,consommation_level_3,consommation_level_4,old_index,new_index,months_number,counter_type,invoice_month,invoice_year,index_diff
0,62,train_Client_675,11,301,2017-04-22,300,2018-03-14,11,2165702163000,0,...,0,0,0,211,818,4,1,3,2018,607
1,62,train_Client_675,11,301,2017-04-22,300,2017-11-14,11,2165702163000,0,...,0,0,0,209,211,4,1,11,2017,2
2,62,train_Client_675,11,301,2017-04-22,300,2019-03-14,11,2165702163000,0,...,0,0,0,1706,2395,4,1,3,2019,689
3,62,train_Client_675,11,301,2017-04-22,300,2018-10-07,11,2165702163000,0,...,0,0,0,818,1030,4,1,10,2018,212
4,62,train_Client_675,11,301,2017-04-22,300,2018-11-15,11,2165702163000,0,...,0,0,0,1030,1706,4,1,11,2018,676
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1197049,60,train_Client_99998,11,101,1993-12-22,200,2005-08-19,10,1253571,0,...,135,0,0,3197,3732,8,1,8,2005,535
1197050,60,train_Client_99998,11,101,1993-12-22,200,2005-12-19,10,1253571,0,...,6,0,0,3732,3938,4,1,12,2005,206
1197051,60,train_Client_99999,11,101,1986-02-18,200,1996-09-25,11,560948,0,...,0,0,0,13884,14143,4,1,9,1996,259
1197052,60,train_Client_99999,11,101,1986-02-18,200,1996-05-28,11,560948,0,...,0,0,0,13281,13884,4,1,5,1996,603


In [11]:

x_test=test.drop(['client_id', 'creation_date', 'invoice_date', 'counter_statue'], axis=1)

In [12]:
model = LGBMClassifier(random_state=42, n_estimators=800,num_leaves=450, max_depth=60,learning_rate=0.007)
model.fit(X,y)

[LightGBM] [Info] Number of positive: 260840, number of negative: 3018855
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2247
[LightGBM] [Info] Number of data points in the train set: 3279695, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079532 -> initscore=-2.448726
[LightGBM] [Info] Start training from score -2.448726


In [13]:
pred=model.predict(x_test)

In [14]:
test['target']=pred
test

Unnamed: 0,disrict,client_id,client_catg,region,creation_date,region_group,invoice_date,tarif_type,counter_number,counter_statue,...,consommation_level_3,consommation_level_4,old_index,new_index,months_number,counter_type,invoice_month,invoice_year,index_diff,target
0,62,train_Client_675,11,301,2017-04-22,300,2018-03-14,11,2165702163000,0,...,0,0,211,818,4,1,3,2018,607,0.0
1,62,train_Client_675,11,301,2017-04-22,300,2017-11-14,11,2165702163000,0,...,0,0,209,211,4,1,11,2017,2,0.0
2,62,train_Client_675,11,301,2017-04-22,300,2019-03-14,11,2165702163000,0,...,0,0,1706,2395,4,1,3,2019,689,0.0
3,62,train_Client_675,11,301,2017-04-22,300,2018-10-07,11,2165702163000,0,...,0,0,818,1030,4,1,10,2018,212,0.0
4,62,train_Client_675,11,301,2017-04-22,300,2018-11-15,11,2165702163000,0,...,0,0,1030,1706,4,1,11,2018,676,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1197049,60,train_Client_99998,11,101,1993-12-22,200,2005-08-19,10,1253571,0,...,0,0,3197,3732,8,1,8,2005,535,0.0
1197050,60,train_Client_99998,11,101,1993-12-22,200,2005-12-19,10,1253571,0,...,0,0,3732,3938,4,1,12,2005,206,0.0
1197051,60,train_Client_99999,11,101,1986-02-18,200,1996-09-25,11,560948,0,...,0,0,13884,14143,4,1,9,1996,259,0.0
1197052,60,train_Client_99999,11,101,1986-02-18,200,1996-05-28,11,560948,0,...,0,0,13281,13884,4,1,5,1996,603,0.0


In [15]:
pred


array([0., 0., 0., ..., 0., 0., 0.])

In [16]:
submission = pd.DataFrame({
        "client_id": test["client_id"],
        "target": pred  })
submission.to_csv('submissionlast.csv', index=False)