# **IMPORT DATA**

In [1]:
# Mount Google Drive so the files stored there can be read by this notebook.

from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
pd.set_option("display.max_columns", 160)  # show up to 160 columns when a frame is displayed
pd.set_option("display.max_rows", 160)  # show up to 160 rows before pandas truncates the display

from sklearn.ensemble import RandomForestRegressor  # modeling 
from sklearn.metrics import make_scorer, mean_squared_error  #scoring
from sklearn.model_selection import train_test_split  
from sklearn.model_selection import cross_val_score

import warnings
# NOTE(review): this silences every Python warning for the rest of the session,
# including deprecation warnings from pandas and the modelling libraries.
# Consider narrowing the filter to specific categories.
warnings.simplefilter('ignore')

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [27]:
# Folder on the mounted drive that holds the competition CSV files.
path = '/content/drive/My Drive/Tunisian_Fraud_Detection/SUPCOM_HACK_data/'

# `path` already ends with a slash, so file names are appended directly.
train = pd.read_csv(f'{path}train.csv')
test = pd.read_csv(f'{path}test.csv')
Submission = pd.read_csv(f'{path}SampleSubmission.csv')

In [28]:
# (rows, columns) of the train frame followed by the test frame.
tuple(frame.shape for frame in (train, test))

((21295, 121), (7517, 120))

In [29]:
# Keep the test ids as loaded; they become the id column of the submission.
testID = test['id']


# **Feature Engineering**

In [5]:
# Stack train on top of test so the feature engineering below is applied to
# both sets in one pass.
data = pd.concat(objs=[train, test], axis='index')

In [6]:
# Label encoder : 

from sklearn.preprocessing import LabelEncoder
data['CTR_CATEGO_X'] = LabelEncoder().fit_transform(data['CTR_CATEGO_X'])
data['id'] = LabelEncoder().fit_transform(data['id'])

In [7]:
# Columns whose variance is exactly zero are constant and carry no signal.
# numeric_only=True keeps this working if a non-numeric column is present
# (recent pandas raises on such columns instead of skipping them), and taking
# the labels from the variance Series itself avoids indexing `data` with a
# boolean mask that might not cover every column.
column_variances = data.var(numeric_only=True)
col_todrop = column_variances[column_variances == 0.0].index

In [8]:
# Remove the constant columns listed in col_todrop. The columns= keyword
# replaces the positional axis argument, which pandas 2.0 no longer accepts,
# and rebinding `data` (instead of inplace=True) keeps the cell chainable.
data = data.drop(columns=col_todrop)

In [9]:
# Per-BCT_CODBUR group means, broadcast back onto every row as new features.
# Keys are the new column names, values are the columns being averaged.
group_mean_features = {
    'import_mean': 'SND_MNTPAY_I',
    'REVENUE_mean': 'TVA_CAFEXO',
    'CRD_mean': 'TVA_CRDINI',
}
for new_col, source_col in group_mean_features.items():
    a = data.groupby('BCT_CODBUR')[source_col].mean()
    data[new_col] = data['BCT_CODBUR'].map(a)

In [10]:
# Split the combined frame back into its train and test parts. The boundary is
# the number of training rows (train was stacked first into `data`), so the
# row count no longer has to be hard-coded. .copy() gives each part its own
# data, so the column assignments and drops that follow do not operate on a
# slice of `data`.
n_train = len(train)
train = data.iloc[:n_train].copy()
test = data.iloc[n_train:].copy()

In [11]:
# The test part has no usable target, so remove that column. The columns=
# keyword replaces the positional axis argument (rejected by pandas >= 2.0),
# and rebinding avoids an in-place drop on a slice.
test = test.drop(columns='target')
cols = test.columns

In [12]:
# Columns to store as int16. Missing values are replaced with -1 first,
# because a plain integer dtype cannot hold NaN.
To_convert_to_int = [
    'CTR_OFODEP', 'CTR_OFODET', 'CTR_OBLAUT', 'CTR_OBLASS', 'CTR_ODTIMB',
    'CTR_OBLTCL', 'CTR_OBLTHO', 'CTR_OBLDLI', 'CTR_OBLTVI',
]

for frame in (train, test):
    frame[To_convert_to_int] = frame[To_convert_to_int].fillna(-1).astype('int16')

In [13]:
# Give both frames a fresh 0..n-1 index, discarding the old one.
train, test = (frame.reset_index(drop=True) for frame in (train, test))

In [14]:
# Separate the label from the features.
target = train.target

# testID is intentionally NOT reassigned from test.id here: by this point the
# `id` column has been label-encoded to integers, so reassigning would put the
# encoded values into the submission. The ids captured right after test.csv
# was loaded are kept instead.

# `id` is an identifier and `target` is the label, so neither is a feature.
# columns= replaces the positional axis argument (rejected by pandas >= 2.0).
train = train.drop(columns=['id', 'target'])
test = test.drop(columns=['id'])

# **LGBM**

In [16]:
import lightgbm as lgb

# Columns declared as categorical (rather than numeric) to the model.
categ_features = [
    'BCT_CODBUR', 'CTR_MATFIS', 'FJU_CODFJU', 'CTR_CESSAT', 'ACT_CODACT',
    'CTR_OBLDIR', 'CTR_OBLACP', 'CTR_OBLRES', 'CTR_OBLFOP', 'CTR_OBLTFP',
    'CTR_OBLDCO', 'CTR_OBLTVA', 'CTR_OFODEP', 'CTR_OFODET', 'CTR_OBLAUT',
    'CTR_OBLASS', 'CTR_ODTIMB', 'CTR_OBLTCL', 'CTR_OBLTHO', 'CTR_OBLDLI',
    'CTR_OBLTVI',
]

# LightGBM training set: the train features with `target` as the label.
trn_data = lgb.Dataset(train, label=target, categorical_feature=categ_features)

In [17]:
# Settings passed to lgb.train: a regression objective evaluated with RMSE.
params = {
    'colsample_bytree': 0.85,
    'learning_rate': 0.03,
    'max_depth': 8,
    'n_estimators': 3000,
    'num_leaves': 150,
    'silent': False,
    'metric': 'rmse',
    'objective': 'regression',
}

In [18]:
# Train model with training data
# Fit the LightGBM booster on the full training set (no validation split).
model = lgb.train(params=params, train_set=trn_data)

## **Predict - LGBM**

In [24]:
# Predict on the test features, then clamp the values to the [0, 100] range.
lgbm_raw = model.predict(test, num_iteration=model.best_iteration)
LGBM_PREDS = np.clip(lgbm_raw, 0, 100)

# **XGBOOST**

In [19]:
%%time
import xgboost as xgb 

xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.8,subsample=0.9,min_child_weight=4, learning_rate = 0.03,
                max_depth = 6,   n_estimators = 2000 ,silent=False)
xg_reg.fit(train,target )

CPU times: user 4min 10s, sys: 147 ms, total: 4min 10s
Wall time: 4min 10s


## **Predict - XGBoost**

In [23]:
# Predict on the test features, then clamp the values to the [0, 100] range.
xgb_raw = xg_reg.predict(test)
XGB_PREDS = np.clip(xgb_raw, 0, 100)

In [None]:
# Sanity check: number of XGBoost predictions produced.
XGB_PREDS.shape[0]

7517

# **Catboost**

In [20]:
!pip install catboost -q

[K     |████████████████████████████████| 66.1MB 66kB/s 
[?25h

In [21]:
from catboost import CatBoostRegressor , Pool

Catb_model = CatBoostRegressor(iterations=2000, learning_rate=0.079651, random_seed=42)

# Wrap features and label in a Pool so the categorical columns are declared.
train_pool = Pool(train, target, cat_features=categ_features)

# verbose=100 prints the training loss every 100 iterations.
Catb_model.fit(train_pool, verbose=100)

0:	learn: 6.9849972	total: 150ms	remaining: 5m
100:	learn: 5.7283631	total: 9.69s	remaining: 3m 2s
200:	learn: 5.5649509	total: 19.3s	remaining: 2m 52s
300:	learn: 5.4519558	total: 29s	remaining: 2m 43s
400:	learn: 5.3528032	total: 38.8s	remaining: 2m 34s
500:	learn: 5.2675920	total: 48.5s	remaining: 2m 25s
600:	learn: 5.1898382	total: 58.3s	remaining: 2m 15s
700:	learn: 5.1253117	total: 1m 8s	remaining: 2m 6s
800:	learn: 5.0673777	total: 1m 17s	remaining: 1m 56s
900:	learn: 5.0157026	total: 1m 27s	remaining: 1m 46s
1000:	learn: 4.9633383	total: 1m 37s	remaining: 1m 36s
1100:	learn: 4.9133126	total: 1m 46s	remaining: 1m 27s
1200:	learn: 4.8607931	total: 1m 56s	remaining: 1m 17s
1300:	learn: 4.8089150	total: 2m 6s	remaining: 1m 7s
1400:	learn: 4.7572023	total: 2m 16s	remaining: 58.2s
1500:	learn: 4.7117699	total: 2m 25s	remaining: 48.5s
1600:	learn: 4.6718346	total: 2m 35s	remaining: 38.8s
1700:	learn: 4.6321753	total: 2m 45s	remaining: 29.1s
1800:	learn: 4.5926936	total: 2m 55s	remaini

<catboost.core.CatBoostRegressor at 0x7f6ab1ff4d30>

## **Predict - CatBoost**

In [22]:
# Predict on the test features, then clamp the values to the [0, 100] range.
catb_raw = Catb_model.predict(test)
Catb_preds = np.clip(catb_raw, 0, 100)

In [None]:
# Sanity check: number of CatBoost predictions produced.
Catb_preds.shape[0]

7517

# **Create a submission**

In [34]:
# Weighted average of the three models' predictions (weights add up to 1).
W_CATB, W_XGB, W_LGBM = 0.4, 0.08, 0.52
Blend_preds = W_CATB*Catb_preds + W_XGB*XGB_PREDS + W_LGBM*LGBM_PREDS

In [35]:
# Assemble the submission: one row per test id with its blended prediction,
# clamped once more to the [0, 100] range.
Submission_BLEND = pd.DataFrame({
    'client_id': testID,
    'target': np.clip(Blend_preds, 0, 100),
})

In [36]:
# Quick look at the range and centre of the blended predictions.
blend_target = Submission_BLEND['target']
print('min' , blend_target.min())
print('max' , blend_target.max())
print('mean : ' , blend_target.mean())

min 0.0
max 22.738951037376097
mean :  11.804381289256543


In [37]:
# Write the submission to the working directory without the row index.
Submission_BLEND.to_csv(path_or_buf='Submission_BLEND_OK_ALL_DATA__2.csv', index=False)

In [38]:
# Show the finished submission frame (client_id and target columns).
Submission_BLEND

Unnamed: 0,client_id,target
0,test_id1120,16.663361
1,test_id1680,17.160534
2,test_id1063,10.202504
3,test_id3731,16.033032
4,test_id9766,15.404991
...,...,...
7512,test_id1497,14.292446
7513,test_id10025,9.251444
7514,test_id1045,16.618685
7515,test_id10695,12.343702
