In [1]:
import pandas as pd
import pickle
from joblib import load

from flask import Flask, jsonify, request, jsonify, render_template
import json

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split

from imblearn.under_sampling  import RandomUnderSampler

from lightgbm import LGBMClassifier


In [2]:
# Load the raw application tables (training and test sets) from the working directory.
data_train, data_test = (
    pd.read_csv(csv_name) for csv_name in ("app_train.csv", "app_test.csv")
)

In [3]:
# Feature engineering function

def features_engineering(data_train, data_test):
    """Encode the categorical columns of the train/test sets and align their columns.

    Steps, in order:

    1. Label-encode every ``object`` column of ``data_train`` that has at most
       two distinct values (a missing value counts as one), with an encoder
       fitted on the training column only.
    2. One-hot encode the remaining categorical columns with ``pd.get_dummies``.
    3. Keep only the columns present in both frames (inner alignment) and
       re-attach ``TARGET`` to the training frame when it was provided.

    Parameters
    ----------
    data_train : pandas.DataFrame
        Training set. The ``TARGET`` column is optional.
    data_test : pandas.DataFrame
        Test set.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The encoded train and test frames sharing the same feature columns;
        the train frame additionally carries ``TARGET`` if it had one.

    Notes
    -----
    The label-encoding step assigns into the frames passed by the caller, so
    ``data_train`` and ``data_test`` are modified in place for those columns.
    """
    #############################################
    # LABEL ENCODING
    #############################################
    for col in data_train:
        if data_train[col].dtype != 'object':
            continue
        # Only binary categoricals are label encoded; the rest is one-hot
        # encoded below.
        if data_train[col].nunique(dropna=False) > 2:
            continue

        # A fresh encoder per column, fitted on the training values only.
        encoder = LabelEncoder().fit(data_train[col])
        data_train[col] = encoder.transform(data_train[col])

        # A column missing from the test set is dropped by the inner
        # alignment below, so there is nothing to encode on that side.
        if col in data_test:
            data_test[col] = encoder.transform(data_test[col])

    ############################################
    # ONE HOT ENCODING
    ############################################
    data_train = pd.get_dummies(data_train)
    data_test = pd.get_dummies(data_test)

    # The inner alignment removes TARGET (absent from the test set), so keep
    # it aside first. A training frame without TARGET is accepted as-is.
    train_labels = data_train['TARGET'] if 'TARGET' in data_train else None

    # Keep only the columns present in both dataframes.
    data_train, data_test = data_train.align(data_test, join='inner', axis=1)

    if train_labels is not None:
        data_train['TARGET'] = train_labels

    return data_train, data_test




# Preprocessing function
def preprocesseur(df_train, df_test):
    """Impute missing values and scale every feature, fitting on the training set only.

    A median ``SimpleImputer`` followed by a ``MinMaxScaler`` with range
    (0, 1) are fitted on the training features and then applied to both sets.
    Because the scaler is fitted on train only, test values can fall slightly
    outside [0, 1].

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training features; a ``TARGET`` column, if present, is dropped first.
    df_test : pandas.DataFrame
        Test features. When it is a DataFrame containing every training
        feature, its columns are put in the training order before transforming.

    Returns
    -------
    tuple
        ``(train, test)`` as returned by the scaler's ``transform``.
    """
    # The target is not a feature: remove it from the training data if present.
    if "TARGET" in df_train:
        train = df_train.drop(columns=["TARGET"])
    else:
        train = df_train.copy()

    # Feature names, in the order the transformers are fitted on.
    features = list(train.columns)

    # Present the test columns in the same order as the training features so
    # the transformers never pair a test column with another column's
    # statistics. If some feature is missing, the frame is passed through
    # untouched and the transformer reports the mismatch itself.
    if isinstance(df_test, pd.DataFrame) and set(features) <= set(df_test.columns):
        df_test = df_test[features]

    # Median imputation of missing values
    imputer = SimpleImputer(strategy='median')
    train = imputer.fit_transform(train)
    test = imputer.transform(df_test)

    # Scale each feature to 0-1
    scaler = MinMaxScaler(feature_range=(0, 1))
    train = scaler.fit_transform(train)
    test = scaler.transform(test)

    return train, test




# Full copy of the raw training frame.
# NOTE(review): no visible cell reads this module-level `target`
# (data_resampler receives its own `target` argument) — confirm before removing.
target=data_train.copy()

def data_resampler(df_train, target, random_state=42):
    """Randomly under-sample the training data against its ``TARGET`` labels.

    Parameters
    ----------
    df_train : array-like
        Feature matrix to resample.
    target : pandas.DataFrame
        Frame whose ``"TARGET"`` column holds the labels.
        (Presumably row-aligned with ``df_train`` — verify against the caller.)
    random_state : int or None, default 42
        Seed passed to ``RandomUnderSampler`` so the same rows are selected on
        every run. Pass ``None`` for an unseeded draw.

    Returns
    -------
    tuple
        ``(X_rsp, y_rsp)``, the resampled features and labels.
    """
    rsp = RandomUnderSampler(random_state=random_state)
    X_rsp, y_rsp = rsp.fit_resample(df_train, target["TARGET"])

    return X_rsp, y_rsp



def entrainement_LightBoost(X, y):
    """Fit an ``LGBMClassifier`` with the tuned hyper-parameters and return it.

    Parameters
    ----------
    X : array-like
        Training features.
    y : array-like
        Training labels.

    Returns
    -------
    LGBMClassifier
        The fitted classifier.
    """
    # Configuration of the best iteration found (n_estimators optimised to 600).
    # GPU training (device='gpu') is intentionally left disabled.
    # NOTE(review): LightGBM's documentation states that row subsampling is
    # only active when subsample_freq > 0, which is not set here — confirm
    # that `subsample` has the intended effect.
    tuned_params = dict(
        colsample_bytree=0.600170715692459,
        learning_rate=0.02975841167356727,
        max_depth=7,
        n_estimators=600,
        reg_lambda=2.156064880286573,
        subsample=0.629809210489369,
    )

    classifier = LGBMClassifier(**tuned_params)
    classifier.fit(X, y)
    return classifier
    

In [4]:

def entrainement_knn(df):
    """Fit a 10-nearest-neighbours index on ``df`` and return the fitted estimator."""
    print("En cours...")

    neighbours_index = NearestNeighbors(n_neighbors=10, algorithm='auto')
    neighbours_index.fit(df)
    return neighbours_index


# Placeholders that become module-level globals once the API is initialised;
# several API functions read them.
train = test = model = None

# Client IDs (first 50 rows of the test set) used by the API.
id_client = pd.DataFrame(data_test['SK_ID_CURR'][:50].values)

# Silence every warning for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

In [5]:
## Prepare the data
df_train, df_test = features_engineering(data_train, data_test)

# Remove the "Unnamed: 0" column from both frames
# (presumably a row index saved into the CSV files — verify).
df_train = df_train.drop(columns="Unnamed: 0")
df_test = df_test.drop(columns="Unnamed: 0")

print("Features engineering done")

Features engineering done


In [6]:
# Impute and scale the engineered features.
# NOTE(review): df_train / df_test are rebound to the processed outputs, so
# re-running this cell alone feeds already processed data back into
# preprocesseur; re-run the feature-engineering cell first.
df_train, df_test = preprocesseur(df_train, df_test)

In [7]:
# Under-sample the processed training features; the labels are read from the
# TARGET column of data_train.
X, y = data_resampler(df_train, data_train)
print("Resampling done")


Resampling done


In [8]:
# Fill the `train` / `test` placeholders with copies of the processed feature sets.
train = df_train.copy()
test=df_test.copy()

In [9]:
# Fit the LightGBM classifier on the resampled training data.
clf_lgbm= entrainement_LightBoost(X, y)
print("Training LightBoost done")

Training LightBoost done


In [10]:
# Persist the trained classifier; the context manager guarantees the file is
# flushed and closed once the dump is done.
with open('clf_lgbm.pkl', 'wb') as fh:
    pickle.dump(clf_lgbm, fh)

In [11]:
# NOTE(review): second definition of entrainement_knn, with the same body as the
# one in an earlier cell; this one rebinds the name when the cells run in order.
def entrainement_knn(df):
    """Fit a 10-nearest-neighbours index on ``df`` and return the fitted estimator."""

    print("En cours...")
    knn = NearestNeighbors(n_neighbors=10, algorithm='auto').fit(df)

    return knn 

In [14]:
# Fit the nearest-neighbours index on the client data.
# NOTE(review): `data_client` is only assigned in a later cell
# (`data_client = test`), so on a fresh kernel run top to bottom this raises
# NameError.
knn=entrainement_knn(data_client)

En cours...


In [15]:
# Persist the fitted nearest-neighbours index; the context manager closes the
# file once the dump is done.
with open('knn.pkl', 'wb') as fh:
    pickle.dump(knn, fh)

In [16]:
# Reload the nearest-neighbours index. pickle.load can execute arbitrary code:
# only load files produced by this notebook.
with open('knn.pkl', 'rb') as fh:
    pickled_knn = pickle.load(fh)

In [18]:
# Query the reloaded index with data_client, the same variable it was fitted
# on, which is why the first distance of each row in the output is ~0.
distances, indices = pickled_knn.kneighbors(data_client)

In [19]:
# Display the neighbour distances (one row per client, 10 neighbours each).
distances

array([[8.42936970e-08, 2.59287451e+00, 3.06330609e+00, 3.22660550e+00,
        3.32752856e+00, 3.32837667e+00, 3.33338760e+00, 3.33646350e+00,
        3.34633217e+00, 3.35267862e+00],
       [8.42936970e-08, 2.36190426e+00, 2.60694108e+00, 2.76236126e+00,
        2.97466805e+00, 2.97576447e+00, 3.04248572e+00, 3.10864689e+00,
        3.24461754e+00, 3.33728310e+00],
       [8.42936970e-08, 3.27418180e+00, 3.28418272e+00, 3.39216220e+00,
        3.39350097e+00, 3.46019769e+00, 3.53058578e+00, 3.53109150e+00,
        3.58738800e+00, 3.68420328e+00],
       [0.00000000e+00, 2.93046608e+00, 3.36720198e+00, 3.38635841e+00,
        3.50418903e+00, 3.53430777e+00, 3.63002012e+00, 3.64383377e+00,
        3.69338972e+00, 3.79023554e+00],
       [1.19209290e-07, 2.63626590e+00, 2.93091285e+00, 2.97576447e+00,
        3.10198457e+00, 3.15121896e+00, 3.16006354e+00, 3.16457044e+00,
        3.18451489e+00, 3.35044823e+00],
       [0.00000000e+00, 3.44974002e+00, 3.72479916e+00, 3.85538897e+00,
   

In [20]:
 id = 100001
# NOTE(review): `id` shadows the Python builtin of the same name, and the line
# above carries a stray leading space.

print("Analyse data_test :")
print(data_test.shape)
# Row(s) of the test set whose SK_ID_CURR equals the chosen client id.
print(data_test[data_test["SK_ID_CURR"] == int(id)])
     
# Index labels of the matching rows.
index = data_test[data_test["SK_ID_CURR"] == int(id)].index.values

# index[0] raises IndexError when no row matches the id.
print(index[0])
# Prints the processed test set held in `test`.
print(test)    

Analyse data_test :
(50, 122)
   Unnamed: 0  SK_ID_CURR  NAME_CONTRACT_TYPE  CODE_GENDER  FLAG_OWN_CAR  \
0           0      100001                   0            0             0   

   FLAG_OWN_REALTY  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \
0                1             0          135000.0    568800.0      20560.5   

   ...  FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21  \
0  ...                 0                0                0                0   

  AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY  \
0                        0.0                       0.0   

   AMT_REQ_CREDIT_BUREAU_WEEK  AMT_REQ_CREDIT_BUREAU_MON  \
0                         0.0                        0.0   

   AMT_REQ_CREDIT_BUREAU_QRT  AMT_REQ_CREDIT_BUREAU_YEAR  
0                        0.0                         0.0  

[1 rows x 122 columns]
0
[[-8.59771301e-05  0.00000000e+00  0.00000000e+00 ...  1.00000000e+00
   0.00000000e+00  1.00000000e+00]
 [ 2.5793139

In [21]:
# `data_client` is a second name for the processed test set held in `test`
# (no copy is made).
data_client = test

print(data_client)

[[-8.59771301e-05  0.00000000e+00  0.00000000e+00 ...  1.00000000e+00
   0.00000000e+00  1.00000000e+00]
 [ 2.57931390e-04  0.00000000e+00  1.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 9.45748431e-04  0.00000000e+00  1.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 ...
 [ 3.26713094e-02  0.00000000e+00  1.00000000e+00 ...  1.00000000e+00
   0.00000000e+00  1.00000000e+00]
 [ 3.28432637e-02  0.00000000e+00  0.00000000e+00 ...  1.00000000e+00
   0.00000000e+00  1.00000000e+00]
 [ 3.29292408e-02  0.00000000e+00  1.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]]


In [37]:
# Persist the processed client data; the context manager closes the file once
# the dump is done.
with open('data_client.pkl', 'wb') as fh:
    pickle.dump(data_client, fh)

In [22]:
# Reload the processed client data. pickle.load can execute arbitrary code:
# only load files produced by this notebook.
with open('data_client.pkl', 'rb') as fh:
    d = pickle.load(fh)

In [23]:
# Reload the trained classifier. pickle.load can execute arbitrary code: only
# load files produced by this notebook.
with open('clf_lgbm.pkl', 'rb') as fh:
    pickled_model = pickle.load(fh)

In [24]:
# Class probabilities from the reloaded model, one row per client in `d`.
prediction = pickled_model.predict_proba(d)

In [45]:
# Persist the predicted probabilities; the context manager closes the file
# once the dump is done.
with open('prediction.pkl', 'wb') as fh:
    pickle.dump(prediction, fh)

In [25]:
# Reload the predicted probabilities. pickle.load can execute arbitrary code:
# only load files produced by this notebook.
with open('prediction.pkl', 'rb') as fh:
    data_prediction = pickle.load(fh)

In [31]:
# Client id to look up in the following cells.
# NOTE(review): `id` shadows the Python builtin of the same name.
id=100005

In [30]:
# Display the test frame.
data_test

Unnamed: 0.1,Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,0,100001,0,0,0,1,0,135000.0,568800.0,20560.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,100005,0,1,0,1,0,99000.0,222768.0,17370.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,2,100013,0,1,1,1,0,202500.0,663264.0,69777.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,3,100028,0,0,0,1,2,315000.0,1575000.0,49018.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,4,100038,0,1,1,0,1,180000.0,625500.0,32067.0,...,0,0,0,0,,,,,,
5,5,100042,0,0,1,1,0,270000.0,959688.0,34600.5,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,2.0
6,6,100057,0,1,1,1,2,180000.0,499221.0,22117.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
7,7,100065,0,1,0,1,0,166500.0,180000.0,14220.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
8,8,100066,0,0,0,1,0,315000.0,364896.0,28957.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,5.0
9,9,100067,0,0,1,1,1,162000.0,45000.0,5337.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0


In [36]:
# Persist the test frame; the context manager closes the file once the dump
# is done.
with open('data_test.pkl', 'wb') as fh:
    pickle.dump(data_test, fh)

In [37]:
# Reload the test frame. pickle.load can execute arbitrary code: only load
# files produced by this notebook.
with open('data_test.pkl', 'rb') as fh:
    data_test_pickle = pickle.load(fh)

In [38]:
# Index labels of the rows whose SK_ID_CURR equals the chosen client id
# (an empty array when no row matches).
index=data_test_pickle[data_test_pickle["SK_ID_CURR"] == int(id)].index.values

In [39]:
# Display the matching index labels.
index

array([1], dtype=int64)

In [34]:
# Probabilities of the selected client, as nested lists.
# NOTE(review): `index` holds index labels of the test frame but is used here
# as row positions — this assumes the frame has a default 0..n-1 index; confirm.
prediction1 = data_prediction[index].tolist()

# Prints the full probability table, not only the selected client.
print(data_prediction)

[[0.94840613 0.05159387]
 [0.77757742 0.22242258]
 [0.9190166  0.0809834 ]
 [0.59234585 0.40765415]
 [0.17500526 0.82499474]
 [0.90738176 0.09261824]
 [0.88334999 0.11665001]
 [0.84064756 0.15935244]
 [0.97867659 0.02132341]
 [0.60762692 0.39237308]
 [0.52310621 0.47689379]
 [0.82774736 0.17225264]
 [0.46104173 0.53895827]
 [0.90368248 0.09631752]
 [0.61842759 0.38157241]
 [0.387104   0.612896  ]
 [0.90524646 0.09475354]
 [0.77304412 0.22695588]
 [0.40274843 0.59725157]
 [0.91070776 0.08929224]
 [0.97932855 0.02067145]
 [0.98665541 0.01334459]
 [0.40700379 0.59299621]
 [0.77347358 0.22652642]
 [0.72409366 0.27590634]
 [0.42865791 0.57134209]
 [0.44920133 0.55079867]
 [0.86529587 0.13470413]
 [0.74779614 0.25220386]
 [0.58366474 0.41633526]
 [0.96771386 0.03228614]
 [0.90210586 0.09789414]
 [0.0910349  0.9089651 ]
 [0.91065043 0.08934957]
 [0.88440188 0.11559812]
 [0.37015183 0.62984817]
 [0.97909669 0.02090331]
 [0.47001585 0.52998415]
 [0.93517989 0.06482011]
 [0.18641092 0.81358908]


In [35]:
# Show the probabilities extracted for the selected client.
print(prediction1)

[[0.7775774234452618, 0.2224225765547382]]
