## Data Preparation

In [1]:
""" fits the data from the pipeline
# Besides, we defined our own preprocessing functions : 
#       normalize() to process data that arent' with the same order of magnitude, 
#       nan() to drop NaN values
#       multiple_format() to one-hot encode our categorical features
""" 
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import make_scorer, f1_score
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from xgboost.sklearn import XGBClassifier
import pickle
%matplotlib inline

In [2]:
def read_data(path):
    data = pd.read_csv(path,
                            infer_datetime_format=True,
                            on_bad_lines='warn',
                            skip_blank_lines=True)
    try:
        df = data.sort_index()
        df = df.set_index("SK_ID_CURR")
    except:
        print("Unexpected error:", sys.exc_info()[0])
    print('\n', df.dtypes)
    return df

In [3]:
def nan(df):
    print("Process Nan...")
    df_numeric = df.select_dtypes(include=[np.number])
    numeric_cols = df_numeric.columns.values
    for col in numeric_cols:
        pct_missing = np.mean(df[col].isnull())
        print('{} - {}%'.format(col, round(pct_missing*100)))
        if (pct_missing < 4):                                             #* if NaN < 4% : replace by median
            med = df[col].median()
            df[col] = df[col].fillna(med)
    df_non_numeric = df.select_dtypes(exclude=[np.number])              #* Repeat process with non numerics variables
    non_numeric_cols = df_non_numeric.columns.values
    for col in non_numeric_cols:
        pct_missing = np.mean(df[col].isnull())
        print('{} - {}%'.format(col, round(pct_missing*100)))
        if pct_missing < 4:
            med = df[col].median()
            df[col] = df[col].fillna(med)
    print(df.shape)
    return df

def fix_typos(df):
    print("Fixing Typos...")
    obj = [col  for col, dt in df.dtypes.items() if dt == object]
    for col in obj:
        df[obj] = df[obj].str.replace(',', '.')
        df[obj] = df[obj].str.upper()
        df[obj] = df[obj].str.strip()
    print(df.shape)
    return df

def multiple_format(df, mult_var=None):                                 #* mult_var is a list
    print("Encoding categorical varible(s)...")
    if mult_var is not None:
        df = pd.get_dummies(data=df, columns=mult_var)
    print(df.shape)
    return df

def normalization(df):
    scaler = MinMaxScaler()
    scaler.fit_transform(df)
    return df

def suppressOutliers(df):
    clf = IsolationForest(random_state=42)
    param_grid = {'n_estimators': list(range(100, 1000, 10)), 
                'contamination': [0.005, 0.01, 0.02, 0.03, 0.05, 0.06, 0.07, 0.08], 
                'bootstrap': [True, False]}        

    grid_isol = RandomizedSearchCV(clf, 
                                    param_grid,
                                    scoring=custom_silhouette,              #? Davies Bouldin Score     or      Silhouette Score  
                                    refit=True,
                                    cv=3, 
                                    return_train_score=True)
    best_model = grid_isol.fit(df.values)
    custom_silhouette(best_model, df.values)
    custom_DBScrore(best_model, df.values)
    print('Optimum parameters', best_model.best_params_)
    y_pred = best_model.predict(df.values)
    train_clustered = df.assign(Cluster=y_pred)
    train_clustered = train_clustered.replace({-1: "Anomaly", 1: "Regular"})
    train_clustered["Cluster"].value_counts()
    # TO DO return value

def custom_silhouette(estimator, X):
      print("{}   -     ".format(round(silhouette_score(X, estimator.predict(X)), 4)), end = '')
      return np.mean(silhouette_score(X, estimator.predict(X)))

def custom_DBScrore(estimator, X):
      print(round(davies_bouldin_score(X, estimator.predict(X)), 4))
      return np.mean(davies_bouldin_score(X, estimator.predict(X)))
      
def data_prep(df, filename, mult_var=None):
    df = df.drop_duplicates(keep='last')            #* Keep only most recent duplicatas
    df = pd.get_dummies(data=df, columns=mult_var)
    df = nan(df)   
    df.to_csv("../data/interim/" + filename)                                 #* Process empty values based on several conditions
    df = normalization(df) 
    df = df.convert_dtypes()                        #* Assign good type for the modelling phase
    df = df.select_dtypes(exclude=['object'])       #* Remove Object and String columns who are irrelevant
    return df

In [21]:
cat_features = open("../data/features/cat_features.txt", "r")
cat_features

<_io.TextIOWrapper name='../data/features/cat_features.txt' mode='r' encoding='cp1252'>

In [26]:
def extract_processed_data():
    """
    """
    path = r'..\data\raw\application_train.csv'
    path_test = r'..\data\raw\application_test.csv'
    train, test = read_data(path), read_data(path_test)
    #y = train.TARGET
    X_train, X_test = train.iloc[:, 1:240], test.iloc[:, 1:240]
    #y_train, y_test = train.TARGET, train.TARGET
    cat_features = pickle.load(open("../data/features/cat_features.pkl",'rb'))
    num_features = pickle.load(open("../data/features/num_features.pkl",'rb'))
    pipeline = pickle.load(open('../models/pipe.pkl', 'rb'))
    #x_train =train.drop('TARGET', axis=1)
    ##### Extract preprocessed data right before normalizing to visualise later on
    data_cleaned = data_prep(train,mult_var=cat_features, filename="train_before_normalisation.csv")
    data_cleaned2 = data_prep(test,mult_var =cat_features, filename="test_before_normalisation.csv")

    # apply the pipeline to the training and test data
    print('preprocessing...')
    x_train_ = pipeline.named_steps["preprocessing"].fit_transform(X_train)
    x_test_ = pipeline.named_steps["preprocessing"].fit_transform(X_test)
    x_train_ = pd.DataFrame(x_train_)
    x_test_ = pd.DataFrame(x_test_)
    x_train_.to_csv("../data/processed/application_train.csv")
    x_test_.to_csv("../data/processed/application_test.csv")

extract_processed_data()


 TARGET                          int64
NAME_CONTRACT_TYPE             object
CODE_GENDER                    object
FLAG_OWN_CAR                   object
FLAG_OWN_REALTY                object
                               ...   
AMT_REQ_CREDIT_BUREAU_DAY     float64
AMT_REQ_CREDIT_BUREAU_WEEK    float64
AMT_REQ_CREDIT_BUREAU_MON     float64
AMT_REQ_CREDIT_BUREAU_QRT     float64
AMT_REQ_CREDIT_BUREAU_YEAR    float64
Length: 121, dtype: object

 NAME_CONTRACT_TYPE             object
CODE_GENDER                    object
FLAG_OWN_CAR                   object
FLAG_OWN_REALTY                object
CNT_CHILDREN                    int64
                               ...   
AMT_REQ_CREDIT_BUREAU_DAY     float64
AMT_REQ_CREDIT_BUREAU_WEEK    float64
AMT_REQ_CREDIT_BUREAU_MON     float64
AMT_REQ_CREDIT_BUREAU_QRT     float64
AMT_REQ_CREDIT_BUREAU_YEAR    float64
Length: 120, dtype: object
Process Nan...
TARGET - 0%
CNT_CHILDREN - 0%
AMT_INCOME_TOTAL - 0%
AMT_CREDIT - 0%
AMT_ANNUITY - 0%
AMT_GO

ValueError: A given column is not a column of the dataframe