## Export ou pipeline : preprocessing and XGBClassifier

In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost.sklearn import XGBClassifier
import pickle
import sys

In [2]:
# Some such as default would be binary features, but since
# they have a third class "unknown" we'll process them as non binary categorical
num_features = ["CNT_CHILDREN",	"AMT_INCOME_TOTAL",	"AMT_CREDIT",	"AMT_ANNUITY",	"AMT_GOODS_PRICE", "REGION_POPULATION_RELATIVE",	"DAYS_BIRTH",	"DAYS_EMPLOYED",
	"DAYS_REGISTRATION",	"DAYS_ID_PUBLISH",	"OWN_CAR_AGE",	"FLAG_MOBIL",	"FLAG_EMP_PHONE",	"FLAG_WORK_PHONE",	"FLAG_CONT_MOBILE",	"FLAG_PHONE",	"FLAG_EMAIL",
    "CNT_FAM_MEMBERS",	"REGION_RATING_CLIENT",	"REGION_RATING_CLIENT_W_CITY", "HOUR_APPR_PROCESS_START",	"REG_REGION_NOT_LIVE_REGION",	"REG_REGION_NOT_WORK_REGION",	
    "LIVE_REGION_NOT_WORK_REGION",	"REG_CITY_NOT_LIVE_CITY","REG_CITY_NOT_WORK_CITY",	"LIVE_CITY_NOT_WORK_CITY",
    "EXT_SOURCE_1",	"EXT_SOURCE_2",	"EXT_SOURCE_3",	"APARTMENTS_AVG",	"BASEMENTAREA_AVG",	"YEARS_BEGINEXPLUATATION_AVG",	"YEARS_BUILD_AVG",	
    "COMMONAREA_AVG",	"ELEVATORS_AVG",	"ENTRANCES_AVG",	"FLOORSMAX_AVG",	"FLOORSMIN_AVG",	"LANDAREA_AVG",	"LIVINGAPARTMENTS_AVG",	
    "LIVINGAREA_AVG",	"NONLIVINGAPARTMENTS_AVG",	"NONLIVINGAREA_AVG",	"APARTMENTS_MODE",	"BASEMENTAREA_MODE",	"YEARS_BEGINEXPLUATATION_MODE",	
    "YEARS_BUILD_MODE",	"COMMONAREA_MODE",	"ELEVATORS_MODE",	"ENTRANCES_MODE",	"FLOORSMAX_MODE",	"FLOORSMIN_MODE",	"LANDAREA_MODE",	
    "LIVINGAPARTMENTS_MODE",	"LIVINGAREA_MODE",	"NONLIVINGAPARTMENTS_MODE",	"NONLIVINGAREA_MODE",	"APARTMENTS_MEDI",	"BASEMENTAREA_MEDI",	
    "YEARS_BEGINEXPLUATATION_MEDI",	"YEARS_BUILD_MEDI",	"COMMONAREA_MEDI",
    "ELEVATORS_MEDI",	"ENTRANCES_MEDI",	"FLOORSMAX_MEDI",	"FLOORSMIN_MEDI",	"LANDAREA_MEDI",
    "LIVINGAPARTMENTS_MEDI",	"LIVINGAREA_MEDI",	"NONLIVINGAPARTMENTS_MEDI",	"NONLIVINGAREA_MEDI", "TOTALAREA_MODE",
    "OBS_30_CNT_SOCIAL_CIRCLE",	"DEF_30_CNT_SOCIAL_CIRCLE",	"OBS_60_CNT_SOCIAL_CIRCLE",	"DEF_60_CNT_SOCIAL_CIRCLE",	
    "DAYS_LAST_PHONE_CHANGE",	"FLAG_DOCUMENT_2",	"FLAG_DOCUMENT_3",	"FLAG_DOCUMENT_4",	"FLAG_DOCUMENT_5",	"FLAG_DOCUMENT_6",	
    "FLAG_DOCUMENT_7",	"FLAG_DOCUMENT_8",	"FLAG_DOCUMENT_9",	"FLAG_DOCUMENT_10",	"FLAG_DOCUMENT_11",	"FLAG_DOCUMENT_12",	
    "FLAG_DOCUMENT_13",	"FLAG_DOCUMENT_14",	"FLAG_DOCUMENT_15",	"FLAG_DOCUMENT_16",	"FLAG_DOCUMENT_17",	"FLAG_DOCUMENT_18",
    "FLAG_DOCUMENT_19",	"FLAG_DOCUMENT_20",	"FLAG_DOCUMENT_21",	"AMT_REQ_CREDIT_BUREAU_HOUR",	"AMT_REQ_CREDIT_BUREAU_DAY",
    "AMT_REQ_CREDIT_BUREAU_WEEK",	"AMT_REQ_CREDIT_BUREAU_MON",	"AMT_REQ_CREDIT_BUREAU_QRT",	"AMT_REQ_CREDIT_BUREAU_YEAR"	
	]

cat_features = ["NAME_CONTRACT_TYPE","CODE_GENDER", "NAME_TYPE_SUITE","NAME_INCOME_TYPE",
                    "NAME_EDUCATION_TYPE","NAME_FAMILY_STATUS","NAME_HOUSING_TYPE", "OCCUPATION_TYPE", 
                    "WEEKDAY_APPR_PROCESS_START","ORGANIZATION_TYPE", "FONDKAPREMONT_MODE", "HOUSETYPE_MODE",
                    "WALLSMATERIAL_MODE", "FLAG_OWN_CAR", "FLAG_OWN_REALTY", "EMERGENCYSTATE_MODE"]

In [5]:
def read_data(path):
    data = pd.read_csv(path,
                            infer_datetime_format=True,
                            on_bad_lines='warn',
                            skip_blank_lines=True)
    try:
        df = data.sort_index()
        df = df.set_index("SK_ID_CURR")
    except:
        print("Unexpected error:", sys.exc_info()[0])
    print('\n', df.dtypes)
    return df

In [24]:
path = r'..\data\raw\application_train.csv'
path_test = r'..\data\raw\application_test.csv'
train = read_data(path)
y = train.TARGET
train, test = read_data(path), read_data(path_test)
X_train, X_test = train.iloc[:, 1:240], test.iloc[:, 1:240]
y_train, y_test = train.TARGET, train.TARGET

categorical_transformer = Pipeline(
    [
        ('imputer_cat', SimpleImputer(strategy = 'constant',
          fill_value = 'missing')),
        ('onehot', OneHotEncoder(handle_unknown = 'ignore'))
    ]
)

numeric_transformer = Pipeline(
    steps=[
        ('imputer_num', SimpleImputer(strategy = 'median')),
        ('scaler', StandardScaler())
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('categoricals', categorical_transformer, cat_features),
        ('numericals', numeric_transformer, num_features)
    ],
    remainder = 'drop'
)

pipeline = Pipeline(
        [
            ('preprocessing', preprocessor),
            ('clf', XGBClassifier(scale_pos_weight=(1 - y.mean()), n_jobs=-1))
        ]
)


print('fitting preprocessor')

print('fitting one hot encoder...')
pipeline.named_steps['preprocessing'].transformers[0][1]\
   .named_steps['onehot']\
   .fit(X_train[cat_features],y_train)

print('fitting imputer for categocial features')
pipeline.named_steps['preprocessing'].transformers[0][1]\
    .named_steps['imputer_cat']\
    .fit(X_train[cat_features],y_train)


print('fitting imputer for numerical features')
pipeline.named_steps['preprocessing'].transformers[1][1]\
    .named_steps['imputer_num']\
    .fit(X_train[num_features],y_train)

print('fitting standard scaler...')
pipeline.named_steps['preprocessing'].transformers[1][1]\
   .named_steps['scaler']\
   .fit(X_train[num_features],y_train)


pickle.dump(pipeline, open('../models/pipe.pkl','wb'))
print('model saved. OK')



 TARGET                          int64
NAME_CONTRACT_TYPE             object
CODE_GENDER                    object
FLAG_OWN_CAR                   object
FLAG_OWN_REALTY                object
                               ...   
AMT_REQ_CREDIT_BUREAU_DAY     float64
AMT_REQ_CREDIT_BUREAU_WEEK    float64
AMT_REQ_CREDIT_BUREAU_MON     float64
AMT_REQ_CREDIT_BUREAU_QRT     float64
AMT_REQ_CREDIT_BUREAU_YEAR    float64
Length: 121, dtype: object

 TARGET                          int64
NAME_CONTRACT_TYPE             object
CODE_GENDER                    object
FLAG_OWN_CAR                   object
FLAG_OWN_REALTY                object
                               ...   
AMT_REQ_CREDIT_BUREAU_DAY     float64
AMT_REQ_CREDIT_BUREAU_WEEK    float64
AMT_REQ_CREDIT_BUREAU_MON     float64
AMT_REQ_CREDIT_BUREAU_QRT     float64
AMT_REQ_CREDIT_BUREAU_YEAR    float64
Length: 121, dtype: object

 NAME_CONTRACT_TYPE             object
CODE_GENDER                    object
FLAG_OWN_CAR                

In [15]:
# Some such as default would be binary features, but since
# they have a third class "unknown" we'll process them as non binary categorical
num_features = ["CNT_CHILDREN",	"AMT_INCOME_TOTAL",	"AMT_CREDIT",	"AMT_ANNUITY",	"AMT_GOODS_PRICE", "REGION_POPULATION_RELATIVE",	"DAYS_BIRTH",	"DAYS_EMPLOYED",
	"DAYS_REGISTRATION",	"DAYS_ID_PUBLISH",	"OWN_CAR_AGE",	"FLAG_MOBIL",	"FLAG_EMP_PHONE",	"FLAG_WORK_PHONE",	"FLAG_CONT_MOBILE",	"FLAG_PHONE",	"FLAG_EMAIL",
    "CNT_FAM_MEMBERS",	"REGION_RATING_CLIENT",	"REGION_RATING_CLIENT_W_CITY", "HOUR_APPR_PROCESS_START",	"REG_REGION_NOT_LIVE_REGION",	"REG_REGION_NOT_WORK_REGION",	
    "LIVE_REGION_NOT_WORK_REGION",	"REG_CITY_NOT_LIVE_CITY","REG_CITY_NOT_WORK_CITY",	"LIVE_CITY_NOT_WORK_CITY",
    "EXT_SOURCE_1",	"EXT_SOURCE_2",	"EXT_SOURCE_3",	"APARTMENTS_AVG",	"BASEMENTAREA_AVG",	"YEARS_BEGINEXPLUATATION_AVG",	"YEARS_BUILD_AVG",	
    "COMMONAREA_AVG",	"ELEVATORS_AVG",	"ENTRANCES_AVG",	"FLOORSMAX_AVG",	"FLOORSMIN_AVG",	"LANDAREA_AVG",	"LIVINGAPARTMENTS_AVG",	
    "LIVINGAREA_AVG",	"NONLIVINGAPARTMENTS_AVG",	"NONLIVINGAREA_AVG",	"APARTMENTS_MODE",	"BASEMENTAREA_MODE",	"YEARS_BEGINEXPLUATATION_MODE",	
    "YEARS_BUILD_MODE",	"COMMONAREA_MODE",	"ELEVATORS_MODE",	"ENTRANCES_MODE",	"FLOORSMAX_MODE",	"FLOORSMIN_MODE",	"LANDAREA_MODE",	
    "LIVINGAPARTMENTS_MODE",	"LIVINGAREA_MODE",	"NONLIVINGAPARTMENTS_MODE",	"NONLIVINGAREA_MODE",	"APARTMENTS_MEDI",	"BASEMENTAREA_MEDI",	
    "YEARS_BEGINEXPLUATATION_MEDI",	"YEARS_BUILD_MEDI",	"COMMONAREA_MEDI",
    "ELEVATORS_MEDI",	"ENTRANCES_MEDI",	"FLOORSMAX_MEDI",	"FLOORSMIN_MEDI",	"LANDAREA_MEDI",
    "LIVINGAPARTMENTS_MEDI",	"LIVINGAREA_MEDI",	"NONLIVINGAPARTMENTS_MEDI",	"NONLIVINGAREA_MEDI", "TOTALAREA_MODE",
    "OBS_30_CNT_SOCIAL_CIRCLE",	"DEF_30_CNT_SOCIAL_CIRCLE",	"OBS_60_CNT_SOCIAL_CIRCLE",	"DEF_60_CNT_SOCIAL_CIRCLE",	
    "DAYS_LAST_PHONE_CHANGE",	"FLAG_DOCUMENT_2",	"FLAG_DOCUMENT_3",	"FLAG_DOCUMENT_4",	"FLAG_DOCUMENT_5",	"FLAG_DOCUMENT_6",	
    "FLAG_DOCUMENT_7",	"FLAG_DOCUMENT_8",	"FLAG_DOCUMENT_9",	"FLAG_DOCUMENT_10",	"FLAG_DOCUMENT_11",	"FLAG_DOCUMENT_12",	
    "FLAG_DOCUMENT_13",	"FLAG_DOCUMENT_14",	"FLAG_DOCUMENT_15",	"FLAG_DOCUMENT_16",	"FLAG_DOCUMENT_17",	"FLAG_DOCUMENT_18",
    "FLAG_DOCUMENT_19",	"FLAG_DOCUMENT_20",	"FLAG_DOCUMENT_21",	"AMT_REQ_CREDIT_BUREAU_HOUR",	"AMT_REQ_CREDIT_BUREAU_DAY",
    "AMT_REQ_CREDIT_BUREAU_WEEK",	"AMT_REQ_CREDIT_BUREAU_MON",	"AMT_REQ_CREDIT_BUREAU_QRT",	"AMT_REQ_CREDIT_BUREAU_YEAR"	
	]

cat_features = ["NAME_CONTRACT_TYPE","CODE_GENDER", "NAME_TYPE_SUITE","NAME_INCOME_TYPE",
                    "NAME_EDUCATION_TYPE","NAME_FAMILY_STATUS","NAME_HOUSING_TYPE", "OCCUPATION_TYPE", 
                    "WEEKDAY_APPR_PROCESS_START","ORGANIZATION_TYPE", "FONDKAPREMONT_MODE", "HOUSETYPE_MODE",
                    "WALLSMATERIAL_MODE", "FLAG_OWN_CAR", "FLAG_OWN_REALTY", "EMERGENCYSTATE_MODE"]


import os
import pickle

# Check if the directory exists
if not os.path.exists("../data/features"):
    os.makedirs("../data/features")
file_path = "../data/features/num_features.pkl"
file_path2 = "../data/features/cat_features.pkl"

if not os.path.isfile(file_path):
    with open(file_path, "wb") as f:
        pickle.dump(num_features, f)

if not os.path.isfile(file_path2):
    with open(file_path2, "wb") as f:
        pickle.dump(cat_features, f)


'def save_file(path, liste):\n    file = open(path,\'w\')\n    for item in num_features:\n        file.write(item+"\n")\n    file.close()\n    \nsave_file(\'../data/features/num_features.txt\', num_features)\nsave_file(\'../data/features/cat_features.txt\', cat_features)'