In [1]:
import time
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [9]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

In [10]:
from sklearn.model_selection import train_test_split

In [2]:
from pandas.core.common import SettingWithCopyWarning
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
label_encoder = preprocessing.LabelEncoder()
oh_encoder = preprocessing.OneHotEncoder()
oh_columns = set()

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

In [3]:
pd.options.display.float_format = '{:,.2f}'.format

In [4]:
df_train = pd.read_csv('data/application_train.csv')

In [5]:
df_test = pd.read_csv('data/application_test.csv')

In [6]:
def data_transform(df_train):
    oh_columns = set()
    #CAR_STATUS
    df_train['CAR_STATUS'] = pd.qcut(df_train['OWN_CAR_AGE'], 3, labels=['NEW', 'OLD', 'Vintage'])
    df_train['CAR_STATUS']=(df_train['CAR_STATUS'].cat.add_categories('NO').fillna('NO'))
    df_train['CAR_STATUS'] = label_encoder.fit_transform(df_train['CAR_STATUS'])
    df_train.drop(['OWN_CAR_AGE','FLAG_OWN_CAR'], axis = 1, inplace = True)

    # Normalizing Columns for AVG, MODE, MEDI
    norm_columns = list(df_train.loc[:, df_train.columns.str.endswith('_AVG')].columns)
    for column in norm_columns:
        column_AVG = column
        column_MEDI = column.replace("_AVG","_MEDI")
        column_MODE = column.replace("_AVG","_MODE")
        column_NORM = column.replace("_AVG","_NORM")
        df_train[column_NORM] = df_train[[column_AVG,column_MEDI,column_MODE]].mean(axis=1)
        df_train.drop([column_AVG, column_MEDI, column_MODE], axis = 1, inplace = True)

    # NAME_CONTRACT_TYPE
    df_train['NAME_CONTRACT_TYPE'] = label_encoder.fit_transform(df_train['NAME_CONTRACT_TYPE'])


    #CODE_GENDER
    #Remove XNA
    df_train = df_train[df_train['CODE_GENDER'] != 'XNA']
    df_train['CODE_GENDER'].value_counts()
    df_train['CODE_GENDER'] = label_encoder.fit_transform(df_train['CODE_GENDER'])

    #Converting all flag columns
    flag_columns = list(df_train.loc[:, df_train.columns.str.startswith('FLAG_')].columns)
    flag_columns.extend(['REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 
                    'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY'])
    for column in flag_columns:
        df_train[column] = label_encoder.fit_transform(df_train[column])


    #converting all named & categorical columns
    name_columns = df_train.loc[:, df_train.columns.str.startswith('NAME_')].columns
    df_train['NAME_TYPE_SUITE'].replace(['Other_A', 'Other_B'], 'Other', inplace = True)
    #Remove unknown?
    df_train = df_train[df_train['NAME_FAMILY_STATUS'] != 'Unknown']
    #Civil Marriage is Marriage
    df_train['NAME_FAMILY_STATUS'].replace('Civil marriage', 'Married', inplace = True)
    # One Hot encoding?
    oh_columns.add('NAME_TYPE_SUITE')
    oh_columns.add('NAME_INCOME_TYPE')
    oh_columns.add('NAME_EDUCATION_TYPE')
    oh_columns.add('NAME_FAMILY_STATUS')
    oh_columns.add('NAME_HOUSING_TYPE')
    oh_columns.add('REGION_RATING_CLIENT')
    oh_columns.add('REGION_RATING_CLIENT_W_CITY')
    oh_columns.add('FONDKAPREMONT_MODE')
    oh_columns.add('HOUSETYPE_MODE')
    oh_columns.add('WALLSMATERIAL_MODE')  
    df_train = pd.get_dummies(df_train, columns = list(oh_columns))

    #Children count
    df_train['CNT_CHILDREN_NORM'] = min_max_scaler.fit_transform(df_train['CNT_CHILDREN'].values.reshape(-1, 1))
    df_train.drop(['CNT_CHILDREN'], axis = 1, inplace = True)

    #AMOUNT Columns
    #Retain 99%
    df_train = df_train[df_train['AMT_INCOME_TOTAL'] < df_train.AMT_INCOME_TOTAL.quantile(0.99)]
    df_train['AMT_INCOME_TOTAL_NORM'] = min_max_scaler.fit_transform(df_train['AMT_INCOME_TOTAL'].values.reshape(-1, 1))
    df_train.drop(['AMT_INCOME_TOTAL'], axis = 1, inplace = True)

    df_train = df_train[df_train['AMT_CREDIT'] < df_train.AMT_CREDIT.quantile(0.99)]
    df_train['AMT_CREDIT_NORM'] = min_max_scaler.fit_transform(df_train['AMT_CREDIT'].values.reshape(-1, 1))
    df_train.drop(['AMT_CREDIT'], axis = 1, inplace = True)

    df_train['AMT_ANNUITY'] = df_train['AMT_ANNUITY'].fillna(df_train['AMT_ANNUITY'].median())
    df_train = df_train[df_train['AMT_ANNUITY'] < df_train.AMT_ANNUITY.quantile(0.99)]
    df_train['AMT_ANNUITY_NORM'] = min_max_scaler.fit_transform(df_train['AMT_ANNUITY'].values.reshape(-1, 1))
    df_train.drop(['AMT_ANNUITY'], axis = 1, inplace = True)

    df_train['AMT_GOODS_PRICE'] = df_train['AMT_GOODS_PRICE'].fillna(df_train['AMT_GOODS_PRICE'].median())
    df_train = df_train[df_train['AMT_GOODS_PRICE'] < df_train.AMT_GOODS_PRICE.quantile(0.99)]
    df_train['AMT_GOODS_PRICE_NORM'] = min_max_scaler.fit_transform(df_train['AMT_GOODS_PRICE'].values.reshape(-1, 1))
    df_train.drop(['AMT_GOODS_PRICE'], axis = 1, inplace = True)

    #Days_BIRTH
    df_train['YEARS_BIRTH'] = df_train['DAYS_BIRTH'].abs()/365
    df_train.drop(['DAYS_BIRTH'], axis = 1, inplace = True)
    df_train['YEARS_BIRTH_NORM'] = min_max_scaler.fit_transform(df_train['YEARS_BIRTH'].values.reshape(-1, 1))
    df_train.drop(['YEARS_BIRTH'], axis = 1, inplace = True)

    #days_employed
    df_train['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)
    df_train['DAYS_EMPLOYED_NORM'] = min_max_scaler.fit_transform(df_train['DAYS_EMPLOYED'].values.reshape(-1, 1))
    df_train.drop(['DAYS_EMPLOYED'], axis = 1, inplace = True)

    df_train['DAYS_REGISTRATION_NORM'] = min_max_scaler.fit_transform(df_train['DAYS_REGISTRATION'].values.reshape(-1, 1))
    df_train.drop(['DAYS_REGISTRATION'], axis = 1, inplace = True)

    df_train['DAYS_ID_PUBLISH_NORM'] = min_max_scaler.fit_transform(df_train['DAYS_ID_PUBLISH'].values.reshape(-1, 1))
    df_train.drop(['DAYS_ID_PUBLISH'], axis = 1, inplace = True)

    #Normalize some more columns
    Normalize_columns = ['AMT_REQ_CREDIT_BUREAU_HOUR',
    'AMT_REQ_CREDIT_BUREAU_DAY',
    'AMT_REQ_CREDIT_BUREAU_WEEK',
    'AMT_REQ_CREDIT_BUREAU_MON',
    'AMT_REQ_CREDIT_BUREAU_QRT',
    'AMT_REQ_CREDIT_BUREAU_YEAR',
    'OBS_30_CNT_SOCIAL_CIRCLE',
    'DEF_30_CNT_SOCIAL_CIRCLE',
    'OBS_60_CNT_SOCIAL_CIRCLE',
    'DEF_60_CNT_SOCIAL_CIRCLE',
    'DAYS_LAST_PHONE_CHANGE',
    'CNT_FAM_MEMBERS'
    ]

    for column in Normalize_columns:
        df_train['{}_NORM'.format(column)] = min_max_scaler.fit_transform(df_train[column].values.reshape(-1, 1))
        df_train.drop([column], axis = 1, inplace = True)


    #drop occupation type
    df_train.drop(['OCCUPATION_TYPE'], axis = 1, inplace = True)

    #HOUR APPR PRocess start
    #convert to day (6am [6] to 10pm [22]) and night (10pm [22] to 6am [6])
    df_train['HOUR_APPR_PROCESS_DAY'] = (df_train['HOUR_APPR_PROCESS_START'] > 6) & (df_train['HOUR_APPR_PROCESS_START'] < 22)
    df_train.drop(['HOUR_APPR_PROCESS_START'], axis = 1, inplace = True)
    df_train['HOUR_APPR_PROCESS_DAY'] = label_encoder.fit_transform(df_train['HOUR_APPR_PROCESS_DAY'])

    #drop week & organization type
    df_train.drop(['WEEKDAY_APPR_PROCESS_START'], axis = 1, inplace = True)
    df_train.drop(['ORGANIZATION_TYPE'], axis = 1, inplace = True)

    df_train['EMERGENCYSTATE_MODE'] = label_encoder.fit_transform(df_train['EMERGENCYSTATE_MODE'])

    df_train.fillna(0, inplace=True)
    return df_train

In [7]:
df_transformed_train = data_transform(df_train)

In [8]:
df_transformed_test = data_transform(df_test)

In [12]:
y = df_transformed['TARGET']
x = df_transformed.drop(['TARGET'],axis=1)

NameError: name 'df_rfe_transformed' is not defined

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x,y,  test_size=0.2, random_state=123)

In [None]:
algorithms = [
    ('LR',LogisticRegression(max_iter=500)),
    ('RFC',RandomForestClassifier()),
    ('LSCV',LinearSVC(max_iter=500)),
    ('GBC',GradientBoostingClassifier()),
    ('KNC',KNeighborsClassifier(n_neighbors=5)),
    ('SGD',SGDClassifier()),
    ('DTC',DecisionTreeClassifier())
]

In [None]:
cm_labels = np.unique(y)

In [None]:
#UNI
for name,algo in algorithms:
    print ()
    print ("============== Running {} ======".format(algo))
    t1 = time.perf_counter()
    model = algo.fit (x_train, np.ravel(y_train))
    t2 = time.perf_counter()
    y_pred = model.predict(x_test)
    t3 = time.perf_counter()
    
    cm_array = confusion_matrix(y_test, y_pred)
    cm_df = pd.DataFrame(cm_array, index=cm_labels, columns=cm_labels)
    
    print ("Training time : {:,.1f} secs ({:,.1f} ms)".format ( (t2-t1), (t2-t1)*1e3))
    print ("Prediction time : {:,.1f} secs ({:,.1f} ms)".format ( (t3-t2), (t3-t2)*1e3))
    print ("Training score : ", model.score(x_train, y_train))
    print ("Testing score : ", model.score(x_test, y_test))
    print ("Confusion matrix : ")
    print (cm_df)

## Automated Feature Selection

In [13]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

In [14]:
y = df_transformed_train['TARGET']
x = df_transformed_train.drop(['TARGET'],axis=1)

In [15]:
target = y
features = x

In [16]:
select_univariate = SelectKBest(f_regression, k=50).fit(features, target)

In [19]:
feature_mask = select_univariate.get_support()
feature_mask

array([False,  True,  True, False,  True, False,  True,  True, False,
        True, False, False, False, False,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
        True, False, False, False, False,  True, False, False,  True,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False,  True, False,  True, False,  True, False, False, False,
        True,  True, False, False,  True,  True, False,  True, False,
        True, False, False,  True,  True, False, False, False, False,
       False, False,  True,  True,  True,  True,  True, False, False,
       False, False, False, False, False,  True, False,  True,  True,
       False, False]

In [20]:
features.columns[feature_mask]

Index(['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'REGION_POPULATION_RELATIVE',
       'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_PHONE',
       'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY',
       'LIVE_CITY_NOT_WORK_CITY', 'EXT_SOURCE_1', 'EXT_SOURCE_2',
       'EXT_SOURCE_3', 'TOTALAREA_MODE', 'EMERGENCYSTATE_MODE',
       'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_6', 'APARTMENTS_NORM',
       'BASEMENTAREA_NORM', 'YEARS_BEGINEXPLUATATION_NORM', 'YEARS_BUILD_NORM',
       'ELEVATORS_NORM', 'ENTRANCES_NORM', 'FLOORSMAX_NORM', 'FLOORSMIN_NORM',
       'LANDAREA_NORM', 'LIVINGAPARTMENTS_NORM', 'LIVINGAREA_NORM',
       'FONDKAPREMONT_MODE_reg oper account',
       'NAME_HOUSING_TYPE_House / apartment', 'NAME_HOUSING_TYPE_With parents',
       'WALLSMATERIAL_MODE_Panel', 'NAME_FAMILY_STATUS_Single / not married',
       'REGION_RATING_CLIENT_1', 'REGION_RATING_CLIENT_3',
       'NAME_INCOME_TYPE_Pensioner', 'NAME_INCOME_TYPE_State servant',
       'NAME_INCOME_TYPE_Working', 'REGION_RATING_CLIE

In [17]:
select_univariate.scores_

array([1.51638119e+00, 2.91395917e+02, 9.66297465e+02, 1.37692114e+01,
       3.49865951e+02, 8.98325141e-02, 6.77922736e+02, 2.53919689e+02,
       1.17905215e-02, 1.58769853e+02, 1.76608286e-01, 1.25752129e+01,
       2.43896855e+01, 6.99569604e+00, 5.92440007e+02, 7.76701588e+02,
       3.14578033e+02, 1.20607260e+03, 7.59053953e+03, 4.27795382e+03,
       4.59739383e+02, 4.92676757e+02, 7.49149235e+00, 5.97214295e+02,
       1.97646892e+00, 2.97959055e-03, 2.62494227e+02, 2.91318624e-01,
       1.15778072e+01, 4.52214464e+00, 6.28841563e-01, 6.67234147e+00,
       8.98325141e-02, 3.28656966e+01, 2.58035780e+01, 1.14639434e+01,
       3.52747826e+01, 2.37147822e+00, 1.30533493e+01, 4.92928375e-01,
       7.30840626e-01, 4.21707947e+00, 7.56970765e+01, 4.32785126e+02,
       3.00303991e+02, 4.69327930e+02, 2.89396690e+02, 1.12466616e+02,
       3.34245117e+02, 4.12085373e+02, 6.75618168e+02, 3.05560340e+02,
       1.55245300e+02, 2.27398703e+02, 4.35254261e+02, 6.81346792e+00,
      

In [21]:
pd.DataFrame({'FeatureName': features.columns, 
              'Score': select_univariate.scores_}).sort_values(by='Score', 
                                                               ascending=False)

Unnamed: 0,FeatureName,Score
18,EXT_SOURCE_2,7590.54
19,EXT_SOURCE_3,4277.95
111,YEARS_BIRTH_NORM,1870.33
112,DAYS_EMPLOYED_NORM,1378.25
17,EXT_SOURCE_1,1206.07
...,...,...
61,NAME_HOUSING_TYPE_Co-op apartment,0.04
81,NAME_FAMILY_STATUS_Separated,0.03
115,AMT_REQ_CREDIT_BUREAU_HOUR_NORM,0.02
8,FLAG_CONT_MOBILE,0.01


In [22]:
uni_df = pd.DataFrame({'Univariate Method': features.columns[feature_mask]})
uni_df

Unnamed: 0,Univariate Method
0,NAME_CONTRACT_TYPE
1,CODE_GENDER
2,REGION_POPULATION_RELATIVE
3,FLAG_EMP_PHONE
4,FLAG_WORK_PHONE
5,FLAG_PHONE
6,REG_CITY_NOT_LIVE_CITY
7,REG_CITY_NOT_WORK_CITY
8,LIVE_CITY_NOT_WORK_CITY
9,EXT_SOURCE_1


In [23]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

linear_regression = LinearRegression()

In [None]:
rfe = RFE(estimator = linear_regression, 
          n_features_to_select = 5, 
          step = 1)

In [None]:
rfe.fit(features,target)

In [None]:
rfe_features = features.columns[rfe.support_]
rfe_features

In [None]:
pd.DataFrame({'FeatureName': features.columns, 
              'Rank': rfe.ranking_}).sort_values(by='Rank')

In [None]:
rfe_df = pd.DataFrame({'RFE Method': rfe_features})

rfe_df

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector

In [None]:
feature_filtered = feature_selector.fit(features, target)

In [None]:
back_df = pd.DataFrame({'Backward Method': backward_features})
back_df

In [None]:
feature_selector = SequentialFeatureSelector(LinearRegression(),  
                                             k_features=5,
                                             forward=True,
                                             scoring='neg_mean_squared_error',
                                             cv=4)

In [None]:
feature_filtered = feature_selector.fit(features, target)

In [None]:
forward_features = list(feature_filtered.k_feature_names_)
forward_features

In [None]:
forw_df = pd.DataFrame({'Forward Method': forward_features})
forw_df

In [None]:
from sklearn.linear_model import  Lasso

In [None]:
lasso = Lasso(alpha=1.0)
lasso.fit(features, target)

In [None]:
lasso_coef = pd.DataFrame({'Feature': features.columns, 
                           'LassoCoef': lasso.coef_}).sort_values(by = 'LassoCoef',
                                                                  ascending =False)
lasso_coef

In [None]:
lasso_coef['LassoCoef'] = abs(lasso_coef['LassoCoef'])
lasso_coef.sort_values(by='LassoCoef', ascending=False)

In [None]:
lasso_df = lasso_coef.sort_values(by='LassoCoef', ascending=False).head(5)
lasso_df = pd.DataFrame({'Lasso Method': lasso_df['Feature'].values})
lasso_df

In [None]:
comp_selected_col_df = [uni_df, 
                        rfe_df, 
                        back_df, 
                        forw_df, 
                        lasso_df]

final_df = pd.concat(comp_selected_col_df, axis=1)

In [None]:
final_df[:5]