In [12]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.preprocessing import Imputer, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, confusion_matrix, accuracy_score
%matplotlib inline

### Reading Train dataset

In [2]:
# Allow up to 500 rows to be rendered when a frame is displayed.
pd.options.display.max_rows = 500

# Load the training applications table and preview its first rows.
# NOTE(review): the path is absolute and machine-specific - adjust it when
# running the notebook anywhere else.
application_train_df = pd.read_csv(
    filepath_or_buffer="/home/jyothish/Projects/Home-Credit-Default-Risk-Prediction/data/home-credit-default-risk_data/application_train.csv"
)
application_train_df.head(n=5)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Categorical features
categorical_features = [feature for feature in application_train_df.columns if application_train_df[feature].dtypes == 'O' ]
print(categorical_features)

['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']


In [4]:
# Numerical features
numerical_features = [feature for feature in application_train_df.columns if application_train_df[feature].dtypes != 'O' ]
print(numerical_features)

['SK_ID_CURR', 'TARGET', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG', 'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEAR

### Filling missing data for numerical features

In [5]:
# Restrict the training frame to the numerical columns collected above.
application_train_relevant_df = application_train_df[numerical_features]

# Replace every missing value with the median of its column.
# NOTE(review): `Imputer` was deprecated in scikit-learn 0.20 and removed in
# 0.22; on newer versions `sklearn.impute.SimpleImputer(strategy='median')`
# is the replacement.
imputer = Imputer(strategy='median')

# fit_transform returns a bare NumPy array, so the labels are re-attached here.
# The original index is passed along with the column names so the rows stay
# aligned with application_train_df when frames are concatenated column-wise
# later on (a fresh 0..n-1 index would only match by coincidence).
filled_numerical_df = pd.DataFrame(
    data=imputer.fit_transform(application_train_relevant_df),
    index=application_train_relevant_df.index,
    columns=application_train_relevant_df.columns,
)



### Create dummies for categorical features

In [6]:
def create_dummy_df(num_df, cat_df, dummy_na):
    '''
    Append dummy (one-hot) columns for every column of cat_df to num_df.

    INPUT:
    num_df - pandas dataframe with numerical variables
    cat_df - pandas dataframe with categorical variables
    dummy_na - Bool holding whether you want to dummy NA vals of categorical columns or not

    OUTPUT:
    df - a dataframe that has the following characteristics:
            1. the columns of num_df followed by dummy columns for each of the
               categorical columns in cat_df (the first level of every
               column is dropped because drop_first=True)
            2. if dummy_na is True - it also contains dummy columns for the NaN values
            3. Use a prefix of the column name with an underscore (_) for separating

    A column whose values cannot be encoded is skipped with a warning instead
    of being dropped silently. When nothing could be encoded, num_df itself
    is returned unchanged.
    '''
    import warnings

    # Encode every column on its own, then concatenate once at the end;
    # growing the frame inside the loop copies it again on every iteration.
    encoded_parts = []
    for col in cat_df.columns:
        try:
            encoded_parts.append(pd.get_dummies(cat_df[col], prefix=col,
                                                prefix_sep='_', drop_first=True,
                                                dummy_na=dummy_na))
        except (TypeError, ValueError) as err:
            # Keep the best-effort behaviour, but make the loss of a feature visible.
            warnings.warn("Skipping column {!r}: could not create dummies ({})".format(col, err))

    if not encoded_parts:
        return num_df

    return pd.concat([num_df] + encoded_parts, axis=1)

In [7]:
# Attach dummy columns for the categorical features to the imputed numeric
# frame; missing categories do not get a dummy column of their own.
concat_df = create_dummy_df(
    num_df=filled_numerical_df,
    cat_df=application_train_df[categorical_features],
    dummy_na=False,
)

In [16]:
# Preview the first five rows of the combined numeric + dummy frame
concat_df.head(n=5)

Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,FONDKAPREMONT_MODE_reg oper spec account,HOUSETYPE_MODE_specific housing,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_Yes
0,100002.0,1.0,0.0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461.0,-637.0,...,0,0,0,0,0,0,0,1,0,0
1,100003.0,0.0,0.0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765.0,-1188.0,...,0,0,0,0,0,0,0,0,0,0
2,100004.0,0.0,0.0,67500.0,135000.0,6750.0,135000.0,0.010032,-19046.0,-225.0,...,0,0,0,0,0,0,0,0,0,0
3,100006.0,0.0,0.0,135000.0,312682.5,29686.5,297000.0,0.008019,-19005.0,-3039.0,...,0,0,0,0,0,0,0,0,0,0
4,100007.0,0.0,0.0,121500.0,513000.0,21865.5,513000.0,0.028663,-19932.0,-3038.0,...,0,0,0,0,0,0,0,0,0,0


### Feature engineering

###  Modelling and Evaluation

In [15]:
def fit_mod(concat_df, test_size=.3, rand_state=42):
    '''
    Train a LightGBM classifier on concat_df and report its hold-out accuracy.

    INPUT:
    concat_df - a dataframe holding all the variables of interest; it must
                contain the label column 'TARGET' and the id column
                'SK_ID_CURR', every remaining column is used as a feature
    test_size - a float between [0,1] about what proportion of data should
                be in the test dataset
    rand_state - an int that is provided as the random state for splitting
                 the data into training and test

    OUTPUT:
    accuracy - float - accuracy score of the fitted model on the test data
    '''

    # Separate features and label; the id column is not used as a feature
    X = np.array(concat_df.drop(columns=['TARGET', 'SK_ID_CURR']))
    y = np.array(concat_df['TARGET'].astype(int))

    # Split first, so that no statistic of the test rows can leak into training
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,
                                                        random_state=rand_state)

    # Normalize the data: the scaler learns min/max from the training rows
    # only, and the same transformation is then applied to the test rows
    scaler = MinMaxScaler(feature_range=(0, 1))
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = lgb.LGBMClassifier(n_estimators=10000, objective='binary',
                               class_weight='balanced', learning_rate=0.05,
                               reg_alpha=0.1, reg_lambda=0.1,
                               subsample=0.8, n_jobs=-1, random_state=50)
    model.fit(X_train, y_train)

    # Predict and score on the held-out rows only; predictions for the
    # training rows were computed before but never used
    y_test_preds = model.predict(X_test)

    return accuracy_score(y_test, y_test_preds)

# Fit the model on the prepared frame, keeping the default split settings
accuracy = fit_mod(concat_df=concat_df)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   20.9s finished


[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
('Accuracy Score for Random forest:', 0.8837557179092506)


In [None]:
#Print training and testing score
print("The accuracy on test data is: ".\
      format(accuracy))

### Reading other datasets

In [None]:
# Bureau dataframe
bureau_df = pd.read_csv("/home/jyothish/Projects/Home-Credit-Default-Risk-Prediction/data/home-credit-default-risk_data/bureau.csv")
bureau_df.head()

In [None]:
# Bureau Balance dataframe
bureau_balance_df = pd.read_csv("/home/jyothish/Projects/Home-Credit-Default-Risk-Prediction/data/home-credit-default-risk_data/bureau_balance.csv")
bureau_balance_df.head()

In [None]:
# Credit Card Balance dataframe
credit_card_balance_df = pd.read_csv("/home/jyothish/Projects/Home-Credit-Default-Risk-Prediction/data/home-credit-default-risk_data/credit_card_balance.csv")
credit_card_balance_df.head()

In [None]:
# HomeCredit Columns Description dataframe
HomeCredit_columns_description_df = pd.read_csv("/home/jyothish/Projects/Home-Credit-Default-Risk-Prediction/data/home-credit-default-risk_data/HomeCredit_columns_description.csv")
HomeCredit_columns_description_df.head()

In [None]:
# Installments Payments dataframe
installments_payments_df = pd.read_csv("/home/jyothish/Projects/Home-Credit-Default-Risk-Prediction/data/home-credit-default-risk_data/installments_payments.csv")
installments_payments_df.head()

In [None]:
# POS_CASH Balance dataframe
POS_CASH_balance_df = pd.read_csv("/home/jyothish/Projects/Home-Credit-Default-Risk-Prediction/data/home-credit-default-risk_data/POS_CASH_balance.csv")
POS_CASH_balance_df.head()

In [None]:
# Previous Application dataframe
previous_application_df = pd.read_csv("/home/jyothish/Projects/Home-Credit-Default-Risk-Prediction/data/home-credit-default-risk_data/previous_application.csv")
previous_application_df.head()

### Merge dataframes