In [9]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.preprocessing import Imputer, MinMaxScaler, StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, confusion_matrix, accuracy_score
%matplotlib inline

### Reading Train dataset

In [10]:
# Allow up to 500 rows to be shown when a dataframe is displayed
pd.set_option('display.max_rows', 500)
# NOTE(review): absolute, machine-specific path - consider building it from a
# configurable data directory so the notebook runs on other machines
application_train_df = pd.read_csv("/home/jyothish/Projects/Home-Credit-Default-Risk-Prediction/data/home-credit-default-risk_data/application_train.csv")
# Preview the first rows of the training table
application_train_df.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


### Filling missing data for numerical features

In [11]:
def clean_data(df):
    """
    Median-impute the missing values of the numerical columns of a dataframe.

    INPUT:
    df - dataframe that has missing values

    OUTPUT:
    filled_numerical_df - new dataframe holding only the numerical (numeric and
                          boolean) columns of df, cast to float64, with every
                          NaN replaced by the median of its column. The row
                          index of df is preserved and df itself is not
                          modified. A column without a single observed value
                          has no median and is returned unchanged (all NaN).
    """

    # Numerical features: object (string) columns are left out. The cast to
    # float64 keeps the all-float output the previous sklearn imputer produced.
    numerical_df = df.select_dtypes(include=['number', 'bool']).astype('float64')

    # Fill missing values with median of column. DataFrame.median skips NaN by
    # default, so each median is computed from the observed values only.
    column_medians = numerical_df.median()
    filled_numerical_df = numerical_df.fillna(column_medians)

    return filled_numerical_df

### Feature engineering / Polynomial features

In [12]:
# Select the four columns that feed the polynomial expansion and fill their
# missing values through clean_data in a single step
poly_features_df = clean_data(
    application_train_df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']]
)



In [13]:
# Create the polynomial object with specified degree
poly_transformer = PolynomialFeatures(degree = 3)

# Learn the feature combinations and expand the inputs in one pass
poly_features = poly_transformer.fit_transform(poly_features_df)

# Names of the input columns, in the order they were selected for poly_features_df
poly_input_names = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']

# scikit-learn 1.0 introduced get_feature_names_out and 1.2 removed the older
# get_feature_names, so use whichever method this installation provides
if hasattr(poly_transformer, 'get_feature_names_out'):
    poly_feature_names = poly_transformer.get_feature_names_out(poly_input_names)
else:
    poly_feature_names = poly_transformer.get_feature_names(poly_input_names)

# Wrap the expanded array in a dataframe labelled with the generated names
poly_features = pd.DataFrame(poly_features, columns = poly_feature_names)

In [14]:
# Dropping the constant '1' column and the four untransformed input columns.
# The result is rebound instead of mutated in place, and errors='ignore' lets
# the cell be re-run without a KeyError once the columns are already gone.
poly_features = poly_features.drop(
    columns=['1', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'],
    errors='ignore',
)

### Create dummies for categorical features

In [15]:
def create_dummy_df(num_df, cat_df, dummy_na):

    """
    Append dummy (one-hot) columns for every categorical column to num_df.

    INPUT:
    num_df - pandas dataframe with numerical variables
    cat_df - pandas dataframe with categorical variables
    dummy_na - Bool holding whether you want to dummy NA vals of categorical columns or not


    OUTPUT:
    df - a new dataframe that has the following characteristics:
            1. all columns of num_df, followed by dummy columns for each of
               the categorical columns in cat_df (the first level of every
               column is dropped)
            2. if dummy_na is True - it also contains dummy columns for the NaN values
            3. Use a prefix of the column name with an underscore (_) for separating
         When cat_df has no usable column, num_df itself is returned.

    A column that cannot be encoded is skipped, as before, but a RuntimeWarning
    naming the column is now emitted instead of hiding the failure.
    """
    import warnings

    dummy_frames = []
    for col in cat_df.columns:

        try:
            dummy_frames.append(pd.get_dummies(cat_df[col], prefix=col,
                                               prefix_sep='_', drop_first=True,
                                               dummy_na=dummy_na))
        except Exception as error:
            # Best-effort encoding: keep going with the remaining columns, but
            # report the skipped one. Catching Exception (not a bare except)
            # lets KeyboardInterrupt / SystemExit propagate.
            warnings.warn(
                "Skipping categorical column {!r}: {}".format(col, error),
                RuntimeWarning,
                stacklevel=2,
            )
            continue

    if not dummy_frames:
        return num_df

    # Concatenate once instead of re-copying the growing frame for every column
    return pd.concat([num_df] + dummy_frames, axis=1)

In [17]:
# Names of the application_train_df columns whose dtype is object
# (the categorical features)
categorical_features = [
    column_name
    for column_name, column_dtype in application_train_df.dtypes.items()
    if column_dtype == 'O'
]

# Numerical part of application_train_df with its missing values filled by clean_data
filled_numerical_train_df = clean_data(application_train_df)

# Numerical part extended with dummy columns for the categorical features
# (no separate dummy columns for NaN values)
concat_train_df = create_dummy_df(
    filled_numerical_train_df,
    application_train_df[categorical_features],
    dummy_na=False,
)



In [18]:
# Concatenating train dataframe with polynomial features (column-wise, aligned on the row index)
# NOTE(review): the result is bound back to the same name, so running this
# cell a second time appends the polynomial columns again
concat_train_df = pd.concat([concat_train_df,poly_features], axis=1)

### Modelling and Evaluation

In [19]:
def fit_model(concat_train_df, test_size=.3, rand_state=42):
    '''
    Train a class-balanced random forest and report its accuracy on a
    train/test split of the given data.

    INPUT:
    concat_train_df - a dataframe holding all the variables of interest; it
                      must contain the 'TARGET' label column and the
                      'SK_ID_CURR' column, neither of which is used as a feature
    test_size - a float between [0,1] about what proportion of data should
                be in the test dataset
    rand_state - an int that is provided as the random state for splitting
                 the data into training and test, and for the random forest,
                 so repeated runs give the same scores

    OUTPUT:
    accuracy_train - float - accuracy score on the training data
    accuracy_test - float - accuracy score on the test data
    '''

    # Separate feature and label
    X = np.array(concat_train_df.drop(columns=['TARGET', 'SK_ID_CURR']))
    y = np.array(concat_train_df['TARGET'].astype(int))

    # Split into train and test before any fitting takes place
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,
                                                        random_state=rand_state)

    # Normalize the data. The scaler learns its min/max from the training rows
    # only and is then applied to the test rows, so no information about the
    # held-out data leaks into preprocessing.
    scaler = MinMaxScaler(feature_range=(0, 1))
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Seed the forest as well; without it every run produces different scores
    model = RandomForestClassifier(class_weight='balanced', random_state=rand_state)
    model.fit(X_train, y_train)

    # Predict and score the model
    y_train_preds = model.predict(X_train)
    y_test_preds = model.predict(X_test)

    accuracy_train = accuracy_score(y_train, y_train_preds)
    accuracy_test = accuracy_score(y_test, y_test_preds)

    return accuracy_train, accuracy_test

# Fit the model on the prepared training dataframe (default test_size and
# rand_state) and keep both accuracy scores for reporting
accuracy_train, accuracy_test = fit_model(concat_train_df)



In [None]:
# Print the train and test accuracy returned by fit_model
print("The accuracy on train data is: ", accuracy_train)
print("The accuracy on test data is: ", accuracy_test)

### Reading other datasets

In [None]:
# Bureau dataframe, read from bureau.csv
# NOTE(review): absolute, machine-specific path - consider a configurable data directory
bureau_df = pd.read_csv("/home/jyothish/Projects/Home-Credit-Default-Risk-Prediction/data/home-credit-default-risk_data/bureau.csv")
# Preview the first rows
bureau_df.head()

In [None]:
# Bureau Balance dataframe, read from bureau_balance.csv
# NOTE(review): absolute, machine-specific path - consider a configurable data directory
bureau_balance_df = pd.read_csv("/home/jyothish/Projects/Home-Credit-Default-Risk-Prediction/data/home-credit-default-risk_data/bureau_balance.csv")
# Preview the first rows
bureau_balance_df.head()

In [None]:
# Credit Card Balance dataframe, read from credit_card_balance.csv
# NOTE(review): absolute, machine-specific path - consider a configurable data directory
credit_card_balance_df = pd.read_csv("/home/jyothish/Projects/Home-Credit-Default-Risk-Prediction/data/home-credit-default-risk_data/credit_card_balance.csv")
# Preview the first rows
credit_card_balance_df.head()

In [None]:
# HomeCredit Columns Description dataframe, read from HomeCredit_columns_description.csv
# NOTE(review): absolute, machine-specific path - consider a configurable data directory
HomeCredit_columns_description_df = pd.read_csv("/home/jyothish/Projects/Home-Credit-Default-Risk-Prediction/data/home-credit-default-risk_data/HomeCredit_columns_description.csv")
# Preview the first rows
HomeCredit_columns_description_df.head()

In [None]:
# Installments Payments dataframe, read from installments_payments.csv
# NOTE(review): absolute, machine-specific path - consider a configurable data directory
installments_payments_df = pd.read_csv("/home/jyothish/Projects/Home-Credit-Default-Risk-Prediction/data/home-credit-default-risk_data/installments_payments.csv")
# Preview the first rows
installments_payments_df.head()

In [None]:
# POS_CASH Balance dataframe, read from POS_CASH_balance.csv
# NOTE(review): absolute, machine-specific path - consider a configurable data directory
POS_CASH_balance_df = pd.read_csv("/home/jyothish/Projects/Home-Credit-Default-Risk-Prediction/data/home-credit-default-risk_data/POS_CASH_balance.csv")
# Preview the first rows
POS_CASH_balance_df.head()

In [None]:
# Previous Application dataframe, read from previous_application.csv
# NOTE(review): absolute, machine-specific path - consider a configurable data directory
previous_application_df = pd.read_csv("/home/jyothish/Projects/Home-Credit-Default-Risk-Prediction/data/home-credit-default-risk_data/previous_application.csv")
# Preview the first rows
previous_application_df.head()