In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier

# `train_test_split` and `GridSearchCV` live in `sklearn.model_selection` since
# scikit-learn 0.18; the old `cross_validation` / `grid_search` modules were
# removed in 0.20. Prefer the current location and fall back to the legacy
# modules only on very old installs.
try:
    from sklearn.model_selection import GridSearchCV, train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import GridSearchCV


# NOTE(review): the locations below are absolute paths on the original author's
# machine; adjust them to wherever the competition CSVs live before running.
# NOTE(review): `submit_loc` is defined here but the final cell writes to
# 'finalsub.csv' instead - confirm which output path is intended.
submit_loc = r"C:\Users\Kushagra Jalota\sub.csv"

train_loc = r"C:\Users\Kushagra Jalota\Loan-Prediction-Classification-master\claim_trainnew.csv"
test_loc = r"C:\Users\Kushagra Jalota\Loan-Prediction-Classification-master\claim_testnew.csv"

# Load the raw train and test sets.
train = pd.read_csv(train_loc)
test = pd.read_csv(test_loc)



In [2]:
#Create dummy variables then shuffle the train set.

# Categorical columns to one-hot encode.
cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'Credit_History', 'Property_Area']

# Each frame is encoded on its own, so a category that is absent from one of
# them produces no dummy column there.
train = pd.get_dummies(train, columns = cols)
test = pd.get_dummies(test, columns = cols)

# Shuffle the training rows. The seed is fixed so that a fresh
# "Restart & Run All" yields the same row order (and the same downstream
# results) every time.
train = train.sample(frac=1, random_state=42).reset_index(drop=True)

In [3]:
#Set the target variable.

# Target labels, taken from the training frame before that column is dropped.
y = train['Loan_Status']

# Show the column names after one-hot encoding.
train.columns

Index(['Loan_ID', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Loan_Status', 'Gender_Female', 'Gender_Male',
       'Gender_Unknown', 'Married_No', 'Married_Unknown', 'Married_Yes',
       'Dependents_0', 'Dependents_1', 'Dependents_2', 'Dependents_3',
       'Dependents_Unknown', 'Education_Graduate', 'Education_Not Graduate',
       'Self_Employed_No', 'Self_Employed_Unknown', 'Self_Employed_Yes',
       'Credit_History_No', 'Credit_History_Unknown', 'Credit_History_Yes',
       'Property_Area_Rural', 'Property_Area_Semiurban',
       'Property_Area_Urban'],
      dtype='object')

In [4]:
#Drop the unnecessary columns and prepare the data for our submission file.

# Keep the test-set IDs aside; they become the first column of the submission.
submit = test['Loan_ID']

# Remove the identifier and the target, then cast the remaining features to float32.
train = train.drop(['Loan_ID', 'Loan_Status'], axis='columns')
X = train.astype('float32')

In [5]:
#Do the same for the test set. pd.get_dummies() only creates a column for a category it
#actually sees; unlike the train set there was no unknown data in the married column on the
#test set, so no Married_Unknown column was created there.
#Instead of inserting that one column at a hard-coded position, reindex the test set against
#the training features: every column of X that is missing from the test set is added and
#filled with 0, columns not present in X are dropped, and the column order matches X exactly.

# errors='ignore' keeps this cell re-runnable after Loan_ID has already been dropped.
test = test.drop('Loan_ID', axis=1, errors='ignore')
test = test.reindex(columns=X.columns, fill_value=0)


In [6]:
#Use Scikit-Learn's train_test_split function to create train and validation sets.

# 80% of the rows go to training, the rest to validation. The seed is fixed so
# the same rows land in the validation set on every run.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=42)

In [7]:
#Create a function to run a gradient boosted classifier over our data. The grid search is
#wrapped in a function guarded by `if __name__ == '__main__'` because of the way Python
#handles parallelization on Windows.


#Note: numerous different values were tried in the param_grid to home in on the best parameter
#combinations. The param grid below is the final one I used.

def model(X_train, X_val, y_train, y_val):
    """Grid-search a gradient boosted classifier and report validation accuracy.

    Runs ``GridSearchCV`` over a ``GradientBoostingClassifier`` on the training
    split using all available cores (``n_jobs=-1``), prints the accuracy of the
    refitted best estimator on the validation split, and returns the winning
    parameter combination.

    Parameters
    ----------
    X_train, y_train
        Features and labels the grid search is fitted on.
    X_val, y_val
        Held-out features and labels used only for the printed score.

    Returns
    -------
    dict or None
        The best parameter combination found by the search, or ``None`` when
        the code is not running as ``__main__`` (nothing is executed then).
    """
    # Do nothing unless running as the main module (see the Windows
    # parallelization note above the function).
    if __name__ != '__main__':
        return None

    search_space = {
        'learning_rate': [0.03, 0.035],
        'max_depth': [3, 4, 5],
        'min_samples_leaf': [17, 18],
        'max_features': [1.0, 0.95, 0.9],
        'n_estimators': [100, 300, 500],
    }

    grid_search = GridSearchCV(
        estimator=GradientBoostingClassifier(),
        param_grid=search_space,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    winning_params = grid_search.best_params_

    # Score the best estimator found by the search on the held-out split.
    print('Validation accuracy: ', grid_search.score(X_val, y_val))

    return winning_params

In [8]:
#params that appeared most often after running the model ten times.
    
# Final hyper-parameters passed to the classifier below.
params = dict(
    min_samples_leaf=17,
    max_features=0.95,
    max_depth=3,
    learning_rate=0.03,
    n_estimators=500,
)

In [9]:
#Fit model using our data and the best parameters found by GridSearchCV.

# NOTE: this rebinds the name `model`. Earlier in the notebook `model` is the
# grid-search helper function; from this cell on it is the classifier, so the
# helper can no longer be called without re-running its defining cell.
model = GradientBoostingClassifier(**params)
# Fit on the full training data (X, y), not on the X_train/X_val split.
# As the last expression of the cell, the returned estimator is displayed.
model.fit(X, y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.03, loss='deviance', max_depth=3,
              max_features=0.95, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=17,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=500, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

In [10]:
#Make predictions on the test set.

# `test` is passed as-is; its columns are expected to line up with the columns
# of X that the model was fitted on - verify after any change to the encoding.
preds = model.predict(test)

In [13]:
#Create submission file.

preds = pd.Series(preds)

# Pair each Loan_ID with its prediction by position. Using the raw values
# avoids index alignment (which would silently introduce NaN rows if the two
# indexes ever differed) and drops the `names=` argument to pd.concat, which
# only labels hierarchical index levels and had no effect here.
submit = pd.DataFrame({'Loan_ID': submit.values, 'Loan_Status': preds.values},
                      columns=['Loan_ID', 'Loan_Status'])

In [15]:
#Create CSV file for submission.

# Write the submission to 'finalsub.csv' (a relative path, so it lands in the
# current working directory) without the index column.
# NOTE(review): `submit_loc` from the first cell is never used - confirm which
# output path is intended.
submit.to_csv('finalsub.csv', index=False)

In [None]:
#Here's how the model performed on the leaderboard.
#score 0.798611, position 33/413. 

#Note - model submitted on 09/03/2017.