In [1]:
# Importing the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the merged test set; Agreement_ID and the categorical columns are taken from this frame below
Test = pd.read_csv("Test_merged.csv")

In [3]:
# Inspect column names, dtypes and non-null counts of the loaded frame
Test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13165 entries, 0 to 13164
Data columns (total 51 columns):
Unnamed: 0               13165 non-null int64
Agreement_ID             13165 non-null int64
Foreclosure              13165 non-null float64
Customer_ID              13165 non-null int64
MOB                      13165 non-null int64
Loan_Amt                 13165 non-null object
NET_DISBURSED_AMT        13165 non-null object
Interest_Start_Date      13165 non-null object
Current_ROI              13165 non-null float64
Original_ROI             13165 non-null float64
Current_Tenure           13165 non-null float64
Original_Tenure          13165 non-null int64
Due_Day                  13165 non-null int64
Authorization_Date       13165 non-null object
City                     13165 non-null object
Pre_EMI_Due_Amt          13165 non-null float64
Pre_EMI_Received_Amt     13165 non-null float64
PRE_EMI_OS_AMOUNT        13165 non-null float64
EMI_Due_Amt              13165 non-null floa

In [4]:
from sklearn import preprocessing


# The label encoding itself is applied in a later cell; this cell only gathers candidate columns.

# Select every object-dtype column of Test and preview the first rows.
# Note that this also picks up amounts and dates stored as text, not only true categoricals.
df_categorical = Test.select_dtypes(include=['object'])
df_categorical.head()

Unnamed: 0,Loan_Amt,NET_DISBURSED_AMT,Interest_Start_Date,Authorization_Date,City,Last_Receipt_Date,Product,NPA_In_Last_Month,NPA_In_Current_Month,Gender,Marital_Status,Qualification
0,4053498.66,4053498.66,06-Sep-10,06-Sep-10,MUMBAI,05-Dec-17,HL,,,M,M,GRAD
1,13257031.92,13257031.92,29-Sep-10,29-Sep-10,MUMBAI,05-Nov-13,HL,,,M,M,GRAD
2,13211403.04,13211403.04,06-Sep-10,06-Sep-10,MUMBAI,05-Jul-14,HL,,,M,M,GRAD
3,6155312.78,6155312.78,11-Oct-10,11-Oct-10,MUMBAI,05-Aug-14,HL,,,M,M,UG
4,9499284.03,9499284.03,10-Nov-10,05-Nov-10,MUMBAI,11-Jun-14,LAP,,,M,M,GRAD


In [5]:
# Load the feature frame that is later joined column-wise with the encoded categoricals.
# NOTE(review): the join is positional (by index), so this file is assumed to have the
# same rows in the same order as Test_merged.csv -- confirm.
TestX_merged = pd.read_csv("TestX_merged.csv")

In [6]:
# Keep only the categorical columns that are fed to the model.
# .copy() makes this an independent frame: the following cells assign new values into its
# columns, and writing into a selection of another frame can raise SettingWithCopyWarning.
df_categorical = df_categorical[['City','Product','Gender','Marital_Status','Qualification']].copy()

In [7]:
# Cast Gender to string so the column holds a single type before label encoding
df_categorical['Gender'] = df_categorical['Gender'].astype('str')

In [8]:
# Cast Marital_Status to string so the column holds a single type before label encoding
df_categorical['Marital_Status'] = df_categorical['Marital_Status'].astype('str')

In [9]:
# Cast Qualification to string so the column holds a single type before label encoding
df_categorical['Qualification'] = df_categorical['Qualification'].astype('str')

In [10]:
# Apply a label encoder to every column of df_categorical.
# The single encoder instance is re-fit on each column in turn, so the integer codes are
# derived from the values present in this test data only.
# NOTE(review): the pickled model loaded below was presumably trained on codes fit on the
# training data; if the category sets differ, the same code can mean a different category
# here. Confirm, or reuse the encoders saved at training time.

le = preprocessing.LabelEncoder()
df_categorical = df_categorical.apply(le.fit_transform)
df_categorical.head()

Unnamed: 0,City,Product,Gender,Marital_Status,Qualification
0,134,0,1,0,2
1,134,0,1,0,2
2,134,0,1,0,2
3,134,0,1,0,7
4,134,1,1,0,2


In [11]:
# Join the feature frame and the encoded categoricals column-wise (rows are matched on the index)
Test_data = pd.concat([TestX_merged, df_categorical],axis = 1)

In [12]:
# Drop the first column by position.
# NOTE(review): this cell is not idempotent -- each re-run removes one more column, so re-run
# the concat cell first. The dropped column is presumably an index column saved in the CSV; confirm.
Test_data = Test_data.iloc[:,1:]

In [13]:
# Preview the first 100 rows of the assembled model input
Test_data.head(100)

Unnamed: 0,Net_Disbursed_Amt,ROI_Change_Ratio,Tenure_Ratio,Tenure_Change_Ratio,Foreclosure,Net_LTV,City,Product,Gender,Marital_Status,Qualification
0,40.53,-0.32,0.28,-0.73,0.086017,79.34,134,0,1,0,2
1,132.57,-0.43,0.19,-0.10,0.062022,86.86,134,0,1,0,2
2,132.11,-0.39,0.32,0.21,0.122249,40.93,134,0,1,0,2
3,61.55,-0.32,0.14,-0.46,0.077855,45.32,134,0,1,0,7
4,94.99,-0.39,0.79,0.71,0.371909,52.34,134,1,1,0,2
5,54.01,-0.21,0.27,-0.19,0.241502,84.15,193,0,1,0,2
6,43.09,-0.34,0.14,-0.69,0.058800,70.65,134,1,1,0,5
7,76.89,-0.39,1.00,0.67,0.462677,34.78,134,0,1,0,7
8,172.52,-0.26,0.27,-0.30,0.158624,22.22,134,0,1,0,2
9,113.75,0.05,0.75,-0.07,0.846144,54.29,134,1,1,0,2


In [14]:
import pickle

filename = 'edelweiss_random_forests_model.sav'

# SECURITY NOTE: unpickling can execute arbitrary code -- only load model files from a trusted source.
# The context manager closes the file handle even if unpickling raises
# (the bare open() call previously left it open).
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Class probabilities for every test row; the columns are passed in this explicit order.
y_pred = loaded_model.predict_proba(Test_data[['Net_Disbursed_Amt','ROI_Change_Ratio','Tenure_Ratio',
'Tenure_Change_Ratio','Net_LTV','City','Product','Gender','Marital_Status','Qualification']])

In [15]:
# Wrap the predict_proba output array in a DataFrame (columns are labelled 0, 1, ...)
y_pred_df = pd.DataFrame(y_pred)

In [16]:
# Keep only column 1, as a one-column DataFrame (the list indexer keeps it two-dimensional).
# NOTE(review): assumes column 1 is the foreclosure (positive) class -- confirm against loaded_model.classes_
y_pred_1 = y_pred_df.iloc[:,[1]]

In [17]:
# Summary statistics of the predicted probabilities
y_pred_1.describe()

Unnamed: 0,1
count,13165.0
mean,0.09087
std,0.163927
min,0.0
25%,0.003209
50%,0.016602
75%,0.08508
max,0.833198


In [18]:
# Take Agreement_ID from Test as a one-column DataFrame, used to label each prediction row
y_test_df = pd.DataFrame(Test['Agreement_ID'])

In [19]:
# Display the agreement IDs
y_test_df

Unnamed: 0,Agreement_ID
0,11220003
1,11220004
2,11220005
3,11220007
4,11220009
5,11220013
6,11220015
7,11220018
8,11220019
9,11220023


In [20]:
# Give both frames a fresh 0..n-1 index so they line up row by row when placed side by side.
# Reassigning (instead of inplace=True) avoids mutating a frame that was derived from an
# iloc selection, which can trigger SettingWithCopyWarning, and keeps the cell safe to re-run.
y_pred_1 = y_pred_1.reset_index(drop=True)
y_test_df = y_test_df.reset_index(drop=True)

In [21]:
# Place y_test_df and y_pred_1 side by side (column-wise concat, rows matched on the index)
y_pred_final = pd.concat([y_test_df,y_pred_1],axis=1)

In [22]:
# Rename the probability column from its integer label 1 to 'Foreclosure_Prob'
y_pred_final= y_pred_final.rename(columns={ 1 : 'Foreclosure_Prob'})

In [23]:
# Rearranging the columns.
# DataFrame.reindex_axis was deprecated in pandas 0.21 and removed in 1.0;
# reindex(columns=...) is the equivalent call and works on old and new versions.
# NOTE(review): 'Foreclosure' is not a column of y_pred_final at this point, so reindexing
# creates it filled with NaN; it is discarded again by the column selection further down.
y_pred_final = y_pred_final.reindex(columns=['Agreement_ID','Foreclosure','Foreclosure_Prob'])

In [24]:
# Creating new column 'predicted' with 1 if Foreclosure_Prob > 0.5 else 0.
# Vectorized comparison instead of a per-element lambda; a NaN probability compares False
# and therefore maps to 0, exactly as the lambda did.
y_pred_final['predicted'] = (y_pred_final['Foreclosure_Prob'] > 0.5).astype('int64')

In [25]:
# Keep only the ID and the predicted probability; the other columns are dropped here
y_pred_final = y_pred_final[['Agreement_ID', 'Foreclosure_Prob']]

In [26]:
# Preview the first 20 rows of y_pred_final
y_pred_final.head(20)

Unnamed: 0,Agreement_ID,Foreclosure_Prob
0,11220003,0.43991
1,11220004,0.548903
2,11220005,0.540681
3,11220007,0.63991
4,11220009,0.047519
5,11220013,0.450658
6,11220015,0.046048
7,11220018,0.536243
8,11220019,0.459155
9,11220023,0.029221


In [27]:
# Check row count, dtypes and non-null counts of the prediction frame
y_pred_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13165 entries, 0 to 13164
Data columns (total 2 columns):
Agreement_ID        13165 non-null int64
Foreclosure_Prob    13165 non-null float64
dtypes: float64(1), int64(1)
memory usage: 205.8 KB


In [28]:
# Rename the key column to AGREEMENTID, the name used as the merge key with test_final below
y_pred_final = y_pred_final.rename(columns={"Agreement_ID":"AGREEMENTID"})

In [29]:
# Load test_foreclosure.csv; its AGREEMENTID rows are the ones the predictions are merged onto
test_final = pd.read_csv("test_foreclosure.csv")

In [30]:
# Check row count, dtypes and non-null counts of the loaded frame
test_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13342 entries, 0 to 13341
Data columns (total 2 columns):
AGREEMENTID    13342 non-null int64
FORECLOSURE    13165 non-null float64
dtypes: float64(1), int64(1)
memory usage: 208.5 KB


In [31]:
# Left-join the predictions onto test_final by AGREEMENTID, keeping every test_final row.
# NOTE(review): test_final has more rows than y_pred_final, so agreements without a
# prediction get NaN in Foreclosure_Prob -- confirm that this is acceptable downstream.
test = pd.merge(test_final, y_pred_final, how='left', on='AGREEMENTID')

In [32]:
# Overwrite the FORECLOSURE column with the predicted probability
test['FORECLOSURE'] =test['Foreclosure_Prob']

In [33]:
# Drop the helper column now that its values have been copied into FORECLOSURE
test = test.drop('Foreclosure_Prob',axis = 1)

In [34]:
# Write the predictions file.
# index=False keeps the output to the AGREEMENTID / FORECLOSURE columns; without it the
# RangeIndex is written as an extra unnamed first column (seen as "Unnamed: 0" when read back).
test.to_csv('random_forests_prediction.csv', index=False)