# preprocessing of data set

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

train = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/train.csv")
test = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/test.csv")

In [2]:
#Combining both train and test dataset

train['Type']='Train' #Create a flag for Train and Test Data set
test['Type']='Test'
fullData = pd.concat([train,test],axis=0)

#Look at the available missing values in the dataset
fullData.isnull().sum()

Loan_ID                0
Gender                24
Married                3
Dependents            25
Education              0
Self_Employed         55
ApplicantIncome        0
CoapplicantIncome      0
LoanAmount            27
Loan_Amount_Term      20
Credit_History        79
Property_Area          0
Loan_Status          367
Type                   0
dtype: int64

In [3]:
fullData['LoanAmount'].fillna(fullData['LoanAmount'].mean(),inplace=True)
fullData['ApplicantIncome'].fillna(fullData['ApplicantIncome'].mean(),inplace=True)
fullData['Loan_Amount_Term'].fillna(fullData['Loan_Amount_Term'].mean(),inplace=True)
fullData['CoapplicantIncome'].fillna(fullData['CoapplicantIncome'].mean(),inplace=True)

fullData['Credit_History'].fillna(fullData['Credit_History'].mode().values[0],inplace=True)
fullData['Gender'].fillna(fullData['Gender'].mode().values[0],inplace=True)
fullData['Self_Employed'].fillna(fullData['Self_Employed'].mode().values[0],inplace=True)
fullData['Dependents'].fillna(fullData['Dependents'].mode().values[0],inplace=True)
fullData['Married'].fillna(fullData['Married'].mode().values[0],inplace=True)

In [4]:
#Look at the available missing values in the dataset
fullData.isnull().sum()

Loan_ID                0
Gender                 0
Married                0
Dependents             0
Education              0
Self_Employed          0
ApplicantIncome        0
CoapplicantIncome      0
LoanAmount             0
Loan_Amount_Term       0
Credit_History         0
Property_Area          0
Loan_Status          367
Type                   0
dtype: int64

In [5]:
#Identify categorical and continuous variables
ID_col = ['Loan_ID']
target_col = ["Loan_Status"]
cat_cols = ['Credit_History','Dependents','Gender','Married','Education','Property_Area','Self_Employed']

other_col=['Type'] #Test and Train Data set identifier
num_cols= list(set(list(fullData.columns))-set(cat_cols)-set(ID_col)-set(target_col)-set(other_col))

In [6]:
#SKIP due to error
#Imputing Missing values with mean for continuous variable
#fullData[num_cols] = fullData[num_cols].fillna(fullData[num_cols].mean(),inplace=True)

In [7]:
#SKIP due to error
#Imputing Missing values with mode for categorical variables
#cat_imput=pd.Series(fullData[cat_cols].mode().values[0])
#cat_imput.index=cat_cols
#fullData[cat_cols] = fullData[cat_cols].fillna(cat_imput,inplace=True)

In [8]:
#Create a new column as Total Income

fullData['TotalIncome']=fullData['ApplicantIncome']+fullData['CoapplicantIncome']

#Take a log of TotalIncome + 1, adding 1 to deal with zeros of TotalIncome it it exists
fullData['Log_TotalIncome']=np.log(fullData['TotalIncome'])


In [19]:
#create label encoders for categorical features
for var in cat_cols:
    number = LabelEncoder()
    fullData[var] = number.fit_transform(fullData[var].astype('str'))

train_modified=fullData[fullData['Type']=='Train']
test_modified=fullData[fullData['Type']=='Test']
train_modified["Loan_Status"] = number.fit_transform(train_modified["Loan_Status"].astype('str'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_modified["Loan_Status"] = number.fit_transform(train_modified["Loan_Status"].astype('str'))


# Building Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression


predictors=['Credit_History','Education','Gender']

x_train = train_modified[list(predictors)].values
y_train = train_modified["Loan_Status"].values

x_test=test_modified[list(predictors)].values

In [11]:
# Create logistic regression object
model = LogisticRegression()

# Train the model using the training sets
model.fit(x_train, y_train)

#Predict Output
predicted= model.predict(x_test)

#Reverse encoding for predicted outcome
predicted = number.inverse_transform(predicted)

#Store it to test dataset
test_modified['Loan_Status']=predicted

#Output file to make submission
test_modified.to_csv("Submission1.csv",columns=['Loan_ID','Loan_Status'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_modified['Loan_Status']=predicted


# Building Decision Tree Classifier

In [12]:
predictors=['Credit_History','Education','Gender']

x_train = train_modified[list(predictors)].values
y_train = train_modified["Loan_Status"].values

x_test=test_modified[list(predictors)].values

In [13]:
from sklearn.tree import DecisionTreeClassifier

# Create Decision Tree object
model = DecisionTreeClassifier()

# Train the model using the training sets
model.fit(x_train, y_train)

#Predict Output
predicted= model.predict(x_test)

#Reverse encoding for predicted outcome
predicted = number.inverse_transform(predicted)

#Store it to test dataset
test_modified['Loan_Status']=predicted

#Output file to make submission
test_modified.to_csv("Submission2.csv",columns=['Loan_ID','Loan_Status'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_modified['Loan_Status']=predicted


# Building Random Forest Classifier

In [14]:
from sklearn.linear_model import LogisticRegression


predictors=['ApplicantIncome', 'CoapplicantIncome', 'Credit_History','Dependents', 'Education', 'Gender', 'LoanAmount',
            'Loan_Amount_Term', 'Married', 'Property_Area', 'Self_Employed', 'TotalIncome','Log_TotalIncome']

x_train = train_modified[list(predictors)].values
y_train = train_modified["Loan_Status"].values

x_test=test_modified[list(predictors)].values

In [15]:
from sklearn.ensemble import RandomForestClassifier

# Create Decision Tree object
model = RandomForestClassifier()

# Train the model using the training sets
model.fit(x_train, y_train)

#Predict Output
predicted= model.predict(x_test)

#Reverse encoding for predicted outcome
predicted = number.inverse_transform(predicted)

#Store it to test dataset
test_modified['Loan_Status']=predicted

#Output file to make submission
test_modified.to_csv("Submission3.csv",columns=['Loan_ID','Loan_Status'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_modified['Loan_Status']=predicted


In [17]:
#Create a series with feature importances:
featimp = pd.Series(model.feature_importances_, index=predictors).sort_values(ascending=False)
print(featimp)

Credit_History       0.255369
Log_TotalIncome      0.131075
ApplicantIncome      0.129391
LoanAmount           0.122530
TotalIncome          0.119982
CoapplicantIncome    0.072647
Loan_Amount_Term     0.035951
Dependents           0.035740
Property_Area        0.033628
Married              0.018832
Education            0.017213
Gender               0.014338
Self_Employed        0.013306
dtype: float64


In [None]:
number = LabelEncoder()
train['Gender'] = number.fit_transform(train['Gender'].astype('str'))

In [None]:
train.Gender