In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
# Silence every warning category for cleaner notebook output. This blanket
# filter already covers DeprecationWarning, so no per-category filter is needed.
# NOTE(review): this also hides pandas FutureWarnings about removed APIs.
warnings.filterwarnings("ignore")
sns.set(style="darkgrid")

# Display options. The fully qualified 'display.precision' key is used because
# the bare 'precision' pattern is ambiguous on newer pandas (it also matches
# 'styler.format.precision') and raises OptionError there.
pd.set_option('display.precision', 4)
# A single width setting: the earlier value of 200 was immediately overridden.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 200)
pd.set_option('display.notebook_repr_html', True)

In [3]:
def display_all(df):
    """Render ``df`` with the row and column display limits raised to 1000.

    The limits are only changed for the duration of the call; the previous
    pandas display options are restored afterwards.
    """
    wide_view = pd.option_context("display.max_rows", 1000,
                                  "display.max_columns", 1000)
    with wide_view:
        display(df)

In [4]:
def get_nullframe(dataset):
    """Summarise the columns of ``dataset`` that contain missing values.

    Returns a DataFrame with one row per column that has at least one null:
    ``feature`` (column name), ``count`` (number of nulls) and
    ``null_percent`` (nulls as a percentage of the row count), sorted by
    ``count`` in descending order. Columns without nulls are omitted.
    """
    nulls_per_column = dataset.isnull().sum()
    null_sum = nulls_per_column[nulls_per_column > 0]
    row_total = dataset.shape[0]

    summary = pd.DataFrame({
        'feature': null_sum.index,
        'count': null_sum.values,
        'null_percent': (null_sum.values / row_total) * 100,
    })
    return summary.sort_values(by=['count'], ascending=False)

In [5]:
# Load the training and test CSVs. Both paths are relative, so the files are
# expected in the notebook's working directory.
train = pd.read_csv("train_u6lujuX_CVtuZ9i.csv")
test = pd.read_csv("test_Y3wMUE5_7gLdaTN.csv")

In [6]:
# Keep the loan identifiers of both frames and the training target as
# standalone Series before the frames are modified further down.
train_Id, test_Id = train['Loan_ID'], test['Loan_ID']
train_y = train['Loan_Status']

In [7]:
# Preview the first rows of the raw training frame.
train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [8]:
# Columns imputed with their own most frequent value.
# 'Married' was previously filled with the mode of 'Gender', which wrote the
# value 'Male' into the 'Married' column; every column now uses its own mode.
mode_filled_columns = ['Credit_History', 'Self_Employed', 'Dependents',
                       'Gender', 'Married']

# Each frame is imputed from its own statistics, as before.
for frame in (train, test):
    for column in mode_filled_columns:
        # Assign the result back instead of calling fillna(inplace=True) on a
        # column selection: the chained in-place form does not update the
        # parent frame under pandas Copy-on-Write.
        frame[column] = frame[column].fillna(frame[column].mode()[0])

    # The loan term is numeric, so it is filled with the median instead.
    frame['Loan_Amount_Term'] = frame['Loan_Amount_Term'].fillna(
        frame['Loan_Amount_Term'].median())


In [9]:
# Median LoanAmount of the training data for every
# (Self_Employed, Education) combination; used as the imputation lookup for
# both frames. The string 'median' keeps pandas' NaN-skipping group median,
# whereas passing np.median is slated to call the function directly.
table = train.pivot_table(values='LoanAmount', index='Self_Employed',
                          columns='Education', aggfunc='median')


def fage(x):
    """Return the median LoanAmount from ``table`` for row ``x``.

    ``x`` is a row with 'Self_Employed' and 'Education' entries; a KeyError
    is raised if that combination is not present in ``table``.
    """
    return table.loc[x['Self_Employed'], x['Education']]


def _fill_missing_loan_amount(frame):
    """Fill null 'LoanAmount' values of ``frame`` in place using ``fage``."""
    missing = frame['LoanAmount'].isnull()
    # Nothing to do when no value is missing. Without this guard a re-run of
    # the cell fails, because apply(axis=1) over zero rows can return an
    # empty DataFrame, which is not a valid fill value.
    if missing.any():
        frame.loc[missing, 'LoanAmount'] = frame.loc[missing].apply(fage, axis=1)


# Replace missing values
_fill_missing_loan_amount(train)
_fill_missing_loan_amount(test)

In [10]:
# List the training columns that still contain nulls after imputation;
# an empty frame means none remain.
train_null = get_nullframe(train)
train_null

Unnamed: 0,feature,count,null_percent


In [11]:
# Stack train on top of test with a fresh 0..n-1 index. pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0. sort=True keeps the
# alphabetical column order that append produced for frames whose columns
# differ (test has no 'Loan_Status', so those rows hold NaN there).
combi = pd.concat([train, test], ignore_index=True, sort=True)

# Cast these columns to strings so they are treated as categorical values
# rather than numbers.
combi = combi.astype({'Credit_History': str,
                      'Loan_Amount_Term': str,
                      'Dependents': str})

# Keep the identifiers aside, then drop them from the feature frame.
save_loan_id = combi['Loan_ID']
combi = combi.drop(columns=['Loan_ID'])

In [12]:
# Preview the first rows of the combined train + test frame.
combi.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,Credit_History,Dependents,Education,Gender,LoanAmount,Loan_Amount_Term,Loan_Status,Married,Property_Area,Self_Employed
0,5849,0.0,1.0,0,Graduate,Male,130.0,360.0,Y,No,Urban,No
1,4583,1508.0,1.0,1,Graduate,Male,128.0,360.0,N,Yes,Rural,No
2,3000,0.0,1.0,0,Graduate,Male,66.0,360.0,Y,Yes,Urban,Yes
3,2583,2358.0,1.0,0,Not Graduate,Male,120.0,360.0,Y,Yes,Urban,No
4,6000,0.0,1.0,0,Graduate,Male,141.0,360.0,Y,No,Urban,No


In [13]:
# Show dtypes and non-null counts of the combined frame.
combi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 981 entries, 0 to 980
Data columns (total 12 columns):
ApplicantIncome      981 non-null int64
CoapplicantIncome    981 non-null float64
Credit_History       981 non-null object
Dependents           981 non-null object
Education            981 non-null object
Gender               981 non-null object
LoanAmount           981 non-null float64
Loan_Amount_Term     981 non-null object
Loan_Status          614 non-null object
Married              981 non-null object
Property_Area        981 non-null object
Self_Employed        981 non-null object
dtypes: float64(2), int64(1), object(9)
memory usage: 92.0+ KB


In [14]:
# NOTE(review): imports conventionally live in the notebook's top import cell;
# consider moving this one there.
import featuretools as ft
# Create an EntitySet and register the combined frame as its only entity, 'data'.
es = ft.EntitySet(id = 'loan_prediction')
# make_index=True with index='index' asks featuretools to create the index
# column itself, so the entity reports 13 columns versus the 12 in combi.
# NOTE(review): entity_from_dataframe looks like the pre-1.0 featuretools API;
# confirm the pinned featuretools version before upgrading.
es.entity_from_dataframe(entity_id = 'data', dataframe = combi, 
                         make_index = True, index = 'index')

Entityset: loan_prediction
  Entities:
    data [Rows: 981, Columns: 13]
  Relationships:
    No relationships

In [35]:
# Aggregation primitives passed to ft.dfs; each primitive is listed once
# ('count' previously appeared twice).
agg_primitives = ['sum', 'max', 'min', 'mean',
                  'count', 'percent_true', 'num_unique', 'mode']
# Transform primitives passed to ft.dfs.
trans_primitives = ['percentile']

In [36]:
# Run Deep Feature Synthesis on the 'data' entity with the primitive lists
# defined above, limited to depth 1. Returns the generated feature matrix
# together with the definitions of the features it contains.
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity='data',
    agg_primitives=agg_primitives,
    trans_primitives=trans_primitives,
    max_depth=1,
)

In [37]:
# Preview the first rows of the generated feature matrix.
feature_matrix.head()

Unnamed: 0_level_0,ApplicantIncome,CoapplicantIncome,Credit_History,Dependents,Education,Gender,LoanAmount,Loan_Amount_Term,Loan_Status,Married,Property_Area,Self_Employed,PERCENTILE(ApplicantIncome),PERCENTILE(CoapplicantIncome),PERCENTILE(LoanAmount)
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,5849,0.0,1.0,0,Graduate,Male,130.0,360.0,Y,No,Urban,No,0.7839,0.2192,0.5336
1,4583,1508.0,1.0,1,Graduate,Male,128.0,360.0,N,Yes,Rural,No,0.6412,0.5576,0.5071
2,3000,0.0,1.0,0,Graduate,Male,66.0,360.0,Y,Yes,Urban,Yes,0.2854,0.2192,0.0683
3,2583,2358.0,1.0,0,Not Graduate,Male,120.0,360.0,Y,Yes,Urban,No,0.1901,0.7482,0.4271
4,6000,0.0,1.0,0,Graduate,Male,141.0,360.0,Y,No,Urban,No,0.7926,0.2192,0.6361


In [38]:
# Report the (rows, columns) of the generated feature matrix.
feature_matrix.shape

(981, 15)