In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import ttest_ind, ttest_1samp, chi2
from scipy.stats import chi2_contingency

In [3]:
data = pd.read_csv("loan.csv")

In [4]:
data.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [7]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [10]:
data["Gender"].unique()

array(['Male', 'Female', nan], dtype=object)

In [19]:
data["Education"].unique()

array(['Graduate', 'Not Graduate'], dtype=object)

In [8]:
data.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

***
**Que1** We believe that the loan.csv data shows that unmarried men are in a different income group than both married and unmarried women.

To prove this, would a t-test be more appropriate or a chi-square test?

In [18]:
# unmarried men
Income_um = data[(data["Gender"] == "Male") & (data["Married"] == "No")]
Income_um = Income_um["ApplicantIncome"]

# married and unmarried women
merried_w = data[(data["Gender"] == "Female")]
merried_w = merried_w["ApplicantIncome"]
# Ho both are in same income group 
# Ha not in same income group

t_stats, p_value = ttest_ind(Income_um, merried_w)
print(t_stats)
print(p_value)

if p_value < 0.05:
    print("Reject Null Hypothesis")
else:
    print("Fail to reject Ho")



1.1403012035198827
0.25529750307622384
Fail to reject Ho


***
**Que2**We believe that the loan.csv data shows that graduate unmarried men are in a different income group than both married and unmarried graduate women.

To prove this, would a t-test be more appropriate or a chi-square test?

Carry out the test on the 'ApplicantIncome' column for the two groups and report the p-value. Also report your interpretation.

In [21]:
# categorical vs numerical
#graduate unmarried men
G_umarried_m = data[(data["Education"] == "Graduate") & (data["Married"] == "No") & (data["Gender"] == "Male")]
G_umarried_m = G_umarried_m["ApplicantIncome"]
# graduate married and unmarried women
G_umarried_w = data[(data["Education"] == "Graduate") & (data["Gender"] == "Female")]
G_umarried_w = G_umarried_w["ApplicantIncome"]

# Ho both are in same income group 
# Ha not in same income group
# thsi

t_stats, p_value = ttest_ind(G_umarried_m, G_umarried_w)
print(t_stats)
print(p_value)

if p_value < 0.05:
    print("Reject Null Hypothesis")
else:
    print("Fail to reject Ho")

1.75360894634636
0.0811185605889597
Fail to reject Ho


***
**Que3** We believe that the loan.csv data shows that graduate unmarried men are more likely to get a loan than graduate women.
To prove this, would a t-test be more appropriate or a chi-square test?


In [22]:
# categorical vs categorical 

# G_umarried_m = data[(data["Education"] == "Graduate") & (data["Married"] == "No") & (data["Gender"] == "Male")]
# G_umarried_m = G_umarried_m["ApplicantIncome"]
# graduate married and unmarried women
G_umarried_w = data[(data["Education"] == "Graduate") & (data["Gender"] == "Female") & (data["Married"] == "No")]
G_umarried_w = G_umarried_w["ApplicantIncome"]

# Ho Graduate  
# Ha not in same income group
t_stats, p_value = ttest_ind(G_umarried_m, G_umarried_w)
print(t_stats)
print(p_value)

if p_value < 0.05:
    print("Reject Null Hypothesis")
else:
    print("Fail to reject Ho")

2.0823821932264064
0.03886954562508266
Reject Null Hypothesis


***
**Que4** We have been given a dataset containing the details of the people applied for loan.
Which of the following features can be converted from a numerical feature to a categorical feature simply by renaming the values?

In [23]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [29]:
# data.drop(["Loan_ID"], axis=1, inplace=True)
cat_cols = data.dtypes =='object'
cat_cols = list(cat_cols[cat_cols].index)
cat_cols

['Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'Property_Area',
 'Loan_Status']

In [30]:
num_cols = data.dtypes !='object'
num_cols = list(num_cols[num_cols].index)
num_cols

['ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History']

In [33]:
data.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [34]:
data["Credit_History"].unique()

array([ 1.,  0., nan])

In [35]:
# Ans Credit_History