In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# Load the assignment dataset from the working directory and display it.
DATA_CSV = "IS453 Group Assignment - Data.csv"
df = pd.read_csv(DATA_CSV)
df

In [None]:
# Segment of interest: married applicants who do not own property and are
# aged 35 or younger.

# DAYS_BIRTH is negated and divided by 365 to express age in (approximate) years.
df["AGE"] = -df["DAYS_BIRTH"] / 365

no_realty = df["FLAG_OWN_REALTY"] == "N"
married = df["NAME_FAMILY_STATUS"] == "Married"
age_35_or_under = df["AGE"] <= 35

df2 = df[no_realty & married & age_35_or_under]
df2

In [None]:
# Keep only the features needed for modelling, in the order listed below.
# Candidates currently left out: ORGANIZATION_TYPE, NAME_TYPE_SUITE.

selected_columns = [
    "AGE",  # Time relative to the application
    "STATUS",
    "FLAG_OWN_CAR",
    "NAME_FAMILY_STATUS",
    "NAME_INCOME_TYPE",
    "CNT_CHILDREN",
    "DAYS_EMPLOYED",  # Time relative to the application
    "AMT_INCOME_TOTAL",
    "NAME_EDUCATION_TYPE",
    "OCCUPATION_TYPE",
    "CNT_FAM_MEMBERS",
    "AMT_CREDIT",
    "AMT_ANNUITY",
    # Social-circle counts: possibly relevant because peers may influence the
    # behaviour of applicants under 35 (hypothesis, not yet verified).
    "OBS_30_CNT_SOCIAL_CIRCLE",
    "DEF_30_CNT_SOCIAL_CIRCLE",
    "OBS_60_CNT_SOCIAL_CIRCLE",
    "DEF_60_CNT_SOCIAL_CIRCLE",
]

# .copy() makes final_df an independent frame rather than a slice of df2.
final_df = df2[selected_columns].copy()

In [None]:
# Check for missing values: share of missing entries per column.
#
# The shares stay numeric while they are sorted so that the ten columns shown
# really are the ones with the most missing data. Sorting the already
# formatted strings would order them lexicographically (e.g. "10.000%" sorts
# before "2.000%").
missing_share = final_df.isna().mean().sort_values(ascending=False)

# Formatted table, one row per column and a single column labelled 0.
col_w_na = missing_share.map("{0:.3%}".format).to_frame()

print(missing_share.sort_values().tail(10).map("{0:.3%}".format))
print()
final_df.info()

In [None]:
# Inspect the rows that have no OCCUPATION_TYPE recorded.

occupation_missing = final_df["OCCUPATION_TYPE"].isna()
final_df.loc[occupation_missing]

In [None]:
# Inspect the rows where OBS_30_CNT_SOCIAL_CIRCLE is missing, and count them.

obs_30_missing = final_df["OBS_30_CNT_SOCIAL_CIRCLE"].isna()
rows_missing_obs_30 = final_df.loc[obs_30_missing]

print("Missing number of rows: " + str(len(rows_missing_obs_30)))

rows_missing_obs_30

In [None]:
# Inspect the rows where AMT_ANNUITY is missing.
# The credit-to-annuity ratio is summarised first; its distribution is what
# the later imputation of AMT_ANNUITY is based on.

credit_to_annuity = final_df["AMT_CREDIT"] / final_df["AMT_ANNUITY"]
print(credit_to_annuity.describe())
print()

rows_missing_annuity = final_df.loc[final_df["AMT_ANNUITY"].isna()]

# Remember the affected index labels so these rows can be looked up again
# after the missing values have been filled.
missing_amt_annunity_index_list = rows_missing_annuity.index.values

rows_missing_annuity

In [None]:
# Filling missing values
#
# Every fill is assigned back to final_df explicitly. Calling
# `final_df[col].fillna(..., inplace=True)` operates on a temporary Series and
# does not reliably update final_df (it has no effect under pandas
# Copy-on-Write).

final_df["OCCUPATION_TYPE"] = final_df["OCCUPATION_TYPE"].fillna("Missing")

# Provisional choice: treat a missing social-circle count as 0.
# TODO: confirm that 0 is a sensible default for these counts.
social_circle_cols = [
    "OBS_30_CNT_SOCIAL_CIRCLE",
    "DEF_30_CNT_SOCIAL_CIRCLE",
    "OBS_60_CNT_SOCIAL_CIRCLE",
    "DEF_60_CNT_SOCIAL_CIRCLE",
]
final_df[social_circle_cols] = final_df[social_circle_cols].fillna(0)

# Impute AMT_ANNUITY as AMT_CREDIT divided by the median credit-to-annuity
# ratio of the rows where both values are present (debatable; revisit).
# Rows with a missing annuity yield NaN ratios, which median() skips.
AMT_ANNUITY_divisor_median = (final_df["AMT_CREDIT"] / final_df["AMT_ANNUITY"]).median()

final_df["AMT_ANNUITY"] = final_df["AMT_ANNUITY"].fillna(
    final_df["AMT_CREDIT"] / AMT_ANNUITY_divisor_median
)

final_df.info()
# Show the rows whose AMT_ANNUITY was missing before the fill.
final_df.loc[missing_amt_annunity_index_list]

In [None]:
# Outlier scan: look at the largest value found in every column.

final_df.max(axis=0)

In [None]:
# Outlier Analysis (365243 days employed are people who are pensioner[retired] or unemployed)
#
# Related exploratory checks, kept for reference:
#   final_df[(final_df["DAYS_EMPLOYED"] < 365243) & (final_df["DAYS_EMPLOYED"] > 0)]
#   final_df[final_df["NAME_INCOME_TYPE"] == "Pensioner"].info()

days_employed_sentinel = 365243
sentinel_rows = final_df.loc[final_df["DAYS_EMPLOYED"] == days_employed_sentinel]
sentinel_rows.info()

In [None]:
# Check correlations between the numeric features.
# (The categorical columns need one-hot encoding before they can be
# correlated or used in the logistic regression.)
#
# corr() is given numeric/boolean columns only: final_df still contains
# string columns at this point, and pandas >= 2.0 raises on them instead of
# silently leaving them out.

plt.figure(figsize=(16, 5))
sns.heatmap(final_df.select_dtypes(include=["number", "bool"]).corr(), annot=True);

In [None]:
# Feature Selection
#
# Drop three features and re-plot the correlations of what remains.
# NOTE(review): presumably these were chosen because the previous heatmap
# showed them to be redundant with other features — confirm and record why.
#
# errors="ignore" keeps the cell re-runnable: running it a second time, after
# the columns are already gone, no longer raises KeyError.
redundant_columns = ["CNT_FAM_MEMBERS", "OBS_60_CNT_SOCIAL_CIRCLE", "DEF_60_CNT_SOCIAL_CIRCLE"]
final_df = final_df.drop(columns=redundant_columns, errors="ignore")

# corr() is given numeric/boolean columns only: final_df still contains
# string columns, which pandas >= 2.0 refuses to correlate.
plt.figure(figsize=(16, 5))
sns.heatmap(final_df.select_dtypes(include=["number", "bool"]).corr(), annot=True);

In [None]:
# Feature extraction
#
# Combine the loan annuity with income into a single "debt-to-income" ratio:
# AMT_ANNUITY divided by one twelfth of AMT_INCOME_TOTAL.
# NOTE(review): this assumes AMT_ANNUITY is a monthly repayment and
# AMT_INCOME_TOTAL an annual income — confirm against the data dictionary.

ratio_inputs = ["AMT_ANNUITY", "AMT_INCOME_TOTAL"]

# The guard keeps the cell re-runnable: once the inputs have been replaced by
# the ratio, running the cell again leaves final_df unchanged instead of
# raising KeyError.
if set(ratio_inputs).issubset(final_df.columns):
    final_df = final_df.assign(
        **{"debt-to-income": final_df["AMT_ANNUITY"] / (final_df["AMT_INCOME_TOTAL"] / 12)}
    ).drop(columns=ratio_inputs)

assert "debt-to-income" in final_df.columns, "debt-to-income could not be derived"

# corr() is given numeric/boolean columns only: final_df still contains
# string columns, which pandas >= 2.0 refuses to correlate.
plt.figure(figsize=(16, 5))
sns.heatmap(final_df.select_dtypes(include=["number", "bool"]).corr(), annot=True);

In [None]:
# One-hot encode the categorical features; each listed column is replaced by
# one indicator column per distinct value.
# NOTE(review): the data was filtered to NAME_FAMILY_STATUS == "Married"
# earlier, so that column likely yields a single constant indicator — consider
# dropping it.

categorical_columns = [
    "FLAG_OWN_CAR",
    "NAME_FAMILY_STATUS",
    "NAME_INCOME_TYPE",
    "NAME_EDUCATION_TYPE",
    "OCCUPATION_TYPE",
]
final_df = pd.get_dummies(final_df, columns=categorical_columns)

In [None]:
# Summary statistics of the pairwise correlations after encoding.
corr_matrix = final_df.corr()
corr_matrix.describe()

In [None]:
# Logistic Regression: separate target and features, then hold out 20% of the
# rows for testing (stratified on the target, fixed seed for repeatability).
y = final_df["STATUS"]
X = final_df.drop(columns="STATUS")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=100, stratify=y
)

# Row counts of the full data and of each split.
row_counts = (
    ("Number of rows in original data: ", final_df.shape[0]),
    ("Number of rows in training data: ", X_train.shape[0]),
    ("Number of rows in test data:     ", X_test.shape[0]),
)
for label, count in row_counts:
    print(label, count)

# Share of "bads" (sum of STATUS over the number of rows) in each set.
# NOTE(review): assumes STATUS is coded 1 for bad and 0 for good — confirm.
bad_rates = (
    ("\nPercentage of bads in original data: ", y),
    ("Percentage of bads in train data:    ", y_train),
    ("Percentage of bads in test data:     ", y_test),
)
for label, target in bad_rates:
    print(label, "{:.2%}".format(target.sum() / target.shape[0]))

In [None]:
# Fit a logistic regression model (liblinear solver) on the training split.
# fit() returns the estimator itself, so logreg is the fitted model.

logreg = LogisticRegression(solver="liblinear").fit(X_train, y_train)

# Report the intercept and one coefficient per feature column.
coeff = logreg.coef_.ravel()
df_coeff = pd.DataFrame({"Variable": X_train.columns, "coefficient": coeff})

print("Intercept: ", np.round(logreg.intercept_[0], 6))
df_coeff

In [None]:
# Evaluate the fitted model on the held-out test split.

y_pred = logreg.predict(X_test)
test_accuracy = np.round(accuracy_score(y_test, y_pred), 3)
print('Testing accuracy is %s' % test_accuracy)