In [None]:
#import packages

In [None]:
import os
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [None]:
#Read data

In [None]:
# Load the Home Credit training applications. The relative path assumes the
# competition data is available under ../input.
app_train = pd.read_csv('../input/home-credit-default-risk/application_train.csv')

In [None]:
# Preview the first rows of the training data.
app_train.head()

In [None]:
# Load the test applications from the same input directory and preview them.
app_test = pd.read_csv('../input/home-credit-default-risk/application_test.csv')
app_test.head()

In [None]:
#Check out if the dataset is imbalanced

In [None]:
# Number of rows for each TARGET value.
app_train['TARGET'].value_counts()

In [None]:
# Share of applications with TARGET == 0, computed from the data instead of
# from hand-copied counts so the figure stays correct if the input changes.
(app_train['TARGET'] == 0).mean()

In [None]:
# From here, we can tell that most people repaid their loans on time, so the classes are imbalanced and we will need to weight them later.

In [None]:
# Check for missing values

In [None]:
def missing_values_table(df):
    """Summarise the missing values of ``df`` column by column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` whose missing percentage is not zero,
        indexed by column name, with the columns 'Missing Values' (count of
        nulls) and '% of Total Values' (share of rows that are null). Rows are
        sorted by percentage, largest first, and values are rounded to one
        decimal place.
    """
    # Null count per column and the same figure as a percentage of all rows
    null_counts = df.isnull().sum()
    null_percent = 100 * null_counts / len(df)

    summary = pd.DataFrame({
        'Missing Values': null_counts,
        '% of Total Values': null_percent,
    })

    # Drop the columns whose percentage is exactly zero, then rank the rest
    has_missing = summary['% of Total Values'] != 0
    ranked = summary.loc[has_missing].sort_values(
        by='% of Total Values', ascending=False)

    return ranked.round(1)

In [None]:
# Summarise missingness in the training data and show the 25 columns with the
# highest share of missing values (the table is sorted in descending order).
missing_values = missing_values_table(app_train)
missing_values.head(25)

In [None]:
# checking for categorical value and encoding them

In [None]:
# Number of columns of each dtype.
app_train.dtypes.value_counts()

In [None]:
# Number of distinct non-null values in each object-typed column.
app_train.select_dtypes('object').nunique()


In [None]:
# Encoding
le = LabelEncoder()

for col in app_train:
    if app_train[col].dtype == 'object':
        if len(list(app_train[col].unique())) <= 2:
            le.fit(app_train[col])
            app_train[col] = le.transform(app_train[col])
            app_test[col] = le.transform(app_test[col])
            

In [None]:
# One-hot encode the remaining categorical columns. Train and test are encoded
# independently, so the resulting column sets may differ between the two.
app_train = pd.get_dummies(app_train)
app_test = pd.get_dummies(app_test)

In [None]:
# Remove the columns in the training data that are not in the testing data

In [None]:
# Set the labels aside first: the inner join below keeps only columns present
# in both frames, so TARGET is dropped if app_test does not contain it.
train_labels = app_train['TARGET']

# axis = 1 aligns on columns; join = 'inner' keeps the shared columns only.
app_train, app_test = app_train.align(app_test, join = 'inner', axis = 1)

# Put the labels back on the training frame.
app_train['TARGET'] = train_labels

In [None]:
# Check if there's any problem with people's age

In [None]:
# Dividing DAYS_BIRTH by -365 flips the sign and converts days to approximate years.
(app_train['DAYS_BIRTH'] / -365).describe()

In [None]:
# Ages look good, no problem.

In [None]:
# Check if there's any problem with days of employment

In [None]:
# Summary statistics of DAYS_EMPLOYED (values are in days).
app_train['DAYS_EMPLOYED'].describe()

In [None]:
# DAYS_EMPLOYED has a problem: the maximum value corresponds to about 1000 years of employment.

In [None]:
# Histogram of the raw DAYS_EMPLOYED values (before the anomalous value is handled).
app_train['DAYS_EMPLOYED'].plot.hist(title = 'Days Employment')
plt.xlabel('Days');

In [None]:
# We can set the anomalous 1000-year DAYS_EMPLOYED values to missing.

In [None]:
# Flag the rows that carry the anomalous value 365243 (about 1000 years in
# days) before it is overwritten, so the information is kept as a feature.
app_train['DAYS_EMPLOYED_ANOM'] = app_train["DAYS_EMPLOYED"] == 365243

# Replace the anomalous value with NaN. The result is assigned back rather
# than using replace(..., inplace = True) on the column selection: that is a
# chained in-place update, which pandas warns about and which does not modify
# the frame when Copy-on-Write is enabled.
app_train['DAYS_EMPLOYED'] = app_train['DAYS_EMPLOYED'].replace({365243: np.nan})

app_train['DAYS_EMPLOYED'].plot.hist(title = 'Days Employment Histogram');
plt.xlabel('Days Employment');

In [None]:
# Same treatment for the test data: flag the anomalous value 365243, then
# replace it with NaN. The replacement is assigned back instead of using a
# chained replace(..., inplace = True), which does not update the frame when
# pandas Copy-on-Write is enabled.
app_test['DAYS_EMPLOYED_ANOM'] = app_test["DAYS_EMPLOYED"] == 365243
app_test["DAYS_EMPLOYED"] = app_test["DAYS_EMPLOYED"].replace({365243: np.nan})


In [None]:
# Check out the correlations with the target

In [None]:
# Correlation of every column with TARGET, sorted in ascending order
# (most negative first). Note that corr() builds the full column-by-column
# matrix before the TARGET column is selected.
correlations = app_train.corr()['TARGET'].sort_values()

In [None]:
# Start of the ascending-sorted Series: the 20 most negative correlations.
correlations.head(20)

In [None]:
# End of the ascending-sorted Series: the 20 largest correlations.
correlations.tail(20)

In [None]:
# The values in the DAYS_BIRTH column are all negative, so we take their absolute value and check the correlation again.

In [None]:
# DAYS_BIRTH is negative in the raw data. Flip the sign in BOTH frames: the
# imputer and scaler are later fitted on app_train and applied to app_test, so
# converting only the training column would put the test feature on a
# different scale from the one the model was trained on.
app_train['DAYS_BIRTH'] = app_train['DAYS_BIRTH'].abs()
app_test['DAYS_BIRTH'] = app_test['DAYS_BIRTH'].abs()

# Correlation between age (in days) and TARGET.
app_train['DAYS_BIRTH'].corr(app_train['TARGET'])

In [None]:
# Now the correlation is -0.078, which means that as clients get older, they tend to repay their loans on time more often.

In [None]:
# Histogram of client ages; DAYS_BIRTH / 365 converts days to approximate years.
plt.hist(app_train['DAYS_BIRTH'] / 365, edgecolor = 'k', bins = 25)
plt.title('Clients Ages'); plt.xlabel('Age'); plt.ylabel('Count');

In [None]:
# Overlay the age density of the two TARGET groups on a single figure.
plt.figure(figsize = (10, 8))

# Age density for rows with TARGET == 0
sns.kdeplot(app_train.loc[app_train['TARGET'] == 0, 'DAYS_BIRTH'] / 365, label = 'target == 0')

# Age density for rows with TARGET == 1
sns.kdeplot(app_train.loc[app_train['TARGET'] == 1, 'DAYS_BIRTH'] / 365, label = 'target == 1')

plt.xlabel('Age (years)'); plt.ylabel('Density'); plt.title('Distribution of Ages')
plt.legend();

In [None]:
# From the graph, we can tell that the target == 1 distribution is shifted toward younger ages.

In [None]:
# Let's see the average failure to repay loans by ages.

In [None]:
# Work on an explicit copy: new columns are added below, and assigning them to
# a plain column selection of app_train would be a write to a slice
# (SettingWithCopyWarning, hidden here because warnings are suppressed).
age_data = app_train[['TARGET', 'DAYS_BIRTH']].copy()
age_data['YEARS_BIRTH'] = age_data['DAYS_BIRTH'] / 365

# Bin the ages into ten equal-width intervals between 20 and 70 years
# (11 bin edges give 10 bins).
age_data['YEARS_BINNED'] = pd.cut(age_data['YEARS_BIRTH'], bins = np.linspace(20, 70, num = 11))
age_data.head(10)

In [None]:
# Mean of every column within each age bin. TARGET is compared against 0 and 1
# elsewhere in this notebook; assuming those are its only values, its mean is
# the fraction of rows in the bin with TARGET == 1.
age_groups  = age_data.groupby('YEARS_BINNED').mean()
age_groups

In [None]:
# Bar chart of the mean TARGET per age bin, expressed as a percentage.
plt.figure(figsize = (8, 8))

# The interval index is converted to strings so the bins can be used as bar labels.
plt.bar(age_groups.index.astype(str), 100 * age_groups['TARGET'])

plt.xticks(rotation = 75); plt.xlabel('Age Group (years)'); plt.ylabel('Failure to Repay (%)')
plt.title('Failure to Repay by Age Group');

In [None]:
# Younger clients are clearly more likely to fail to repay their loans on time, which matches common sense.

In [None]:
# EXT_SOURCE_3, EXT_SOURCE_2 and EXT_SOURCE_1 have the most negative correlations with the target.
# Let's check out these variables.

In [None]:
# Pairwise correlations between TARGET, the three EXT_SOURCE features and
# DAYS_BIRTH, drawn as an annotated heatmap.
ext_source_cols = ['TARGET', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']
ext_source = app_train[ext_source_cols]
ext_source_corrs = ext_source.corr()

sns.heatmap(
    ext_source_corrs,
    cmap = plt.cm.RdYlBu_r,
    vmin = -0.25,
    vmax = 0.6,
    annot = True,
)
plt.title('Correlation Heatmap');

In [None]:
# All of the EXT_SOURCE features are negatively correlated with the target, and EXT_SOURCE_1 and DAYS_BIRTH are positively correlated with each other.

In [None]:
# Logistic Regression

In [None]:
# Feature matrix: every training column except the label.
train = app_train.drop(columns = ['TARGET'])

# Feature names, in the column order the transformers are fitted on.
features = list(train.columns)

# Select the test columns by name so they are guaranteed to match the training
# features in both membership and order (a missing column raises KeyError
# instead of silently misaligning the arrays).
test = app_test[features]

# Fill missing values with the per-column median
imputer = SimpleImputer(strategy = 'median')

# Rescale every feature to the range 0-1
scaler = MinMaxScaler(feature_range = (0, 1))

# Both transformers are fitted on the training data only and then applied to
# the training and the test data.
imputer.fit(train)

train = imputer.transform(train)
test = imputer.transform(test)

scaler.fit(train)
train = scaler.transform(train)
test = scaler.transform(test)

print('Training data shape: ', train.shape)
print('Testing data shape: ', test.shape)

In [None]:
# Logistic regression baseline. In scikit-learn C is the inverse of the
# regularisation strength, so this small value means strong regularisation.
# NOTE(review): the class imbalance observed earlier in the notebook is not
# addressed here (no class_weight is set) - confirm whether that is intended.
log_reg = LogisticRegression(C = 0.0001)

# Train on the imputed and scaled training features
log_reg.fit(train, train_labels)

In [None]:
# predict_proba returns one column per class, ordered as in log_reg.classes_;
# column 1 is kept as the predicted probability of the second class
# (presumably TARGET == 1 - confirm against log_reg.classes_).
log_reg_pred = log_reg.predict_proba(test)[:, 1]


In [None]:
# Build the submission frame: one row per test application with its predicted
# probability. The explicit copy avoids assigning a new column onto a slice of
# app_test (SettingWithCopyWarning, hidden here because warnings are suppressed).
submit = app_test[['SK_ID_CURR']].copy()
submit['TARGET'] = log_reg_pred

submit.head(20)