In [1]:
import pandas as pd
from pandas import DataFrame

import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

import seaborn as sns

from scipy import stats
from scipy.stats import norm

# Loading all required files

In [2]:
# Load the four raw CSV tables (paths are relative to the working directory).
(
    data_train,
    installments_payments,
    credit_card_balance,
    previous_application,
) = [
    pd.read_csv(csv_name)
    for csv_name in (
        'data_train.csv',
        'installments_payments.csv',
        'credit_card_balance.csv',
        'previous_application.csv',
    )
]

# Cleaning the data and merging the tables by applicant ID

In [3]:
# Keep only fully populated rows: a row is dropped from a table as soon as
# any one of its columns is missing.
data_train = data_train.dropna(axis='index', how='any')
installments_payments = installments_payments.dropna(axis='index', how='any')
credit_card_balance = credit_card_balance.dropna(axis='index', how='any')
previous_application = previous_application.dropna(axis='index', how='any')

In [4]:
# Merge data_train with credit_card_balance (inner join on the applicant ID).
# The join key is spelled out, as in the other merges of this notebook, instead
# of relying on pd.merge's default of joining on every column name the two
# frames happen to share.
Data_train_credit = pd.merge(data_train, credit_card_balance, on='SK_ID_CURR')

In [5]:
# Inner-join the previous applications onto the train/credit frame by applicant ID.
# Non-key columns that exist in both frames receive the pandas default
# `_x` / `_y` suffixes.
data_train_app = Data_train_credit.merge(previous_application, on='SK_ID_CURR')

In [None]:
# Last merge: inner-join the installment payments by applicant ID.
# The result, `data`, is the frame used for the rest of the preparation.
data = data_train_app.merge(installments_payments, on='SK_ID_CURR')

In [None]:
# Shape (rows, columns) of the merged frame
data.shape

In [None]:
# Drop the previous-application ID column.
# NOTE(review): if the merges produced suffixed variants of this column
# (e.g. SK_ID_PREV_x / SK_ID_PREV_y), they are not removed here - confirm.
data = data.drop(columns=['SK_ID_PREV'])

In [None]:
# Collapse to a single row per applicant: only the first row encountered for
# each SK_ID_CURR survives, every later repeat is discarded.
is_repeat_applicant = data.duplicated(subset='SK_ID_CURR', keep="first")
data = data[~is_repeat_applicant]

In [None]:
# Shape (rows, columns) after keeping one row per applicant
data.shape

In [None]:
# Get a glimpse of the data: show the first two rows
data.head(2)

In [None]:
# Renumber the rows 0..n-1. reset_index takes the length from the frame
# itself, so this does not depend on the frame holding exactly 9390 rows
# (assigning a fixed-length range raises a length-mismatch error otherwise).
data = data.reset_index(drop=True)

In [None]:
# Negative numbers carry no meaning in this dataset (e.g. days of birth and
# days employed are stored as negatives), so every numeric column is replaced
# by its absolute value; non-numeric columns pass through unchanged.
def _abs_if_numeric(column):
    """Return column.abs() for numeric dtypes, the column itself otherwise."""
    if np.issubdtype(column.dtype, np.number):
        return column.abs()
    return column


data = data.apply(_abs_if_numeric)

In [None]:
# Check the class distribution of the TARGET column (row count per class)
data['TARGET'].value_counts()

In [None]:
# Visualise the TARGET class balance side by side:
# absolute counts on the left, percentage shares on the right.
fig, axs = plt.subplots(1, 2, figsize=(8, 4))
count_ax, share_ax = axs

# Left panel: bar chart with the number of rows per class
sns.countplot(x='TARGET', data=data, ax=count_ax)
count_ax.set_title("Frequency of each Loan Status")

# Right panel: pie chart labelled with each class's percentage
target_counts = data['TARGET'].value_counts()
target_counts.plot(kind='pie', ax=share_ax, autopct='%1.2f%%')
share_ax.set_title("Percentage of each Loan status")

plt.show()

# One-Hot Encoding

In [None]:
# Count how many columns there are of each dtype
data.dtypes.value_counts()

In [None]:
# Number of distinct values in each object-typed column
data.select_dtypes(include='object').nunique()

In [None]:
# One-hot encode the categorical variables: pd.get_dummies replaces the
# object-typed columns with indicator columns, then the new shape is printed.
data = pd.get_dummies(data)
print('Training Features shape: ', data.shape)


In [None]:
# Examine the dtypes again: number of columns per dtype after encoding
data.dtypes.value_counts()

In [None]:
# Preview the first two rows of the encoded frame
data.head(2)

# Dealing with outliers in the dataset

In [None]:
# Store the applicant ID as text instead of a number.
# NOTE(review): presumably done so the ID is not treated as a numeric
# feature in the summary statistics - confirm.
data = data.astype({'SK_ID_CURR': str})

In [None]:
# Examine the distribution of the features (summary statistics)
data.describe()

In [None]:
# Columns that are checked for outliers
outlier_columns = [
    'AMT_INCOME_TOTAL',
    'AMT_CREDIT_x',
    'AMT_ANNUITY_x',
    'AMT_GOODS_PRICE_x',
    'DAYS_EMPLOYED',
    'DAYS_BIRTH',
    'AMT_BALANCE',
    'AMT_CREDIT_LIMIT_ACTUAL',
    'AMT_TOTAL_RECEIVABLE',
    'AMT_PAYMENT_TOTAL_CURRENT',
]
Outliear_detection = data[outlier_columns]

In [None]:
# Remove outliers with the z-score rule: a row is kept only if, in every
# column of Outliear_detection, its value lies within 3 standard deviations
# of that column's mean. (`stats` comes from the import cell at the top of
# the notebook, so it is not re-imported here.)
abs_z_scores = np.abs(stats.zscore(Outliear_detection))
within_three_sigma = (abs_z_scores <= 3).all(axis=1)
data = data[within_three_sigma]

In [None]:
# Check how many rows remain after removing the outliers
data.shape

In [None]:
# Pull the TARGET column out into its own one-column DataFrame
data_target = data.loc[:, ['TARGET']]

In [None]:
# Number of rows in each TARGET class
# NOTE(review): the original comment labels 0 as approved and 1 as rejected -
# confirm this meaning against the dataset documentation.
data_target['TARGET'].value_counts()

In [None]:
# Preview the first rows of the filtered frame
data.head()

# Build the final feature dataset (data1)

In [None]:
# Work on an independent copy. A plain `data1 = data` only creates a second
# name for the same frame, so deleting columns from data1 would also delete
# them from `data` and make these cells fail when run a second time.
data1 = data.copy()

In [None]:
# The applicant ID is not a feature: remove it from data1 (in place)
data1.drop(columns=['SK_ID_CURR'], inplace=True)

In [None]:
# Remove the label column from data1 (in place) so only features remain
data1.drop(columns=['TARGET'], inplace=True)

In [None]:
# Rescale every feature with MinMaxScaler (default settings) so that all
# columns are on a uniform scale.

# Scaler instance used for the rescaling
min_max_scaler = preprocessing.MinMaxScaler()

# Cast the frame to a float matrix, then fit the scaler and transform in one step
data1_float = data1.to_numpy().astype(float)
data1_float_transformed = min_max_scaler.fit_transform(data1_float)

# Wrap the scaled matrix in a new DataFrame.
# NOTE(review): the new frame gets default integer column labels and a fresh
# 0..n-1 index; the column names of data1 are not carried over - confirm
# that this is intended.
data1 = pd.DataFrame(data=data1_float_transformed)


In [None]:
# Export the scaled features (data1) and the labels (data_target) for later use.
# Both files go to the same folder, which is defined once here so the export
# location can be changed in a single place.
OUTPUT_DIR = r'C:\Users\User\Desktop\A_Thesis\Dataset\Dataset_CSV'

data1.to_csv(OUTPUT_DIR + '\\data1.csv', index=False, header=True)
data_target.to_csv(OUTPUT_DIR + '\\data_target1.csv', index=False, header=True)