In [11]:
import db_utils
import yaml, pandas as pd, numpy as np, plotly.express as px
from scipy import stats

'''
Loading the data in the yaml file and saving it as 
a csv file on the local machine
'''

with open('credentials.yaml') as file:
    credentials_dict = yaml.safe_load(file)
    
credentials = db_utils.RDSDatabaseConnector(credentials_dict)
loan_payments = credentials.initialise_database()
credentials.save_file(loan_payments, 'new_file.csv')


'''
Loading the data and adjusting the data types of each column of the dataframe
where needed as well as ensuring the formatting of the data (specifically those 
that consist of dates) are all the same
'''

database = db_utils.load_csv('new_file.csv')
data_transform = db_utils.DataTransform(database)

date_data = ['issue_date', 'earliest_credit_line', 'last_payment_date', 'next_payment_date',
             'last_credit_pull_date']

categorical_data = ['member_id', 'term', 'int_rate', 'grade', 'sub_grade', 'employment_length', 'home_ownership', 'verification_status', 'loan_status', 
                    'payment_plan', 'purpose', 'policy_code', 'application_type']

non_numeric_data = date_data + categorical_data 
column_headings = database.columns.values.tolist()
numeric_data = [column for column in column_headings if column not in non_numeric_data]

for date_column in date_data:
    database, date_column = data_transform.date_data(database, date_column)

for categories in categorical_data:
    database, categories = data_transform.change_type(database, categories, 'category')


'''
Dropping columns whose data consists of more than 50% of null values
and filling in the null values for the other columns with null values
'''

df_info = db_utils.DataFrameInfo(database)
df_transform = db_utils.DataFrameTransform(database)
df_plot = db_utils.Plotter(database)

for column in column_headings:
    null_vals, null_percentage = df_info.missing(database, column)
    if null_percentage > float(50):
        database = database.drop(column, axis=1)

database = database.sort_values(by=['sub_grade'])
database['int_rate'] = database['int_rate'].ffill()
database['funded_amount'] = database.loc[:, 'loan_amount']
fill_values = {'collections_12_mths_ex_med': 0, 'term': database['term'].mode()[0]}
database = df_transform.fill_null(database, values= fill_values)

last_payment_date_list= db_utils.make_list(database, 'last_payment_date')
last_credit_pull_date_list = db_utils.make_list(database, 'last_credit_pull_date')

for index in range(54231):
    if pd.isnull(last_payment_date_list[index]):
        last_payment_date_list[index] = last_credit_pull_date_list[index]
    
    if pd.isnull(last_credit_pull_date_list[index]):
        last_credit_pull_date_list[index] = last_payment_date_list[index]

database['last_payment_date'] = last_payment_date_list
database['last_credit_pull_date'] = last_credit_pull_date_list
database = database.dropna(axis = 0)


'''
Finding the skew of the columns made of integers and floats in the
dataframe
'''

unskewed_data = database.copy()

original_skews = df_info.df_skew(database)
skewed_data = database[[cols for cols, skews in original_skews.items() if skews > 2]]
log_skewed_data = yeojohnson_skew = boxcox_skew = skewed_data.copy()

log_skewed_data = df_transform.log_transform(log_skewed_data)
yeojohnson_skew = df_transform.yeojohnson_transform(yeojohnson_skew)
boxcox_skew = df_transform.boxcox_transform(boxcox_skew)

log_skew = df_info.df_skew(log_skewed_data)
yeo_skew = df_info.df_skew(yeojohnson_skew)
box_skew = df_info.df_skew(boxcox_skew)

for column in skewed_data.columns:
    col_skew = abs(original_skews.get(column))
    log_col_skew = abs(log_skew[column])
    yeo_col_skew = abs(yeo_skew[column])
    
    smallest_skew = min(log_col_skew, yeo_col_skew, col_skew)

    if column in boxcox_skew.columns:
        box_col_skew = abs(box_skew[column]).all()
        
        if box_col_skew <= smallest_skew:
            unskewed_data.loc[:, column] = boxcox_skew[column].copy()

    if smallest_skew == log_col_skew or smallest_skew == log_col_skew == yeo_col_skew:
        unskewed_data.loc[:, column] = log_skewed_data[column].copy()
    elif smallest_skew == yeo_col_skew:
        unskewed_data.loc[:, column] = yeojohnson_skew[column].copy()

'''
Finding and removing outliers in the data.
'''

for categories in unskewed_data.columns:
    if unskewed_data.dtypes[categories] in ['float64', 'int64']:
        
        q1 = unskewed_data[categories].quantile(0.25)
        q3 = unskewed_data[categories].quantile(0.75)
        iqr = q3 - q1
        unskewed_data = unskewed_data[~((unskewed_data[categories]<(q1-1.5*iqr)) | (unskewed_data[categories]>(q3+1.5*iqr)))]
        unskewed_data = unskewed_data.dropna().reset_index(drop=True)
        z = np.abs(stats.zscore(unskewed_data[categories]))
        unskewed_data = unskewed_data[np.abs(stats.zscore(unskewed_data[categories])) < 3]
        #df_plot.plot_boxplot(unskewed_data, categories)

fig = px.imshow(unskewed_data.corr(), title = 'Correlation heatmap of data')

loan_amount: 0.7977492711809002
funded_amount: 0.7977492711809002
funded_amount_inv: 0.8055737930190515
instalment: 0.9894926256453928
annual_inc: 8.766706946736596
dti: 0.19475459150210536
delinq_2yrs: 5.3302689023540095
inq_last_6mths: 3.2843306955572724
open_accounts: 1.0582172401425745
total_accounts: 0.7772746956178468
out_prncp: 2.358972868475207
out_prncp_inv: 2.3594154508797995
total_payment: 1.2572156379380994
total_payment_inv: 1.2449396374171213
total_rec_prncp: 1.248463242485878
total_rec_int: 2.1940362256545747
total_rec_late_fee: 13.18644093613826
recoveries: 14.133928797708249
collection_recovery_fee: 27.505679057194968
last_payment_amount: 2.478841250388088
collections_12_mths_ex_med: 20.385589060161674



loan_amount: 0.7977492711809002
funded_amount: 0.7977492711809002
funded_amount_inv: 0.8055737930190515
instalment: 0.9894926256453928
annual_inc: 0.001000646505650805
dti: 0.19475459150210536
delinq_2yrs: 4.173396558562514
inq_last_6mths: 1.3205420452349328
open_acco