In [5]:
import db_utils
import yaml, pandas as pd, numpy as np

'''
Read the connection settings from credentials.yaml, hand them to the
RDSDatabaseConnector and save whatever initialise_database() returns
as new_file.csv
'''

with open('credentials.yaml') as credentials_file:
    # safe_load only builds plain Python objects from the YAML document
    credentials_dict = yaml.safe_load(credentials_file)

credentials = db_utils.RDSDatabaseConnector(credentials_dict)
loan_payments = credentials.initialise_database()
# NOTE(review): save_file presumably writes a csv - the next section reads
# 'new_file.csv' back with db_utils.load_csv
credentials.save_file(loan_payments, 'new_file.csv')


'''
Reload the saved csv and cast its columns: every date column is passed
through DataTransform.date_data so the dates share one format, and every
categorical column is changed to the 'category' type
'''

database = db_utils.load_csv('new_file.csv')
data_transform = db_utils.DataTransform(database)

date_data = [
    'issue_date',
    'earliest_credit_line',
    'last_payment_date',
    'next_payment_date',
    'last_credit_pull_date',
]

# NOTE(review): 'int_rate' is listed as categorical although the name
# suggests a numeric rate - confirm this is intended
categorical_data = [
    'member_id',
    'term',
    'int_rate',
    'grade',
    'sub_grade',
    'employment_length',
    'home_ownership',
    'verification_status',
    'loan_status',
    'payment_plan',
    'purpose',
    'policy_code',
    'application_type',
]

non_numeric_data = [*date_data, *categorical_data]
column_headings = database.columns.values.tolist()
# Whatever is neither a date nor a category is treated as numeric
numeric_data = [heading for heading in column_headings if heading not in non_numeric_data]

# Both helpers hand back a pair whose first item is the updated dataframe
for date_column in date_data:
    database = data_transform.date_data(database, date_column)[0]

for categories in categorical_data:
    database = data_transform.change_type(database, categories, 'category')[0]


'''
Dropping columns in which more than 50% of the values are null, then
imputing the null values of the remaining columns and dropping any rows
that still contain a null value
'''

df_info = db_utils.DataFrameInfo(database)
df_transform = db_utils.DataFrameTransform(database)
df_plot = db_utils.Plotter(database)

for column in column_headings:
    null_vals, null_percentage = df_info.missing(database, column)
    if null_percentage > 50:
        database = database.drop(column, axis=1)

# Sorting by sub_grade places loans of the same sub-grade next to each
# other, so the forward fill takes the rate of the preceding row, which is
# usually a loan from the same sub-grade
database = database.sort_values(by=['sub_grade'])
database['int_rate'] = database['int_rate'].ffill()

# Only the missing funded amounts fall back to the loan amount; assigning
# the whole loan_amount column would discard the recorded funded amounts
database['funded_amount'] = database['funded_amount'].fillna(database['loan_amount'])

fill_values = {'collections_12_mths_ex_med': 0, 'term': database['term'].mode()[0]}
database = df_transform.fill_null(database, values=fill_values)

# Each of the two date columns borrows the other one's value where its own
# is missing. fillna works on however many rows the dataframe has, so no
# row count needs to be hard-coded. Rows where both dates are missing stay
# null and are removed by the dropna below.
database['last_payment_date'] = database['last_payment_date'].fillna(
    database['last_credit_pull_date'])
database['last_credit_pull_date'] = database['last_credit_pull_date'].fillna(
    database['last_payment_date'])

database = database.dropna(axis=0)


'''
Finding the skew of the columns made of integers and floats in the
dataframe and, for every column whose skew is above 2, keeping whichever
of the original, log, Yeo-Johnson and Box-Cox versions of the column has
the smallest absolute skew
'''

df_skew = database.skew(axis=0, numeric_only=True)
skewed_data = database[[skewed_col for skewed_col, skew_value in df_skew.items() if skew_value > 2]]

# Every transform receives its own copy. A chained assignment such as
# a = b = c = frame.copy() binds all the names to one object, so a transform
# that modifies its argument would also change the input of the others.
nonskew_data = skewed_data.copy()
unskewed_data = database.copy()

log_skewed_data = df_transform.log_transform(skewed_data.copy())
yeojohnson_skew = df_transform.yeojohnson_transform(skewed_data.copy())
boxcox_skew = df_transform.boxcox_transform(skewed_data.copy())

log_skew = log_skewed_data.skew()
yeo_skew = yeojohnson_skew.skew()
box_skew = boxcox_skew.skew()

# (skew per column, transformed dataframe) pairs. A later candidate only
# replaces an earlier one when it is strictly better, so on a tie the
# earlier entry in this tuple is kept.
transform_candidates = (
    (yeo_skew, yeojohnson_skew),
    (log_skew, log_skewed_data),
    (box_skew, boxcox_skew),
)

for column in skewed_data.columns:
    # The untransformed column is the baseline every transform has to beat
    best_abs_skew = abs(df_skew[column])
    best_values = None

    for candidate_skew, candidate_data in transform_candidates:
        # A transform may not return every column, so skip the ones it lacks
        if column not in candidate_data.columns or column not in candidate_skew.index:
            continue

        candidate_abs_skew = abs(candidate_skew[column])
        # Comparisons with NaN are False, so an undefined skew never wins
        if candidate_abs_skew < best_abs_skew:
            best_abs_skew = candidate_abs_skew
            best_values = candidate_data[column]

    # When no transform improves on the original the column is left as it is
    if best_values is not None:
        unskewed_data[column] = nonskew_data[column] = best_values

df2_skew = unskewed_data.skew(axis=0, numeric_only=True)

'''
Finding outliers in the data: the number of occurrences of every value is
printed for the categorical columns and a boxplot is drawn for the
continuous columns. Nothing is removed from the dataframe in this section.
'''

continuous_data = ['float64', 'int64']

for categories in unskewed_data.columns:
    # Look the column up by its name; attribute access would search for a
    # column that is literally called "categories"
    dtype_name = unskewed_data[categories].dtype.name

    if dtype_name == 'category':
        # The counts come from the dataframe itself (df2_skew only holds one
        # skew value per numeric column) and are computed once per column
        element_counts = unskewed_data[categories].value_counts()
        for distinct_elements in df_info.unique_vals(unskewed_data, categories):
            # .loc looks the value up by label, even when the label is an integer
            element_count = element_counts.loc[distinct_elements]
            print(f'{categories}\n {distinct_elements}: {element_count}\n\n')
    elif dtype_name in continuous_data:
        df_plot.plot_boxplot(unskewed_data, categories)
        

<class 'pandas.core.frame.DataFrame'>
Index: 54231 entries, 38676116 to 72323
Data columns (total 42 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   member_id                    54231 non-null  category      
 1   loan_amount                  54231 non-null  int64         
 2   funded_amount                51224 non-null  float64       
 3   funded_amount_inv            54231 non-null  float64       
 4   term                         49459 non-null  category      
 5   int_rate                     49062 non-null  category      
 6   instalment                   54231 non-null  float64       
 7   grade                        54231 non-null  category      
 8   sub_grade                    54231 non-null  category      
 9   employment_length            52113 non-null  category      
 10  home_ownership               54231 non-null  category      
 11  annual_inc                   54231 non-