In [2]:
import db_utils
import yaml, pandas as pd, numpy as np

# Read the database credentials from the local YAML file, build the RDS
# connector from them, fetch the data through the connector and hand it to
# save_file together with the target name 'new_file.csv'.
# NOTE(review): presumably initialise_database() returns the loan payments
# table as a DataFrame and save_file() writes it out as CSV -- confirm in
# db_utils.

# YAML files are UTF-8 by convention; being explicit avoids decoding the
# credentials with whatever the platform's default locale encoding is.
with open('credentials.yaml', encoding='utf-8') as file:
    credentials_dict = yaml.safe_load(file)

credentials = db_utils.RDSDatabaseConnector(credentials_dict)
loan_payments = credentials.initialise_database()
credentials.save_file(loan_payments, 'new_file.csv')


# Reload the saved CSV and bring each column to the intended type: the date
# columns are passed through DataTransform.date_data (to give them a uniform
# format) and the categorical columns are cast via
# DataTransform.change_type(..., 'category').

database = db_utils.load_csv('new_file.csv')
cleaned_data = db_utils.DataTransform(database)

date_data = [
    'issue_date',
    'earliest_credit_line',
    'last_payment_date',
    'next_payment_date',
    'last_credit_pull_date',
]

# NOTE(review): 'int_rate' and 'member_id' are treated as categories here,
# which also keeps them out of numeric_data below -- confirm this is intended.
categorical_data = [
    'member_id',
    'term',
    'int_rate',
    'grade',
    'sub_grade',
    'employment_length',
    'home_ownership',
    'verification_status',
    'loan_status',
    'payment_plan',
    'purpose',
    'policy_code',
    'application_type',
]

# Every column that is neither a date nor a category counts as numeric.
non_numeric_data = [*date_data, *categorical_data]
column_headings = database.columns.tolist()
numeric_data = [
    heading for heading in column_headings if heading not in non_numeric_data
]

# Both helpers return the updated frame together with the column name.
for date_column in date_data:
    database, date_column = cleaned_data.date_data(database, date_column)

for categories in categorical_data:
    database, categories = cleaned_data.change_type(database, categories, 'category')


# Drop every column whose percentage of null values is above the threshold
# and print the columns that are kept but still contain nulls.
#
# The cut-off is 60%, not 50%: 'mths_since_last_delinq' is filled with 0
# further down in this script, so it has to survive this step.
NULL_DROP_THRESHOLD = 60.0

df_info = db_utils.DataFrameInfo(database)

for column in column_headings:
    # missing() yields the null count and the null share as a percentage.
    null_vals, null_percentage = df_info.missing(database, column)
    # Compare the unrounded percentage so values such as 60.004 are not
    # rounded down onto the threshold; round only for display.
    if null_percentage > NULL_DROP_THRESHOLD:
        database = database.drop(column, axis=1)
    elif null_percentage > 0:
        # Covers everything up to and including the threshold, so a column
        # sitting exactly on it is still reported instead of silently skipped.
        print(f'{column}: {null_vals} null values, {round(null_percentage, 2)}%')


# Fill the remaining null values, either from a neighbouring row, from a
# related column of the same row, or with a constant.
#
# All fills operate on the DataFrame columns themselves, so they are aligned
# on the row index. Copying columns into lists and writing them back by
# position is not safe here because the frame is re-sorted in between, which
# would attach each value to the wrong row. It also removes the need for a
# hard-coded row count.

database = database.sort_values(by=['sub_grade'])
# With rows ordered by sub_grade, a missing int_rate takes the value of the
# row directly above it.
# NOTE(review): the forward fill is not restricted to rows of the same
# sub_grade, and a null in the very first row stays null -- confirm this is
# acceptable.
database['int_rate'] = database['int_rate'].ffill()

# A missing funded_amount falls back to the loan_amount of the same row.
database['funded_amount'] = database['funded_amount'].fillna(database['loan_amount'])

# The two dates fill each other's gaps; rows where both are null stay null.
database['last_payment_date'] = database['last_payment_date'].fillna(
    database['last_credit_pull_date']
)
database['last_credit_pull_date'] = database['last_credit_pull_date'].fillna(
    database['last_payment_date']
)

database = database.fillna(value={'collections_12_mths_ex_med': 0, 'mths_since_last_delinq': 0})

# These names are kept in case later code still reads them; they now reflect
# the filled columns in the frame's current (sorted) row order.
funded_amount_column = db_utils.make_list(database, 'funded_amount')
loan_amount_column = db_utils.make_list(database, 'loan_amount')
last_payment_date_list = db_utils.make_list(database, 'last_payment_date')
last_credit_pull_date_list = db_utils.make_list(database, 'last_credit_pull_date')

print(df_info.info(database))




mths_since_last_delinq: 31002 null values, 57.17%
mths_since_last_record: 48050 null values, 88.6%
next_payment_date: 32608 null values, 60.13%
mths_since_last_major_derog: 46732 null values, 86.17%
