In [7]:
import db_utils
import yaml, pandas as pd, numpy as np

# Read the database connection settings from the local YAML file.
with open('credentials.yaml') as credentials_file:
    credentials_dict = yaml.safe_load(credentials_file)

# NOTE: despite its name, `credentials` is the RDSDatabaseConnector object
# built from those settings, not the settings themselves.
credentials = db_utils.RDSDatabaseConnector(credentials_dict)
loan_payments = credentials.initialise_database()

# Round-trip the extracted data through a local CSV file (presumably
# save_file writes the CSV - confirm in db_utils) and reload it with the
# 'id' column as the row index.
credentials.save_file(loan_payments, 'new_file.csv')
database = pd.read_csv('new_file.csv', index_col='id')

# Helper used below to convert column dtypes.
cleaned_data = db_utils.DataTransform(database)

# Columns that are passed to the date conversion helper.
date_data = [
    'issue_date',
    'earliest_credit_line',
    'last_payment_date',
    'next_payment_date',
    'last_credit_pull_date',
]

# Columns that are cast to the pandas 'category' dtype.
# NOTE(review): 'int_rate' is listed here although the script later sorts on
# it and imputes it as a number - confirm it really belongs in this group.
categorical_data = [
    'member_id',
    'term',
    'int_rate',
    'grade',
    'sub_grade',
    'employment_length',
    'home_ownership',
    'verification_status',
    'loan_status',
    'payment_plan',
    'purpose',
    'policy_code',
    'application_type',
]

non_numeric_data = date_data + categorical_data

# Every remaining column of the frame is treated as numeric.
column_headings = database.columns.tolist()
numeric_data = [
    heading for heading in column_headings
    if heading not in non_numeric_data
]

# Run every date column through the DataTransform date helper. The helper
# returns a (frame, column) pair; only the updated frame is needed here.
for date_heading in date_data:
    database = cleaned_data.date_data(database, date_heading)[0]

# Cast every categorical column to the 'category' dtype in the same way.
for category_heading in categorical_data:
    database = cleaned_data.change_type(database, category_heading, 'category')[0]

df_info = db_utils.DataFrameInfo(database)

# Drop columns that are more than half empty and report every other column
# that still contains missing values.
#
# The decision is made on the *unrounded* percentage: rounding first made a
# column with only a handful of nulls (< 0.005 %) look complete, and left a
# column at exactly 50 % neither dropped nor reported. Rounding is applied
# only to the printed figure.
for column in column_headings:
    null_vals, null_percentage = df_info.missing(database, column)
    if null_percentage > 50:
        database = database.drop(columns=column)
    elif null_percentage > 0:
        print(f'{column}: {null_vals} null values, {round(null_percentage, 2)}%')

# Save a snapshot ordered by interest rate before any values are imputed.
database = database.sort_values(by=['int_rate'])
credentials.save_file(database, 'new_file_2.csv')

# All imputation below works on columns of the frame as it is *now*, so the
# filled values stay on the rows they belong to. (Copying the columns to
# lists before the sort and writing them back afterwards assigned values
# positionally to re-ordered rows.)

# funded_amount: where missing, fall back to the requested loan amount.
funded_amount = database['funded_amount']
database['funded_amount'] = np.where(
    funded_amount.isna(), database['loan_amount'], funded_amount
)

# int_rate: where missing, use the median of the known rates that share the
# row's sub_grade. Rows whose sub_grade has no known rate at all stay NaN
# rather than being silently set to 0.
int_rate = database['int_rate'].astype(float)
sub_grade = database['sub_grade'].astype(object)
median_rate_by_sub_grade = int_rate.groupby(sub_grade).median()
database['int_rate'] = np.where(
    int_rate.isna(), sub_grade.map(median_rate_by_sub_grade), int_rate
)

print(df_info.info(database))




funded_amount: 3007 null values, 5.54%
term: 4772 null values, 8.8%
int_rate: 5169 null values, 9.53%
employment_length: 2118 null values, 3.91%
last_payment_date: 73 null values, 0.13%
last_credit_pull_date: 7 null values, 0.01%
collections_12_mths_ex_med: 51 null values, 0.09%
<class 'pandas.core.frame.DataFrame'>
Index: 54231 entries, 784950 to 117192
Data columns (total 38 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   member_id                   54231 non-null  category      
 1   loan_amount                 54231 non-null  int64         
 2   funded_amount               54231 non-null  float64       
 3   funded_amount_inv           54231 non-null  float64       
 4   term                        49459 non-null  category      
 5   int_rate                    54231 non-null  float64       
 6   instalment                  54231 non-null  float64       
 7   grade                       5