In [2]:
import db_utils
import yaml, pandas as pd, numpy as np

# Read the connection settings from credentials.yaml, hand them to the
# RDS connector, and write whatever initialise_database() returns to a CSV
# file on the local machine.

with open('credentials.yaml') as credentials_file:
    credentials_dict = yaml.safe_load(credentials_file)

credentials = db_utils.RDSDatabaseConnector(credentials_dict)
loan_payments = credentials.initialise_database()
credentials.save_file(loan_payments, 'new_file.csv')


# Reload the exported CSV and adjust column dtypes where needed:
#   * every column in `date_data` goes through DataTransform.date_data()
#     (presumably so all dates share one format -- confirm in db_utils);
#   * every column in `categorical_data` is converted with
#     change_type(..., 'category').
# Any column in neither list is treated as numeric.

database = db_utils.load_csv('new_file.csv')
cleaned_data = db_utils.DataTransform(database)

date_data = [
    'issue_date',
    'earliest_credit_line',
    'last_payment_date',
    'next_payment_date',
    'last_credit_pull_date',
]

categorical_data = [
    'member_id',
    'term',
    'int_rate',
    'grade',
    'sub_grade',
    'employment_length',
    'home_ownership',
    'verification_status',
    'loan_status',
    'payment_plan',
    'purpose',
    'policy_code',
    'application_type',
]

non_numeric_data = date_data + categorical_data
column_headings = database.columns.tolist()
numeric_data = [
    heading for heading in column_headings if heading not in non_numeric_data
]

# Both helpers return a (dataframe, column) pair; the dataframe is carried
# forward into the next iteration.
for date_column in date_data:
    database, date_column = cleaned_data.date_data(database, date_column)

for categories in categorical_data:
    database, categories = cleaned_data.change_type(database, categories, 'category')


# Drop every column in which more than 50% of the values are null, then
# fill the nulls that remain in the other columns. Rows that still contain
# a null after imputation are dropped at the end.

df_info = db_utils.DataFrameInfo(database)
df_transform = db_utils.DataFrameTransform(database)
df_plot = db_utils.Plotter(database)

for column in column_headings:
    null_vals, null_percentage = df_info.missing(database, column)
    if null_percentage > 50:
        database = database.drop(column, axis=1)

# Sort by sub_grade before forward-filling so that a missing int_rate is
# taken from the preceding row in sub_grade order.
database = database.sort_values(by=['sub_grade'])
database['int_rate'] = database['int_rate'].ffill()

# Only fill the *missing* funded amounts with the requested loan amount;
# assigning the whole loan_amount column would overwrite valid values too.
database['funded_amount'] = database['funded_amount'].fillna(database['loan_amount'])

fill_values = {'collections_12_mths_ex_med': 0, 'term': database['term'].mode()[0]}
database = df_transform.fill_null(database, values=fill_values)

# The two date columns back-fill each other: a missing last payment date
# takes the last credit pull date and vice versa. Rows where both are
# missing stay null and are removed by the dropna() below.
last_payment_date_list = db_utils.make_list(database, 'last_payment_date')
last_credit_pull_date_list = db_utils.make_list(database, 'last_credit_pull_date')

# Iterate over the lists themselves rather than a hard-coded row count so
# the loop stays correct whatever the size of the dataframe.
for index, (payment_date, pull_date) in enumerate(
    zip(last_payment_date_list, last_credit_pull_date_list)
):
    if pd.isnull(payment_date):
        payment_date = pull_date
        last_payment_date_list[index] = pull_date

    if pd.isnull(pull_date):
        last_credit_pull_date_list[index] = payment_date

database['last_payment_date'] = last_payment_date_list
database['last_credit_pull_date'] = last_credit_pull_date_list
database = database.dropna(axis=0)


# Measure the skew of the numeric (integer and float) columns, pick out the
# ones whose skew exceeds 2, log-transform them and report the skew before
# and after the transform.

df_skew = database.skew(axis=0, numeric_only=True)
skewed_data = database[[skewed_col for skewed_col, skew_value in df_skew.items() if skew_value > 2]]
#df_plot.plot_hist(skewed_data)

log_skewed_data = skewed_data.copy()
for column in skewed_data:
    # log() is undefined for values <= 0, so those entries are mapped to 0.
    log_skewed_data[column] = skewed_data[column].map(lambda i: np.log(i) if i > 0 else 0)

log_skew = log_skewed_data.skew(axis=0)
for skew_col, value in log_skew.items():
    # Report the freshly computed log_skew; the previous code read an
    # undefined name (new_skew) and raised NameError on a clean run.
    print(f'{skew_col}\n Skew before: {df_skew.get(key = skew_col)}\n Skew after: {log_skew.get(key = skew_col)}')

#df_plot.plot_hist(log_skewed_data)

annual_inc
 Skew before: 8.766706946736596
 Skew after: 0.19223329606616388
delinq_2yrs
 Skew before: 5.3302689023540095
 Skew after: 5.387891655078983
inq_last_6mths
 Skew before: 3.2843306955572724
 Skew after: 1.9673229236482324
out_prncp
 Skew before: 2.358972868475207
 Skew after: 0.5976812020075605
out_prncp_inv
 Skew before: 2.3594154508797995
 Skew after: 0.5977595411717821
total_rec_int
 Skew before: 2.1940362256545747
 Skew after: -0.9270604653976019
total_rec_late_fee
 Skew before: 13.18644093613826
 Skew after: 5.475608583172492
recoveries
 Skew before: 14.133928797708249
 Skew after: 3.7893025396272577
collection_recovery_fee
 Skew before: 27.505679057194968
 Skew after: 5.386045962220784
last_payment_amount
 Skew before: 2.478841250388088
 Skew after: 0.042539683021739765
collections_12_mths_ex_med
 Skew before: 20.385589060161674
 Skew after: 66.14572498529017
