In [None]:
# Importing general packages
import pandas as pd
import numpy as np

In [None]:
# Load the treated loans dataframe.
# NOTE: `pd.read_csv` has no `index` keyword (that one belongs to
# `DataFrame.to_csv`); passing `index=False` raises a TypeError, so the
# file is read with the default arguments.
loan_df = pd.read_csv('../raw_data/treated_df.csv')

## Preprocessing Pipeline

In [None]:
# Split the loaded frame into the feature matrix and the target.
# The target is kept two-dimensional (a single-column DataFrame).
X = loan_df.drop('good_bad', axis=1)
y = loan_df.loc[:, ['good_bad']]

In [None]:
# Preview the first rows of the feature matrix (target column already dropped)
X.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,emp_length,home_ownership,annual_inc,purpose,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_amnt,collections_12_mths_ex_med,acc_now_delinq,tot_coll_amt,tot_cur_bal,total_rev_hi_lim
0,5000,5000,4975.0,36 months,10.65,162.87,B,10 years or more,RENT,24000.0,credit card,27.65,0.0,1.0,3.0,0.0,13648,83.7,9.0,0.0,0.0,5861.071414,5831.78,5000.0,861.07,0.0,0.0,0.0,171.62,0.0,0.0,,,
1,2500,2500,2500.0,60 months,15.27,59.83,C,0,RENT,30000.0,"home improvement, major purchase or car",1.0,0.0,5.0,3.0,0.0,1687,9.4,4.0,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,119.66,0.0,0.0,,,
2,2400,2400,2400.0,36 months,15.96,84.33,C,10 years or more,RENT,12252.0,small business or educational,8.72,0.0,2.0,2.0,0.0,2956,98.5,10.0,0.0,0.0,3003.653644,3003.65,2400.0,603.65,0.0,0.0,0.0,649.91,0.0,0.0,,,
3,10000,10000,10000.0,36 months,13.49,339.31,C,10 years or more,RENT,49200.0,"renewable energy, moving, house or other",20.0,0.0,1.0,10.0,0.0,5598,21.0,37.0,0.0,0.0,12226.30221,12226.3,10000.0,2209.33,16.97,0.0,0.0,357.48,0.0,0.0,,,
4,3000,3000,3000.0,60 months,12.69,67.79,B,1 year,RENT,80000.0,"renewable energy, moving, house or other",17.94,0.0,0.0,15.0,0.0,27783,53.9,38.0,766.9,766.9,3242.17,3242.17,2233.1,1009.07,0.0,0.0,0.0,67.79,0.0,0.0,,,


In [None]:
# Preview the first rows of the target (`good_bad`)
y.head()

Unnamed: 0,good_bad
0,1
1,0
2,1
3,1
4,1


In [None]:
# Separating columns by preprocessing type

# Categorical (object-dtype) features -> one-hot encoding.
# Selected from the feature matrix X (not loan_df) so the target column can
# never end up in the list of features handed to the ColumnTransformer.
cols_cat = X.select_dtypes('object').columns

# Numerical features with outliers -> median imputation + RobustScaler.
# Every column must appear exactly once: a repeated name makes the
# ColumnTransformer emit the same feature twice (duplicated, perfectly
# collinear column and a duplicated feature name). 'total_rev_hi_lim' was
# previously listed both first and last; the trailing duplicate is removed.
cols_robust = [
    'total_rev_hi_lim', 'int_rate', 'installment', 'annual_inc', 'dti',
    'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal',
    'revol_util', 'total_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
    'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
    'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
    'last_pymnt_amnt', 'collections_12_mths_ex_med', 'tot_coll_amt',
    'acc_now_delinq', 'tot_cur_bal',
]

# Numerical features without outliers -> mean imputation + StandardScaler.
cols_std = ['loan_amnt', 'funded_amnt', 'funded_amnt_inv']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.
#
# `stratify=y` makes the split keep the class proportions of y identical in
# the train and test sets. For example, if y is a binary variable with 25% of
# zeros and 75% of ones, both splits will also have 25% of 0's and 75% of 1's.
# Stratifying does NOT balance the dataset.
#
# `random_state` is fixed so the split (and every result computed from it)
# is reproducible after Restart Kernel -> Run All.
#
# TODO: the training set still has to be BALANCED (oversampling or
# undersampling, SMOTE?) before the fitted model is applied to the test data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((373028, 34), (93257, 34), (373028, 1), (93257, 1))

In [None]:
# TODO: balance the training set (oversampling / undersampling, e.g. SMOTE) - not implemented yet



In [None]:
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression

# --- Numerical features: impute first, then scale ---------------------------

# Features with no outliers: fill gaps with the mean, then standardise.
standard_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy="mean")),
        ('standard_scaler', StandardScaler()),
    ]
)

# Features with outliers: fill gaps with the median, then apply RobustScaler.
robust_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy="median")),
        ('robust_scaler', RobustScaler()),
    ]
)

# --- Categorical features: one-hot encode -----------------------------------
# Categories not seen during fit are ignored instead of raising.
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# --- Route each column group to its transformer -----------------------------
preprocessor = ColumnTransformer(
    transformers=[
        ('standard_scaler', standard_transformer, cols_std),
        ('robust_scaler', robust_transformer, cols_robust),
        ('cat_transformer', cat_transformer, cols_cat),
    ]
)

# --- Full pipeline: preprocessing followed by the model ----------------------
pipeline = Pipeline(
    steps=[
        ('preprocessing', preprocessor),
        ('model', LogisticRegression()),
    ]
)

pipeline

In [None]:
# Plan:
#   - fit_transform on the training set only
#   - transform (no fitting) on the test set
#   - check whether the model worked -> evaluate / scoring

# Fit the preprocessor on the training features and transform them.
X_train_transformed = preprocessor.fit_transform(X_train)

# Show the first five transformed rows, labelled with the generated feature names.
pd.DataFrame(
    data=X_train_transformed,
    columns=preprocessor.get_feature_names_out(),
).head(5)

Unnamed: 0,standard_scaler__loan_amnt,standard_scaler__funded_amnt,standard_scaler__funded_amnt_inv,robust_scaler__total_rev_hi_lim,robust_scaler__int_rate,robust_scaler__installment,robust_scaler__annual_inc,robust_scaler__dti,robust_scaler__delinq_2yrs,robust_scaler__inq_last_6mths,robust_scaler__open_acc,robust_scaler__pub_rec,robust_scaler__revol_bal,robust_scaler__revol_util,robust_scaler__total_acc,robust_scaler__out_prncp,robust_scaler__out_prncp_inv,robust_scaler__total_pymnt,robust_scaler__total_pymnt_inv,robust_scaler__total_rec_prncp,robust_scaler__total_rec_int,robust_scaler__total_rec_late_fee,robust_scaler__recoveries,robust_scaler__collection_recovery_fee,robust_scaler__last_pymnt_amnt,robust_scaler__collections_12_mths_ex_med,robust_scaler__tot_coll_amt,robust_scaler__acc_now_delinq,robust_scaler__tot_cur_bal,robust_scaler__total_rev_hi_lim.1,cat_transformer__term_ 36 months,cat_transformer__term_ 60 months,cat_transformer__grade_A,cat_transformer__grade_B,cat_transformer__grade_C,cat_transformer__grade_D,cat_transformer__grade_E,cat_transformer__grade_F,cat_transformer__grade_G,cat_transformer__emp_length_0,cat_transformer__emp_length_1 year,cat_transformer__emp_length_10 years or more,cat_transformer__emp_length_2 to 4 years,cat_transformer__emp_length_5 to 6 years,cat_transformer__emp_length_7 to 9 years,cat_transformer__emp_length_Unemployed,cat_transformer__home_ownership_MORTGAGE,cat_transformer__home_ownership_OTHER,cat_transformer__home_ownership_OWN,cat_transformer__home_ownership_RENT,cat_transformer__purpose_credit card,cat_transformer__purpose_debt consolidation,"cat_transformer__purpose_home improvement, major purchase or car","cat_transformer__purpose_medical, wedding or vacation","cat_transformer__purpose_renewable energy, moving, house or other",cat_transformer__purpose_small business or educational
0,-0.375876,-0.373309,-0.363862,0.233161,0.532727,-0.335655,-0.297959,-0.171329,0.0,3.0,0.333333,0.0,-0.16814,-0.650704,-0.666667,-0.063398,-0.06333,0.2906,0.297855,0.528867,-0.32875,0.0,0.0,0.0,3.512422,0.0,0.0,0.0,0.41579,0.233161,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,-0.424185,-0.421688,-0.412105,0.0,-1.387273,-0.164967,0.939716,-0.447552,0.0,0.0,0.166667,0.0,-0.302566,-1.214085,0.066667,-0.063398,-0.06333,0.247597,0.25476,0.480625,-0.336824,0.0,0.0,0.0,0.15345,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1.170027,1.174833,1.179906,-0.404145,-1.387273,1.133283,-0.229199,0.657343,0.0,1.0,-0.5,0.0,-0.610452,-1.008451,1.0,-0.063398,-0.06333,1.509376,1.519233,2.072628,-0.724458,0.0,0.0,0.0,8.24178,0.0,0.0,0.0,-0.078552,-0.404145,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,2.468343,2.475031,2.476429,1.207254,1.241818,2.974164,4.28602,-1.371503,6.0,3.0,0.333333,4.0,-0.06901,-0.96338,1.066667,-0.063398,-0.06333,1.035254,1.044099,0.621434,2.439186,0.0,0.0,0.0,0.263808,1.0,245.0,0.0,2.532352,1.207254,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.686932,0.691039,0.697478,0.0,0.06,0.981403,0.06876,0.903846,0.0,1.0,0.166667,0.0,0.419524,-0.307042,1.066667,-0.063398,-0.06333,1.415266,1.424922,1.590203,0.591222,0.0,0.0,0.0,4.102367,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
