In [17]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score

import joblib
from joblib import dump
from sklearn.ensemble import RandomForestClassifier

    
# Read the Data
df_x = pd.read_csv('./data/x_train_ohot_en.csv')
df_y = pd.read_csv('./data/y_train_ohot_en.csv')

df_x_test = pd.read_csv('./data/x_test_ohot_en.csv')
df_y_test = pd.read_csv('./data/y_test_ohot_en.csv')

# Oversampling
# sm = SMOTE()\
rus = RandomUnderSampler(random_state=42)

X_train, y_train = rus.fit_resample(df_x, df_y) 
X_test, y_test = rus.fit_resample(df_x_test, df_y_test) 


# Scaling
cols = list(X_train.columns)
scaler = MinMaxScaler()

    
X_train_s = scaler.fit_transform(X_train)
X_train = pd.DataFrame(X_train_s, columns=list(X_train.columns))

X_test_s = scaler.fit_transform(X_test)
X_test = pd.DataFrame(X_test_s, columns=list(X_test.columns))


# # Initialize and train the Random Forest model
rf_pipeline = Pipeline([('classifier', RandomForestClassifier())])

rf_pipeline.fit(X_train, y_train)

# Testing the Pipeline

y_pred = rf_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
print('Accuracy: {} %'.format(100 * accuracy_score(y_test, y_pred)))

# Saving the trained model

dump(rf_pipeline, 'rf_classifier.joblib')

joblib.dump(rf_pipeline, 'trained_model.pkl')

  return fit_method(estimator, *args, **kwargs)


              precision    recall  f1-score   support

         0.0       0.98      0.95      0.96       491
         1.0       0.95      0.97      0.96       491
         2.0       0.96      0.98      0.97       491
         3.0       0.80      0.77      0.79       491
         4.0       0.62      0.55      0.58       491
         5.0       0.66      0.64      0.65       491
         6.0       0.81      0.95      0.87       491

    accuracy                           0.83      3437
   macro avg       0.83      0.83      0.83      3437
weighted avg       0.83      0.83      0.83      3437

Accuracy: 82.921152167588 %


['trained_model.pkl']

In [52]:
scaler_params = pd.DataFrame(columns=['feature_name','min_','std_'])
for c in range(len(cols)):
    scaler_params_stg = pd.DataFrame.from_dict({"feature_name": [cols[c]],  
                                                "min_": [X_train[cols[c]].min()],  
                                                "std_": [X_train[cols[c]].std()]})
    scaler_params = pd.concat([scaler_params, scaler_params_stg], ignore_index = True) 

scaler_params    
    # scaled_data[c] = all_scalers[c].minmax_scaler(X_train[c])

Unnamed: 0,feature_name,min_,std_
0,term,0.00000,0.497187
1,emp_length,0.00000,3.109631
2,loan_amnt,0.00010,0.000941
3,funded_amnt,0.00010,0.000941
4,int_rate,0.00067,0.000973
...,...,...,...
88,ohe__addr_state_WI,0.00000,0.104826
89,ohe__addr_state_WV,0.00000,0.045131
90,ohe__addr_state_WY,0.00000,0.052095
91,ohe__application_type_Individual,0.00000,0.139590


In [58]:
def std_scaler(x,min_,std_):
    return (x - min_) / std_

for i in range(len(cols)):
    min_info = scaler_params[scaler_params.feature_name == cols[i]].min_
    stf_info = scaler_params[scaler_params.feature_name == cols[i]].std_
    
    X_train[cols[i]] = X_train.apply(lambda x: std_scaler(x[cols[i]],min_info,stf_info),axis=1)
    

In [59]:
X_train.head(2)

Unnamed: 0,term,emp_length,loan_amnt,funded_amnt,int_rate,installment,annual_inc,dti,fico_range_low,fico_range_high,...,ohe__addr_state_TX,ohe__addr_state_UT,ohe__addr_state_VA,ohe__addr_state_VT,ohe__addr_state_WA,ohe__addr_state_WI,ohe__addr_state_WV,ohe__addr_state_WY,ohe__application_type_Individual,ohe__application_type_JointApp
208220,0.0,0.0,0.427199,0.427199,0.0,0.408192,1.147678,1.462101,1.35682,1.356801,...,3.545269,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.163839,0.0
206713,0.0,0.0,0.640799,0.640799,0.345636,0.642189,0.789892,0.574995,1.187217,1.187201,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.163839,0.0


In [6]:
# def transformer(inp):
#     output = {}
#     for c in cols:
#         output[c] = all_scalers[c].minmax_scaler(inp[c])
#     return output

In [12]:
# all_scalers["term"]

<__main__.scaler at 0x106a45e50>

{}

In [12]:
path_train = f'/Users/kristinazekiene/Documents/LearningPython/TuringCollege/M3S3/output_grade/BQ/x_train.csv'
path_test = f'/Users/kristinazekiene/Documents/LearningPython/TuringCollege/M3S3/output_grade/BQ/x_test.csv'
X_train.to_csv(path_train,index=False)
X_test.to_csv(path_test,index=False)

path_ytrain = f'/Users/kristinazekiene/Documents/LearningPython/TuringCollege/M3S3/output_grade/BQ/y_train.csv'
path_ytest = f'/Users/kristinazekiene/Documents/LearningPython/TuringCollege/M3S3/output_grade/BQ/y_test.csv'
y_train.to_csv(path_ytrain,index=False)
y_test.to_csv(path_ytest,index=False)

In [4]:
# df_x = pd.read_csv('./data/x_train_ohot_en.csv')
# df_y = pd.read_csv('./data/y_train_ohot_en.csv')

# df_x = df_x.drop(columns=['emp_title_mod'])
# df_x = df_x.rename(columns={
#         'ohe__verification_status_Not Verified': 'ohe__verification_status_NotVerified',
#         'ohe__verification_status_Source Verified':'ohe__verification_status_SourceVerified',
#         'ohe__application_type_Joint App':'ohe__application_type_JointApp'})

# df_x_test = pd.read_csv('./data/x_test_ohot_en.csv')
# df_y_test = pd.read_csv('./data/y_test_ohot_en.csv')

# df_x_test = df_x_test.drop(columns=['emp_title_mod'])
# df_x_test = df_x_test.rename(columns={
#         'ohe__verification_status_Not Verified': 'ohe__verification_status_NotVerified',
#         'ohe__verification_status_Source Verified':'ohe__verification_status_SourceVerified',
#         'ohe__application_type_Joint App':'ohe__application_type_JointApp'})

In [10]:
# path_train = f'/Users/kristinazekiene/Documents/LearningPython/TuringCollege/M3S3/loan_grade_deployment_fastapi/app/data/x_train_ohot_en.csv'
# path_test = f'/Users/kristinazekiene/Documents/LearningPython/TuringCollege/M3S3/loan_grade_deployment_fastapi/app/data/x_test_ohot_en.csv'
# df_x.to_csv(path_train,index=False)
# df_x_test.to_csv(path_test,index=False)

In [34]:
df_x_test = pd.read_csv('./data/x_test_ohot_en.csv')
df_y_test = pd.read_csv('./data/y_test_ohot_en.csv')
df_x_test.head(3)

Unnamed: 0,term,emp_length,loan_amnt,funded_amnt,int_rate,installment,annual_inc,dti,fico_range_low,fico_range_high,...,ohe__addr_state_TX,ohe__addr_state_UT,ohe__addr_state_VA,ohe__addr_state_VT,ohe__addr_state_WA,ohe__addr_state_WI,ohe__addr_state_WV,ohe__addr_state_WY,ohe__application_type_Individual,ohe__application_type_JointApp
0,0.0,1.0,0.003218,0.003218,0.004601,0.003877,0.002703,0.007333,0.003582,0.003581,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.004023,0.004023,0.003653,0.004598,0.001952,0.00247,0.003531,0.003531,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,6.0,0.004023,0.004023,0.002891,0.004404,0.001952,0.000753,0.003506,0.003506,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [97]:
i = 0 #14, 24 - A, 26 - F,
ordinal_mapping_grade = {'0.0': 'A', '1.0': 'B', '2.0': 'C', '3.0': 'D', '4.0': 'E.0', '5.0': 'F', '6.0': 'G'}
print(df_y_test.iloc[i])
print(ordinal_mapping_grade[str(df_y_test.grade[i])])

prod_item = df_x_test.iloc[i]
prod_item

grade    3.0
Name: 0, dtype: float64
D


term                                0.000000
emp_length                          1.000000
loan_amnt                           0.003218
funded_amnt                         0.003218
int_rate                            0.004601
                                      ...   
ohe__addr_state_WI                  0.000000
ohe__addr_state_WV                  0.000000
ohe__addr_state_WY                  0.000000
ohe__application_type_Individual    1.000000
ohe__application_type_JointApp      0.000000
Name: 0, Length: 93, dtype: float64

In [80]:
result = prod_item.to_json(orient="index")
result

'{"term":1.0,"emp_length":1.0,"loan_amnt":0.0060345034,"funded_amnt":0.0060345401,"int_rate":0.0060777762,"installment":0.0057780699,"annual_inc":0.0033041801,"dti":0.0062541933,"fico_range_low":0.0033545924,"fico_range_high":0.0033555034,"inq_last_6mths":0.0102315213,"open_acc":0.0055387428,"pub_rec":0.0,"revol_bal":0.0087539935,"revol_util":0.0049161341,"cr_line_duration":0.0068180658,"mths_since_last_delinq":0.0,"delinq_2yrs":0.0034590068,"total_pymnt":0.0057866291,"total_rec_prncp":0.0029814703,"total_rec_int":0.0150874339,"ohe__home_ownership_ANY":0.0,"ohe__home_ownership_MORTGAGE":1.0,"ohe__home_ownership_OWN":0.0,"ohe__home_ownership_RENT":0.0,"ohe__verification_status_NotVerified":0.0,"ohe__verification_status_SourceVerified":0.0,"ohe__verification_status_Verified":1.0,"ohe__purpose_car":0.0,"ohe__purpose_credit_card":0.0,"ohe__purpose_debt_consolidation":1.0,"ohe__purpose_home_improvement":0.0,"ohe__purpose_house":0.0,"ohe__purpose_major_purchase":0.0,"ohe__purpose_medical":0.

In [81]:
from json import loads, dumps
parsed = loads(result)
parsed

{'term': 1.0,
 'emp_length': 1.0,
 'loan_amnt': 0.0060345034,
 'funded_amnt': 0.0060345401,
 'int_rate': 0.0060777762,
 'installment': 0.0057780699,
 'annual_inc': 0.0033041801,
 'dti': 0.0062541933,
 'fico_range_low': 0.0033545924,
 'fico_range_high': 0.0033555034,
 'inq_last_6mths': 0.0102315213,
 'open_acc': 0.0055387428,
 'pub_rec': 0.0,
 'revol_bal': 0.0087539935,
 'revol_util': 0.0049161341,
 'cr_line_duration': 0.0068180658,
 'mths_since_last_delinq': 0.0,
 'delinq_2yrs': 0.0034590068,
 'total_pymnt': 0.0057866291,
 'total_rec_prncp': 0.0029814703,
 'total_rec_int': 0.0150874339,
 'ohe__home_ownership_ANY': 0.0,
 'ohe__home_ownership_MORTGAGE': 1.0,
 'ohe__home_ownership_OWN': 0.0,
 'ohe__home_ownership_RENT': 0.0,
 'ohe__verification_status_NotVerified': 0.0,
 'ohe__verification_status_SourceVerified': 0.0,
 'ohe__verification_status_Verified': 1.0,
 'ohe__purpose_car': 0.0,
 'ohe__purpose_credit_card': 0.0,
 'ohe__purpose_debt_consolidation': 1.0,
 'ohe__purpose_home_improveme

In [33]:
pd.DataFrame.from_dict([parsed])

Unnamed: 0,term,emp_length,loan_amnt,funded_amnt,int_rate,installment,annual_inc,dti,fico_range_low,fico_range_high,...,ohe__addr_state_TX,ohe__addr_state_UT,ohe__addr_state_VA,ohe__addr_state_VT,ohe__addr_state_WA,ohe__addr_state_WI,ohe__addr_state_WV,ohe__addr_state_WY,ohe__application_type_Individual,ohe__application_type_JointApp
0,0.0,1.0,0.003218,0.003218,0.004601,0.003877,0.002703,0.007333,0.003582,0.003581,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


After MinMAX scaler

In [18]:
X_test
y_test = y_test.reset_index()

In [52]:
i = 895 #0, 1000, 2000, 2500
ordinal_mapping_grade = {'0.0': 'A', '1.0': 'B', '2.0': 'C', '3.0': 'D', '4.0': 'E', '5.0': 'F', '6.0': 'G'}
print(y_test.iloc[i])
print(ordinal_mapping_grade[str(y_test.grade[i])])

prod_item = X_test.iloc[i]
prod_item

index    18867.0
grade        1.0
Name: 895, dtype: float64
B


term                                0.000000
emp_length                          0.200000
loan_amnt                           0.871795
funded_amnt                         0.871795
int_rate                            0.119595
                                      ...   
ohe__addr_state_WI                  0.000000
ohe__addr_state_WV                  0.000000
ohe__addr_state_WY                  0.000000
ohe__application_type_Individual    1.000000
ohe__application_type_JointApp      0.000000
Name: 895, Length: 93, dtype: float64

In [53]:
result = prod_item.to_json(orient="index")
result

'{"term":0.0,"emp_length":0.2,"loan_amnt":0.8717948718,"funded_amnt":0.8717948718,"int_rate":0.1195948578,"installment":0.7275600721,"annual_inc":0.0807174888,"dti":0.2895225464,"fico_range_low":0.1666666667,"fico_range_high":0.1666666667,"inq_last_6mths":0.0,"open_acc":0.2708333333,"pub_rec":0.0,"revol_bal":0.0610566267,"revol_util":0.4715189873,"cr_line_duration":0.387755102,"mths_since_last_delinq":0.0,"delinq_2yrs":0.0,"total_pymnt":0.6666953152,"total_rec_prncp":0.8476235,"total_rec_int":0.1950433878,"ohe__home_ownership_ANY":0.0,"ohe__home_ownership_MORTGAGE":1.0,"ohe__home_ownership_OWN":0.0,"ohe__home_ownership_RENT":0.0,"ohe__verification_status_NotVerified":0.0,"ohe__verification_status_SourceVerified":1.0,"ohe__verification_status_Verified":0.0,"ohe__purpose_car":0.0,"ohe__purpose_credit_card":0.0,"ohe__purpose_debt_consolidation":0.0,"ohe__purpose_home_improvement":1.0,"ohe__purpose_house":0.0,"ohe__purpose_major_purchase":0.0,"ohe__purpose_medical":0.0,"ohe__purpose_moving

In [54]:
from json import loads, dumps
parsed = loads(result)
parsed

{'term': 0.0,
 'emp_length': 0.2,
 'loan_amnt': 0.8717948718,
 'funded_amnt': 0.8717948718,
 'int_rate': 0.1195948578,
 'installment': 0.7275600721,
 'annual_inc': 0.0807174888,
 'dti': 0.2895225464,
 'fico_range_low': 0.1666666667,
 'fico_range_high': 0.1666666667,
 'inq_last_6mths': 0.0,
 'open_acc': 0.2708333333,
 'pub_rec': 0.0,
 'revol_bal': 0.0610566267,
 'revol_util': 0.4715189873,
 'cr_line_duration': 0.387755102,
 'mths_since_last_delinq': 0.0,
 'delinq_2yrs': 0.0,
 'total_pymnt': 0.6666953152,
 'total_rec_prncp': 0.8476235,
 'total_rec_int': 0.1950433878,
 'ohe__home_ownership_ANY': 0.0,
 'ohe__home_ownership_MORTGAGE': 1.0,
 'ohe__home_ownership_OWN': 0.0,
 'ohe__home_ownership_RENT': 0.0,
 'ohe__verification_status_NotVerified': 0.0,
 'ohe__verification_status_SourceVerified': 1.0,
 'ohe__verification_status_Verified': 0.0,
 'ohe__purpose_car': 0.0,
 'ohe__purpose_credit_card': 0.0,
 'ohe__purpose_debt_consolidation': 0.0,
 'ohe__purpose_home_improvement': 1.0,
 'ohe__purpo