# Model Build and Analysis

In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
import statsmodels.api as sm          # Importing statsmodels for linear model 
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score as rsq
#Random Forest Regrssor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
# for integer encoding using feature-engine
from feature_engine.categorical_encoders import OrdinalCategoricalEncoder
#for XGBOOST 
import xgboost as xgb

In [2]:
# reading the data
data=pd.read_csv(r"data_no_null.csv",encoding='latin1')# reading the data

In [3]:
# Remove the 'Unnamed: 0' column (presumably an index column saved with the CSV — confirm).
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell re-runnable: executing it a second time no longer raises a KeyError.
data = data.drop(columns='Unnamed: 0', errors='ignore')

# preprocessing and data cleaning 

### Dropping some unnecessary columns

Deletion 1: I am dropping the `sub_grade` column because its cardinality is high, and grouping its values would only reproduce the information that is already available in the `grade` column.

Deletion 2: I am dropping the `State` column because it only holds abbreviations such as ['FL', 'MD', 'OH', ..., 'NH', 'MD', 'FL'], which give no further information about the states and are therefore irrelevant as user input.

Deletion 3: I am deleting the `Emp_designation` column because it has high cardinality and there is not enough information to group its values into a low-cardinality feature.

Deletion 4: I am deleting the `last_week_pay` column because it has high cardinality and there is not enough information to group its values into a low-cardinality feature.



In [4]:
# Columns left out of the modelling frame (the reasons are given in the deletion notes above).
cols_to_drop = [
    'sub_grade',
    'State',
    'Emp_designation',
    'last_week_pay',
]
# `columns=` already names the axis, so no separate axis argument is required.
data_1 = data.drop(columns=cols_to_drop)

In [None]:
data_1.columns

In [5]:
categorical = [col for col in data_1.columns if data_1[col].dtypes == 'O']

categorical

['terms',
 'grade',
 'home_ownership',
 'verification_status',
 'purpose',
 'initial_list_status',
 'application_type',
 'Experience']

In [None]:
data_1

## Here I am making the 'purpose' column low cardinality by grouping similar labels together
Example:

        label1: debt_consolidation                as debt_consolidation

        label2: credit_card                       as credit_card

        label3: home_improvement                  as home_improvement

        label4: major_purchase+car+house          as major_purchase

        label5: small_business+renewable_energy   as small_business

        label6: wedding+other+vacation+moving     as other

        label7: medical+educational               as medical(or)education


In [6]:
data.purpose.value_counts().sort_values()

educational              423
renewable_energy         575
wedding                 2347
house                   3707
vacation                4736
moving                  5414
medical                 8540
car                     8863
small_business         10377
major_purchase         17277
other                  42894
home_improvement       51829
credit_card           206182
debt_consolidation    524215
Name: purpose, dtype: int64

In [7]:
# Collapse the rarer 'purpose' labels into broader groups to lower the cardinality.
# Each key is an original label; its value is the group that replaces it.
purpose_groups = {
    'car': 'major_purchase',
    'house': 'major_purchase',
    'renewable_energy': 'small_business',
    'wedding': 'other',
    'vacation': 'other',
    'moving': 'other',
    'medical': 'medical(or)education',
    'educational': 'medical(or)education',
}
data_1[['purpose']] = data_1[['purpose']].replace(
    list(purpose_groups.keys()),
    list(purpose_groups.values()))

In [8]:
data_1.purpose.value_counts().sort_values()

medical(or)education      8963
small_business           10952
major_purchase           29847
home_improvement         51829
other                    55391
credit_card             206182
debt_consolidation      524215
Name: purpose, dtype: int64

In [9]:
data_1.application_type.value_counts().sort_values()

JOINT            511
INDIVIDUAL    886868
Name: application_type, dtype: int64

## Train-Test split 

In [10]:
data_1.columns

Index(['loan_amnt ', 'terms', 'Rate_of_intrst', 'grade', 'home_ownership',
       'annual_inc', 'verification_status', 'purpose', 'debt_income_ratio',
       'delinq_2yrs', 'inq_last_6mths', 'numb_credit', 'pub_rec',
       'total revol_bal', 'total_credits', 'initial_list_status',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'collections_12_mths_ex_med',
       'application_type', 'acc_now_delinq', 'Experience',
       'mths_since_last_delinq', 'tot_curr_bal', 'tot_colle_amt'],
      dtype='object')

In [11]:
# to split the datasets
from sklearn.model_selection import train_test_split

# for one hot encoding with feature-engine
from feature_engine.categorical_encoders import OneHotCategoricalEncoder

In [12]:
# Separate predictors from the target, then hold out 30% of the rows for testing.
target_col = 'total revol_bal'
predictor_cols = [
    'loan_amnt ', 'terms', 'Rate_of_intrst', 'grade', 'home_ownership',
    'annual_inc', 'verification_status', 'purpose', 'debt_income_ratio',
    'delinq_2yrs', 'inq_last_6mths', 'numb_credit', 'pub_rec',
    'total_credits', 'initial_list_status', 'total_rec_int',
    'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
    'collections_12_mths_ex_med', 'application_type', 'acc_now_delinq',
    'Experience', 'mths_since_last_delinq', 'tot_curr_bal', 'tot_colle_amt',
]

X_train, X_test, y_train, y_test = train_test_split(
    data_1[predictor_cols],
    data_1[[target_col]],  # double brackets keep the target as a one-column DataFrame
    test_size=0.3,
    random_state=0)  # fixed seed so the split is reproducible

X_train.shape, X_test.shape

((621165, 26), (266214, 26))

In [13]:
categorical = [col for col in X_train.columns if X_train[col].dtypes == 'O']

categorical

['terms',
 'grade',
 'home_ownership',
 'verification_status',
 'purpose',
 'initial_list_status',
 'application_type',
 'Experience']

### One-Hot Encoding
- One-hot encoding creates one additional feature for every unique value of a categorical feature. In other words, it is the process of creating dummy variables.

In [14]:
ohe_enc = OneHotCategoricalEncoder(
    variables=categorical,
    drop_last=False)

ohe_enc.fit(X_train)

OneHotCategoricalEncoder(variables=['terms', 'grade', 'home_ownership',
                                    'verification_status', 'purpose',
                                    'initial_list_status', 'application_type',
                                    'Experience'])

In [15]:
# in the encoder dict we can observe each of the top categories
# selected for each of the variables

ohe_enc.encoder_dict_

{'terms': array(['60 months', '36 months'], dtype=object),
 'grade': array(['D', 'C', 'A', 'B', 'F', 'E', 'G'], dtype=object),
 'home_ownership': array(['MORTGAGE', 'RENT', 'OWN', 'OTHER', 'NONE', 'ANY'], dtype=object),
 'verification_status': array(['Verified', 'Source Verified', 'Not Verified'], dtype=object),
 'purpose': array(['debt_consolidation', 'credit_card', 'home_improvement',
        'small_business', 'other', 'major_purchase',
        'medical(or)education'], dtype=object),
 'initial_list_status': array(['w', 'f'], dtype=object),
 'application_type': array(['INDIVIDUAL', 'JOINT'], dtype=object),
 'Experience': array(['10+ years', '2 years', '4 years', '< 1 year', '3 years',
        '7 years', '8 years', '1 year', '6 years', '5 years', '9 years'],
       dtype=object)}

In [16]:
X_train = ohe_enc.transform(X_train)
X_test = ohe_enc.transform(X_test)

# let's explore the result
X_train.head()

Unnamed: 0,loan_amnt,Rate_of_intrst,annual_inc,debt_income_ratio,delinq_2yrs,inq_last_6mths,numb_credit,pub_rec,total_credits,total_rec_int,...,Experience_2 years,Experience_4 years,Experience_< 1 year,Experience_3 years,Experience_7 years,Experience_8 years,Experience_1 year,Experience_6 years,Experience_5 years,Experience_9 years
416835,35000,16.55,125000.0,25.83,0.0,3.0,19.0,0.0,52.0,911.92,...,0,0,0,0,0,0,0,0,0,0
596073,10000,12.29,24000.0,13.7,0.0,1.0,6.0,0.0,27.0,901.03,...,1,0,0,0,0,0,0,0,0,0
759330,8650,8.94,30000.0,11.8,0.0,0.0,5.0,0.0,21.0,710.73,...,0,1,0,0,0,0,0,0,0,0
398838,5000,11.53,66000.0,34.45,0.0,1.0,15.0,0.0,22.0,384.6,...,0,0,0,0,0,0,0,0,0,0
401686,12000,9.99,84000.0,10.31,0.0,0.0,8.0,4.0,12.0,932.96,...,0,0,1,0,0,0,0,0,0,0


In [17]:
X_test.head()

Unnamed: 0,loan_amnt,Rate_of_intrst,annual_inc,debt_income_ratio,delinq_2yrs,inq_last_6mths,numb_credit,pub_rec,total_credits,total_rec_int,...,Experience_2 years,Experience_4 years,Experience_< 1 year,Experience_3 years,Experience_7 years,Experience_8 years,Experience_1 year,Experience_6 years,Experience_5 years,Experience_9 years
372989,10000,13.99,47000.0,12.69,0.0,0.0,14.0,0.0,15.0,754.43,...,0,0,0,0,0,0,0,0,0,0
9769,18000,12.99,60000.0,19.66,0.0,0.0,9.0,0.0,23.0,6287.44,...,0,0,0,1,0,0,0,0,0,0
134461,6000,15.1,50000.0,28.01,1.0,1.0,11.0,0.0,19.0,1333.97,...,0,0,0,0,0,0,0,0,0,0
425772,12950,14.99,36899.0,15.88,0.0,2.0,7.0,0.0,21.0,2125.63,...,0,0,1,0,0,0,0,0,0,0
409080,1900,9.99,42000.0,23.06,0.0,1.0,11.0,0.0,22.0,148.69,...,0,0,1,0,0,0,0,0,0,0


In [18]:
d=data_1.iloc[[0]]
d_1=d[['loan_amnt ', 'terms', 'Rate_of_intrst', 'grade', 'home_ownership','annual_inc', 'verification_status',
            'purpose', 'debt_income_ratio','delinq_2yrs', 'inq_last_6mths', 'numb_credit', 'pub_rec','total_credits', 
            'initial_list_status','total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 
            'collections_12_mths_ex_med', 'application_type', 'acc_now_delinq', 'Experience', 'mths_since_last_delinq',
            'tot_curr_bal', 'tot_colle_amt']]

In [None]:
import pickle


In [None]:
'''#saving the model to the local file system
filename = 'finalized_model.pickle'
pickle.dump(xgb_clf, open(filename, 'wb'))'''


In [None]:
'''filehandler = encoder.pickle
pickle.dump(ohe_enc,open(filehandler,'wb'))'''

In [None]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in 0.23;
# importing the standalone `joblib` package is the supported replacement.
import joblib

# Persist the fitted one-hot encoder so it can be reloaded without refitting.
# NOTE(review): the file is named 'model.pkl' although it stores the encoder, not a model.
joblib.dump(ohe_enc , 'model.pkl')

In [None]:
encoder = joblib.load('model.pkl')


In [None]:
#X_test = ohe_enc.transform(X_test)
d_test = encoder.transform(d_1)

In [None]:
d_1

In [None]:
d_test

# model analysis 

### scaling

In [None]:
''''# the scaler - for standardisation
from sklearn.preprocessing import StandardScaler'''

In [None]:
'''# standardisation: with the StandardScaler from sklearn

# set up the scaler
scaler = StandardScaler()

# fit the scaler to the train set, it will learn the parameters
scaler.fit(X_train)

# transform train and test sets
X_train_scaled = scaler.transform(X_train)
#X_test_scaled = scaler.transform(X_test)'''

In [None]:
#X_train_scaled

In [None]:
#joblib.dump(scaler , 'scaler.pkl')

In [None]:
#scaler = joblib.load('scaler.pkl')

In [None]:
#X_test_scaled = scaler.transform(X_test)

In [None]:
#X_test_scaled

In [None]:
def normalize(x):
    """Min-max scale ``x`` into the range [0, 1].

    Parameters
    ----------
    x : pandas.Series or numpy.ndarray
        Numeric values to rescale (``DataFrame.apply`` passes one column at a time).

    Returns
    -------
    Same kind of object as ``x``, holding ``(x - min) / (max - min)``. When every
    value is identical the range is zero, and zeros are returned instead of the
    NaN values that a division by zero would produce.
    """
    # NumPy reductions replace the builtin max()/min(), which loop over the
    # values in Python and are slow on large columns.
    lowest = np.min(x)
    span = np.max(x) - lowest
    if span == 0:
        return x - lowest  # constant column: every entry maps to 0
    return (x - lowest) / span
                                            
                                              
# Apply normalize() to all columns: DataFrame.apply calls it once per column,
# so every column is rescaled with its own minimum and maximum.
# NOTE(review): only the training frames are rescaled here; X_test / y_test are
# not passed through normalize in this cell — confirm that is intended.
X_train = X_train.apply(normalize) 
y_train = y_train.apply(normalize)

### model building 

In [None]:
import statsmodels.api as sm          # Importing statsmodels
X_train = sm.add_constant(X_train)    # Adding a constant column to our dataframe
# create a first fitted model
lm_1 = sm.OLS(y_train,X_train).fit()

In [None]:
#Let's see the summary of our first linear model
print(lm_1.summary())

In [None]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

##############################################################################################################################

In [None]:
# Read the data. A path relative to the notebook replaces the machine-specific
# absolute Windows path, matching how the first read_csv call in this notebook
# loads the same file.
df=pd.read_csv(r"data_no_null.csv",encoding='latin1')

In [None]:
df.head()

In [None]:
# Rebind instead of dropping in place, and ignore a column that is already gone,
# so the cell can be re-run without raising a KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
categorical = [col for col in df.columns if df[col].dtypes == 'O']

categorical

In [None]:
df_1=df.drop(columns=categorical,axis=1)

In [None]:
df_1.columns

In [None]:
def normalize(x):
    """Min-max scale ``x`` into the range [0, 1].

    Parameters
    ----------
    x : pandas.Series or numpy.ndarray
        Numeric values to rescale (``DataFrame.apply`` passes one column at a time).

    Returns
    -------
    Same kind of object as ``x``, holding ``(x - min) / (max - min)``. When every
    value is identical the range is zero, and zeros are returned instead of the
    NaN values that a division by zero would produce.
    """
    # NumPy reductions replace the builtin max()/min(), which loop over the
    # values in Python and are slow on large columns.
    lowest = np.min(x)
    span = np.max(x) - lowest
    if span == 0:
        return x - lowest  # constant column: every entry maps to 0
    return (x - lowest) / span

df_1 = df_1.apply(normalize)

In [None]:
df_1

In [None]:
df_1.columns

In [None]:
# let's separate into training and testing set

X_train, X_test, y_train, y_test = train_test_split(
    df_1[['loan_amnt ', 'Rate_of_intrst', 'annual_inc', 'debt_income_ratio',
       'delinq_2yrs', 'inq_last_6mths', 'numb_credit', 'pub_rec',
        'total_credits', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'collections_12_mths_ex_med', 'acc_now_delinq',
       'mths_since_last_delinq', 'tot_curr_bal', 'tot_colle_amt']],  # predictors
    df_1[['total revol_bal']],  # target
    test_size=0.3,  # percentage of obs in test set
    random_state=0)  # seed to ensure reproducibility

X_train.shape, X_test.shape

In [None]:
X_train = sm.add_constant(X_train)    # Adding a constant column to our dataframe
# create a first fitted model
lm_1 = sm.OLS(y_train,X_train).fit()

In [None]:
#Let's see the summary of our first linear model
print(lm_1.summary())

###############################################################################################################################

In [None]:
# Read the raw data. A path relative to the notebook replaces the machine-specific
# absolute Windows path.
# NOTE(review): assumes "data (1).csv" sits next to the notebook, as data_no_null.csv does — confirm.
df2=pd.read_csv(r"data (1).csv",encoding='latin1')

In [None]:
df2_a=df2.drop(columns=['member_id ','batch_ID ','verification_status_joint','mths_since_last_major_derog',
                        'mths_since_last_record','mths_since_last_delinq'],axis=1)

In [None]:
df2_a.columns

In [None]:
categorical = [col for col in df2_a.columns if df2_a[col].dtypes == 'O']

categorical

In [None]:
df2_b=df2_a.drop(columns=categorical,axis=1)

In [None]:
df2_b

In [None]:
df2_c = df2_b.dropna(how='any',axis=0) 

In [None]:
df2_c

In [None]:
'''#defining a normalisation function 
def normalize (x): 
    return ( (x-np.min(x))/ (max(x) - min(x)))

df2_c = df2_c.apply(normalize)'''

In [None]:
df2_c.columns

In [None]:
X=df2_c[['loan_amnt ', 'Rate_of_intrst', 'annual_inc', 'debt_income_ratio',
       'delinq_2yrs', 'inq_last_6mths', 'numb_credit', 'pub_rec',
        'total_credits', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'collections_12_mths_ex_med', 'acc_now_delinq', 'tot_colle_amt',
       'tot_curr_bal']]
Y=df2_c[['total revol_bal']]

In [None]:
from sklearn.preprocessing import StandardScaler
# Separate scalers for predictors and target, so the target scaling can be
# undone on its own later via sc_y.inverse_transform.
sc_X = StandardScaler()
sc_y = StandardScaler()
# NOTE(review): both scalers are fitted on the full data set, before the
# train/test split, so test-row statistics feed into the scaling — consider
# fitting on the training rows only.
X = sc_X.fit_transform(X)
Y = sc_y.fit_transform(Y)

In [None]:
# let's separate into training and testing set

X_train, X_test, y_train, y_test = train_test_split(X  ,  # predictors
                                                    Y,  # target
    test_size=0.3,  # percentage of obs in test set
    random_state=0)  # seed to ensure reproducibility

X_train.shape, X_test.shape

In [None]:
X_train = sm.add_constant(X_train)    # Adding a constant column to our dataframe
# create a first fitted model
lm_1 = sm.OLS(y_train,X_train).fit()

In [None]:
#Let's see the summary of our first linear model
print(lm_1.summary())

In [None]:
y_test=pd.DataFrame(data=sc_y.inverse_transform(y_test),columns=['revolving_balance'])

In [None]:
#X_test_m6 = sm.add_constant(X_test)
X_test_cons = sm.add_constant(X_test)

In [None]:
# StandardScaler.inverse_transform works on 2-D (n_samples, n_features) input,
# while the model's predictions come back as a flat array: reshape them into a
# single column for the inverse scaling, then flatten the result so it can be
# inserted as one DataFrame column.
prediction = sc_y.inverse_transform(
    np.asarray(lm_1.predict(X_test_cons)).reshape(-1, 1)).ravel()

In [None]:
X_test.shape

In [None]:
y_test.insert(loc=1,column='prediction',value=prediction)

In [None]:
y_test

In [None]:
import numpy as np
from sklearn import metrics
print('RMSE :', np.sqrt(metrics.mean_squared_error(y_test['revolving_balance'], y_test['prediction'])))

###############################################################################################################################

In [None]:
# Read the data set that has no null values. A path relative to the notebook
# replaces the machine-specific absolute Windows path, matching how the first
# read_csv call in this notebook loads the same file.
df3=pd.read_csv(r"data_no_null.csv",encoding='latin1')

In [None]:
# list of columns not necessary for prediction
cols_to_drop=['Unnamed: 0','sub_grade','State','Emp_designation','last_week_pay']
# dropping the unnecessary columns
df3_a=df3.drop(columns=cols_to_drop,axis=1)

In [None]:
categorical = [col for col in df3_a.columns if df3_a[col].dtypes == 'O']

categorical

In [None]:
#df[['A','B']] = df[['A','B']].replace([1, 3, 2], [3, 6, 7])
df3_a[['purpose']] = df3_a[['purpose']].replace(['car','house','renewable_energy','wedding','vacation','moving','medical','educational'],
                                              ['major_purchase','major_purchase','small_business','other','other','other','medical(or)education','medical(or)education'])

In [None]:
df3_a.purpose.value_counts().sort_values()

In [None]:
from feature_engine.categorical_encoders import CountFrequencyCategoricalEncoder

In [None]:
count_enc = CountFrequencyCategoricalEncoder(
    encoding_method='frequency', # to do frequency ==> encoding_method='frequency'
    variables=categorical)

df3_encoded = count_enc.fit_transform(df3_a)

In [None]:
count_enc.encoder_dict_

In [None]:
df3_encoded.columns

In [None]:
X=df3_encoded[['loan_amnt ', 'terms', 'Rate_of_intrst', 'grade', 'home_ownership',
       'annual_inc', 'verification_status', 'purpose', 'debt_income_ratio',
       'delinq_2yrs', 'inq_last_6mths', 'numb_credit', 'pub_rec',
        'total_credits', 'initial_list_status',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'collections_12_mths_ex_med',
       'application_type', 'acc_now_delinq', 'Experience',
       'mths_since_last_delinq', 'tot_curr_bal', 'tot_colle_amt']]
Y=df3_encoded[['total revol_bal']]

In [None]:
from sklearn.preprocessing import StandardScaler
# Separate scalers for predictors and target, so the target scaling can be
# undone on its own later via sc_y.inverse_transform.
sc_X = StandardScaler()
sc_y = StandardScaler()
# NOTE(review): both scalers are fitted on the full data set, before the
# train/test split, so test-row statistics feed into the scaling — consider
# fitting on the training rows only.
X = sc_X.fit_transform(X)
Y = sc_y.fit_transform(Y)

In [None]:
# let's separate into training and testing set

X_train, X_test, y_train, y_test = train_test_split(X  ,  # predictors
                                                    Y,  # target
    test_size=0.3,  # percentage of obs in test set
    random_state=0)  # seed to ensure reproducibility

X_train.shape, X_test.shape

In [None]:
X_train = sm.add_constant(X_train)    # Adding a constant column to our dataframe
# create a first fitted model
lm_1 = sm.OLS(y_train,X_train).fit()

In [None]:
#Let's see the summary of our first linear model
print(lm_1.summary())

In [None]:
y_test=pd.DataFrame(data=sc_y.inverse_transform(y_test),columns=['revolving_balance'])

In [None]:
#X_test_m6 = sm.add_constant(X_test)
X_test_cons = sm.add_constant(X_test)

In [None]:
# StandardScaler.inverse_transform works on 2-D (n_samples, n_features) input,
# while the model's predictions come back as a flat array: reshape them into a
# single column for the inverse scaling, then flatten the result so it can be
# inserted as one DataFrame column.
prediction = sc_y.inverse_transform(
    np.asarray(lm_1.predict(X_test_cons)).reshape(-1, 1)).ravel()

In [None]:
X_test.shape

In [None]:
y_test.insert(loc=1,column='prediction',value=prediction)

In [None]:
y_test

In [None]:
import numpy as np
from sklearn import metrics
print('RMSE :', np.sqrt(metrics.mean_squared_error(y_test['revolving_balance'], y_test['prediction'])))

#######################################RANDOM FOREST REGRESSOR#################################

In [None]:
df3_encoded

In [None]:
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score as rsq
#Random Forest Regrssor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [None]:
df3_encoded.columns

In [None]:
categorical

In [None]:
X_c=df3_encoded[['loan_amnt ',  'Rate_of_intrst', 
       'annual_inc',   'debt_income_ratio',
       'delinq_2yrs', 'inq_last_6mths', 'numb_credit', 'pub_rec',
        'total_credits', 
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'collections_12_mths_ex_med',
        'acc_now_delinq',
       'mths_since_last_delinq', 'tot_curr_bal', 'tot_colle_amt']]
Y_c = df3_encoded[['total revol_bal']]

In [None]:
# let's separate into training and testing set

X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_c  ,  # predictors
                                                    Y_c,  # target
    test_size=0.3,  # percentage of obs in test set
    random_state=0)  # seed to ensure reproducibility

X_train_c.shape, X_test_c.shape

In [None]:
#'''params_rand={
#        'n_estimators':[100,200,500,1000],
#        'max_depth':[10,15,20],
#        'max_features':['auto', 'sqrt', 'log2'],
#        'random_state':[42]
#         }'''

In [None]:
#'''rf_model=RandomForestRegressor()
#grid_search_rf = RandomizedSearchCV(rf_model,param_distributions=params_rand,n_jobs=-1,cv=5)
#grid_search_rf.fit(X_train_c,y_train_c)
#grid_search_rf.best_estimator_
#grid_search_rf.best_params_'''

In [None]:
# Baseline forest on the numerical predictors; random_state fixes the result.
rf_model = RandomForestRegressor(n_estimators=20, max_depth=15, random_state=42)
# y_train_c is a one-column DataFrame; scikit-learn expects a 1-D target here and
# warns about a column-vector y otherwise, so flatten it explicitly.
rf_model.fit(X_train_c, y_train_c.values.ravel())

In [None]:
rf_pred_train=rf_model.predict(X_train_c) 

In [None]:
rf_train_r2=rsq(y_train_c,rf_pred_train)            
rf_train_r2

In [None]:
rf_pred_test=rf_model.predict(X_test_c)
rf_test_r2=rsq(y_test_c,rf_pred_test)
rf_test_r2

In [None]:
import numpy as np
from sklearn import metrics
print(' train RMSE :', np.sqrt(metrics.mean_squared_error(y_train_c,rf_pred_train)))
print(' test RMSE :', np.sqrt(metrics.mean_squared_error(y_test_c,rf_pred_test)))

In [None]:
# Tabulate the fitted forest's importance score next to each predictor name.
k=list(rf_model.feature_importances_)
j=list(X_train_c.columns)
# The table gets its own name so that the raw `data` frame and the sample row `d`
# created earlier in the notebook are not overwritten.
rf_importance = pd.DataFrame({'columns': j, 'feature importance': k})
rf_importance

I am dropping some of the columns whose importance in our model is low.

In [None]:
X_c_1=df3_encoded[['loan_amnt ',  'Rate_of_intrst', 
       'annual_inc',   'debt_income_ratio',
        'inq_last_6mths', 'numb_credit', 'pub_rec',
        'total_credits', 
       'total_rec_int', 
      
        
       'mths_since_last_delinq', 'tot_curr_bal', ]]
Y_c_1 = df3_encoded[['total revol_bal']]

In [None]:
# let's separate into training and testing set

X_train_c_1, X_test_c_1, y_train_c_1, y_test_c_1 = train_test_split(X_c_1  ,  # predictors
                                                    Y_c_1,  # target
    test_size=0.3,  # percentage of obs in test set
    random_state=0)  # seed to ensure reproducibility

X_train_c_1.shape, X_test_c_1.shape

In [None]:
# Forest refitted on the reduced set of numerical predictors.
rf_model_1 = RandomForestRegressor(n_estimators=25, max_depth=20, random_state=42)
# y_train_c_1 is a one-column DataFrame; scikit-learn expects a 1-D target here and
# warns about a column-vector y otherwise, so flatten it explicitly.
rf_model_1.fit(X_train_c_1, y_train_c_1.values.ravel())

In [None]:
rf_pred_train_1=rf_model_1.predict(X_train_c_1) 
rf_train_r2_1=rsq(y_train_c_1,rf_pred_train_1)            
rf_train_r2_1

In [None]:
rf_pred_test_1=rf_model_1.predict(X_test_c_1)
rf_test_r2_1=rsq(y_test_c_1,rf_pred_test_1)
rf_test_r2_1

In [None]:
print(' train RMSE :', np.sqrt(metrics.mean_squared_error(y_train_c_1,rf_pred_train_1)))
print(' test RMSE :', np.sqrt(metrics.mean_squared_error(y_test_c_1,rf_pred_test_1)))

In [None]:
# Tabulate the importance score of each predictor used by rf_model_1.
k_1=list(rf_model_1.feature_importances_)
j_1=list(X_train_c_1.columns)
# The table gets its own name so that the `data_1` frame and the sample row `d_1`
# created earlier in the notebook are not overwritten.
rf_importance_1 = pd.DataFrame({'columns': j_1, 'feature importance': k_1})
rf_importance_1

### including categorical columns 

In [None]:
df3_encoded.columns

In [None]:
X_c_2=df3_encoded[['loan_amnt ', 'terms', 'Rate_of_intrst', 'grade', 'home_ownership',
       'annual_inc', 'verification_status', 'purpose', 'debt_income_ratio',
       'delinq_2yrs', 'inq_last_6mths', 'numb_credit', 'pub_rec',
        'total_credits', 'initial_list_status',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'collections_12_mths_ex_med',
       'application_type', 'acc_now_delinq', 'Experience',
       'mths_since_last_delinq', 'tot_curr_bal', 'tot_colle_amt' ]]
Y_c_2 = df3_encoded[['total revol_bal']]

In [None]:
# let's separate into training and testing set

X_train_c_2, X_test_c_2, y_train_c_2, y_test_c_2 = train_test_split(X_c_2  ,  # predictors
                                                    Y_c_2,  # target
    test_size=0.3,  # percentage of obs in test set
    random_state=0)  # seed to ensure reproducibility

X_train_c_2.shape, X_test_c_2.shape

In [None]:
# Forest fitted on numerical plus frequency-encoded categorical predictors.
rf_model_2 = RandomForestRegressor(n_estimators=25, max_depth=20, random_state=42)
# y_train_c_2 is a one-column DataFrame; scikit-learn expects a 1-D target here and
# warns about a column-vector y otherwise, so flatten it explicitly.
rf_model_2.fit(X_train_c_2, y_train_c_2.values.ravel())

In [None]:
rf_pred_train_2=rf_model_2.predict(X_train_c_2) 
rf_train_r2_2=rsq(y_train_c_2,rf_pred_train_2)            
rf_train_r2_2

In [None]:
rf_pred_test_2=rf_model_2.predict(X_test_c_2)
rf_test_r2_2=rsq(y_test_c_2,rf_pred_test_2)
rf_test_r2_2

In [None]:
print(' train RMSE :', np.sqrt(metrics.mean_squared_error(y_train_c_2,rf_pred_train_2)))
print(' test RMSE :', np.sqrt(metrics.mean_squared_error(y_test_c_2,rf_pred_test_2)))

In [None]:
k_2=list(rf_model_2.feature_importances_)
j_2=list(X_train_c_2.columns)
data_2 ={'columns': j_2, 'feature importance':k_2}
d_2=pd.DataFrame(data_2)
d_2

In [None]:
# I am dropping the columns whose importance is less than 1%

In [None]:
df3_encoded.columns

In [None]:
X_c_2_a=df3_encoded[['loan_amnt ',  'Rate_of_intrst', 'grade', 'home_ownership',
       'annual_inc',  'purpose', 'debt_income_ratio',
        'inq_last_6mths', 'numb_credit', 'pub_rec',
        'total_credits', 
       'total_rec_int', 
    
         'Experience',
       'mths_since_last_delinq', 'tot_curr_bal' ]]
Y_c_2_a = df3_encoded[['total revol_bal']]

In [None]:
# let's separate into training and testing set

X_train_c_2_a, X_test_c_2_a, y_train_c_2_a, y_test_c_2_a = train_test_split(X_c_2_a  ,  # predictors
                                                    Y_c_2_a,  # target
    test_size=0.3,  # percentage of obs in test set
    random_state=0)  # seed to ensure reproducibility

X_train_c_2_a.shape, X_test_c_2_a.shape

In [None]:
# Forest refitted on the reduced predictor set selected above.
rf_model_2_a = RandomForestRegressor(n_estimators=25, max_depth=20, random_state=42)
# y_train_c_2_a is a one-column DataFrame; scikit-learn expects a 1-D target here and
# warns about a column-vector y otherwise, so flatten it explicitly.
rf_model_2_a.fit(X_train_c_2_a, y_train_c_2_a.values.ravel())

In [None]:
rf_pred_train_2_a=rf_model_2_a.predict(X_train_c_2_a) 
rf_train_r2_2_a=rsq(y_train_c_2_a,rf_pred_train_2_a)            
rf_train_r2_2_a

In [None]:
rf_pred_test_2_a=rf_model_2_a.predict(X_test_c_2_a)
rf_test_r2_2_a=rsq(y_test_c_2_a,rf_pred_test_2_a)
rf_test_r2_2_a

In [None]:
print(' train RMSE :', np.sqrt(metrics.mean_squared_error(y_train_c_2_a,rf_pred_train_2_a)))
print(' test RMSE :', np.sqrt(metrics.mean_squared_error(y_test_c_2_a,rf_pred_test_2_a)))

In [None]:
k_2_a=list(rf_model_2_a.feature_importances_)
j_2_a=list(X_train_c_2_a.columns)
data_2_a ={'columns': j_2_a, 'feature importance':k_2_a}
d_2_a=pd.DataFrame(data_2_a)
d_2_a

##   xgboost with label encoder 

In [None]:
# for integer encoding using feature-engine
from feature_engine.categorical_encoders import OrdinalCategoricalEncoder

In [None]:
# Read the data. A path relative to the notebook replaces the machine-specific
# absolute Windows path, matching how the first read_csv call in this notebook
# loads the same file.
data_xg=pd.read_csv(r"data_no_null.csv",encoding='latin1')

In [None]:
# list of columns not necessary for prediction
cols_to_drop=['Unnamed: 0','sub_grade','State','Emp_designation','last_week_pay']
# dropping the unnecessary columns
df_xg=data_xg.drop(columns=cols_to_drop,axis=1)

In [None]:
categorical = [col for col in df_xg.columns if df_xg[col].dtypes == 'O']

categorical

In [None]:
#df[['A','B']] = df[['A','B']].replace([1, 3, 2], [3, 6, 7])
df_xg[['purpose']] = df_xg[['purpose']].replace(['car','house','renewable_energy','wedding','vacation','moving','medical','educational'],
                                              ['major_purchase','major_purchase','small_business','other','other','other','medical(or)education','medical(or)education'])

In [None]:
df_xg.purpose.value_counts().sort_values()

In [None]:
ordinal_enc = OrdinalCategoricalEncoder(
    encoding_method='arbitrary',
    variables=categorical)

df_xg_label_enc = ordinal_enc.fit_transform(df_xg)

In [None]:
#ordinal_enc.encoder_dict_

In [None]:
df_xg_label_enc.columns

In [None]:
X_xg_l=df_xg_label_enc[['loan_amnt ', 'terms', 'Rate_of_intrst', 'grade', 'home_ownership',
       'annual_inc', 'verification_status', 'purpose', 'debt_income_ratio',
       'delinq_2yrs', 'inq_last_6mths', 'numb_credit', 'pub_rec',
       'total_credits', 'initial_list_status',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'collections_12_mths_ex_med',
       'application_type', 'acc_now_delinq', 'Experience',
       'mths_since_last_delinq', 'tot_curr_bal', 'tot_colle_amt']]
Y_xg_l=df_xg_label_enc[['total revol_bal']]

In [None]:
# let's separate into training and testing set

X_train_xg_l, X_test_xg_l, y_train_xg_l, y_test_xg_l = train_test_split(X_xg_l  ,  # predictors
                                                    Y_xg_l,  # target
    test_size=0.3,  # percentage of obs in test set
    random_state=0)  # seed to ensure reproducibility

X_train_xg_l.shape, X_test_xg_l.shape

In [None]:
xgb_mod_l=xgb.XGBRegressor()
# Candidate hyper-parameter values sampled by the randomized search.
params_l={
        'learning_rate':[0.03,0.05,0.08,0.10,0.15,0.20,0.25,0.30],
        'max_depth':[3,4,5,6,8,10,12,15],
        'min_child_weight':[1,3,5,7],
        'gamma':[0.0,0.1,0.2,0.3,0.4]
        }
# random_state pins which n_iter=5 combinations are drawn, so re-running the
# search evaluates the same candidates.
random_search=RandomizedSearchCV(xgb_mod_l,param_distributions=params_l,n_iter=5,n_jobs=-1,cv=5,verbose=3,random_state=0)
random_search.fit(X_train_xg_l,y_train_xg_l)
# Only the last expression of a cell is rendered, so return both results together.
random_search.best_estimator_, random_search.best_params_

In [None]:
xgb_mod_l=xgb.XGBRegressor(min_child_weight=3, max_depth=6, learning_rate=0.15, gamma=0.4)
xgb_mod_l.fit(X_train_xg_l,y_train_xg_l)


In [None]:
xg_pred_train_l=xgb_mod_l.predict(X_train_xg_l) 
xg_train_l_r2=rsq(y_train_xg_l,xg_pred_train_l)            
xg_train_l_r2

In [None]:
xg_pred_test_l=xgb_mod_l.predict(X_test_xg_l)
xg_test_l_r2=rsq(y_test_xg_l,xg_pred_test_l)
xg_test_l_r2

In [None]:
print(' train RMSE :', np.sqrt(metrics.mean_squared_error(y_train_xg_l,xg_pred_train_l)))
print(' test RMSE :', np.sqrt(metrics.mean_squared_error(y_test_xg_l,xg_pred_test_l)))

In [None]:
k_xg_l=list(xgb_mod_l.feature_importances_)
j_xg_l=list(X_train_xg_l.columns)
data_xg_l ={'columns': j_xg_l, 'feature importance':k_xg_l}
d_xg_l=pd.DataFrame(data_xg_l)
d_xg_l

In [None]:
comparision=y_test_xg_l.copy()
comparision.insert(loc=1,column='prediction',value=xg_pred_test_l)
comparision

## xg-boost with frequency encoder 

In [None]:
ordinal_enc = CountFrequencyCategoricalEncoder(
    encoding_method='frequency',
    variables=categorical)

df_xg_fre_enc = ordinal_enc.fit_transform(df_xg)

In [None]:
X_xg_f=df_xg_fre_enc[['loan_amnt ', 'terms', 'Rate_of_intrst', 'grade', 'home_ownership',
       'annual_inc', 'verification_status', 'purpose', 'debt_income_ratio',
       'delinq_2yrs', 'inq_last_6mths', 'numb_credit', 'pub_rec',
       'total_credits', 'initial_list_status',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'collections_12_mths_ex_med',
       'application_type', 'acc_now_delinq', 'Experience',
       'mths_since_last_delinq', 'tot_curr_bal', 'tot_colle_amt']]
Y_xg_f=df_xg_fre_enc[['total revol_bal']]

In [None]:
# let's separate into training and testing set

X_train_xg_f, X_test_xg_f, y_train_xg_f, y_test_xg_f = train_test_split(X_xg_f  ,  # predictors
                                                    Y_xg_f,  # target
    test_size=0.3,  # percentage of obs in test set
    random_state=0)  # seed to ensure reproducibility

X_train_xg_f.shape, X_test_xg_f.shape

In [None]:
xgb_mod_f=xgb.XGBRegressor()
# Candidate hyper-parameter values sampled by the randomized search.
params_f={
        'learning_rate':[0.03,0.05,0.08,0.10,0.15,0.20,0.25,0.30],
        'max_depth':[3,4,5,6,8,10,12,15],
        'min_child_weight':[1,3,5,7],
        'gamma':[0.0,0.1,0.2,0.3,0.4]
        }
# random_state pins which n_iter=5 combinations are drawn, so re-running the
# search evaluates the same candidates.
random_search_f=RandomizedSearchCV(xgb_mod_f,param_distributions=params_f,n_iter=5,n_jobs=-1,cv=5,verbose=3,random_state=0)
random_search_f.fit(X_train_xg_f,y_train_xg_f)
# Only the last expression of a cell is rendered, so return both results together.
random_search_f.best_estimator_, random_search_f.best_params_

In [None]:
xgb_mod_f=xgb.XGBRegressor(min_child_weight=5, max_depth=3, learning_rate=0.08, gamma=0.1)
xgb_mod_f.fit(X_train_xg_f,y_train_xg_f)


In [None]:
xg_pred_train_f=xgb_mod_f.predict(X_train_xg_f) 
xg_train_f_r2=rsq(y_train_xg_f,xg_pred_train_f)            
xg_train_f_r2

In [None]:
xg_pred_test_f=xgb_mod_f.predict(X_test_xg_f)
xg_test_f_r2=rsq(y_test_xg_f,xg_pred_test_f)
xg_test_f_r2

In [None]:
print(' train RMSE :', np.sqrt(metrics.mean_squared_error(y_train_xg_f,xg_pred_train_f)))
print(' test RMSE :', np.sqrt(metrics.mean_squared_error(y_test_xg_f,xg_pred_test_f)))

In [None]:
k_xg_f=list(xgb_mod_f.feature_importances_)
j_xg_f=list(X_train_xg_f.columns)
data_xg_f ={'columns': j_xg_f, 'feature importance':k_xg_f}
d_xg_f=pd.DataFrame(data_xg_f)
d_xg_f

In [None]:
comparision_f=y_test_xg_f.copy()
comparision_f.insert(loc=1,column='prediction',value=xg_pred_test_f)
comparision_f

### End of Model Build and Analysis (complete representation in the PPT)
  - Linear Regression: 
        With a train/test split of 70/30, the R-squared value is 0.261 (for both the NULL and NOT NULL data) and the RMSE values are 18947.63 (NULL) and 18851.73 (NOT NULL).
        #The RMSE values are pretty close and could be considered a good fit, but only if the other models do not prove otherwise.
  - Random Forest: 
        The RF model did not prove to be a good model, considering the variation in the R-squared and RMSE values across the 4 different model variants.
        The best model amongst the 4 used the NOT NULL data with ONLY NUMERICAL variables, with R-squared values of 0.66 (TRAIN) & 0.33 (TEST) and RMSE values of 13296 (TRAIN) & 17496 (TEST).
  - XGBoost :
        XGBoost was the best model amongst all types, as the RMSE values were pretty close and the R-squared values were fairly reasonable considering the type of data provided.
    
    

    Finally, a model is chosen to proceed with the final build and eventually deploy a web application predicting the revolving balance.    

    