In [1]:
#preprocessing tools
import pandas as pd
import numpy as np

#EDA Tools
import pandas_profiling
%matplotlib inline
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

#modeling
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import svm
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.svm import SVR , LinearSVR


## Hyperparameter optimization using RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV


import warnings
warnings.filterwarnings('ignore')

## Preprocessing

In [2]:
train_data=pd.read_csv('CreditScore_train.csv')

In [3]:
test_data=pd.read_csv('CreditScore_test.csv')

In [4]:
train_data.shape

(80000, 305)

In [5]:
test_data.shape

(20000, 305)

In [6]:
data=pd.concat([train_data,test_data],axis='rows',sort=False,ignore_index=False)

In [7]:
data.head()

Unnamed: 0,x001,x002,x003,x004,x005,x006,x007,x008,x009,x010,...,x296,x297,x298,x299,x300,x301,x302,x303,x304,y
0,1084094,426.0,39.0,128.0,426.0,0,0,0,0,0,...,0,,0,0,0,0,,0,,807
1,1287777,160.0,2.0,64.0,160.0,1,1,2,0,1,...,17318,0.8417,1,1,1,0,,0,,819
2,1483016,163.0,16.0,104.0,239.0,0,0,0,1,0,...,0,,0,0,0,0,,0,,803
3,959054,,,,102.0,0,0,0,0,0,...,0,,1,1,1,0,,0,,530
4,1342113,3.0,2.0,2.0,62.0,0,2,2,0,0,...,17413,1.018,1,1,1,0,,0,,485


In [8]:
data.columns

Index(['x001', 'x002', 'x003', 'x004', 'x005', 'x006', 'x007', 'x008', 'x009',
       'x010',
       ...
       'x296', 'x297', 'x298', 'x299', 'x300', 'x301', 'x302', 'x303', 'x304',
       'y'],
      dtype='object', length=305)

In [9]:
data.info(verbose=True,null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 19999
Data columns (total 305 columns):
x001    100000 non-null int64
x002    78568 non-null float64
x003    78568 non-null float64
x004    78576 non-null float64
x005    93890 non-null float64
x006    100000 non-null int64
x007    100000 non-null int64
x008    100000 non-null int64
x009    100000 non-null int64
x010    100000 non-null int64
x011    100000 non-null int64
x012    100000 non-null int64
x013    100000 non-null int64
x014    100000 non-null int64
x015    100000 non-null int64
x016    100000 non-null int64
x017    100000 non-null int64
x018    100000 non-null int64
x019    100000 non-null int64
x020    100000 non-null int64
x021    100000 non-null int64
x022    100000 non-null int64
x023    100000 non-null int64
x024    100000 non-null int64
x025    100000 non-null int64
x026    100000 non-null int64
x027    100000 non-null int64
x028    100000 non-null int64
x029    100000 non-null int64
x030    100000 

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 19999
Columns: 305 entries, x001 to y
dtypes: float64(41), int64(264)
memory usage: 233.5 MB


## Multicollinearity

In [11]:
drop_columns=['x017','x030','x065','x072','x105','x115','x120','x125','x127','x128','x129','x131','x135','x139','x141','x142','x143','x144','x146','x150','x152','x157','x159','x165','x167','x172','x173','x178','x179','x187','x188','x189','x191','x192','x195','x201','x203','x204','x206','x207','x208','x210','x212','x214','x216','x218','x219','x221','x223','x225','x227','x231','x246','x249','x250','x257','x261','x267','x270','x273','x276','x278','x281','x283','x290','x295','x297','x299','x067','x094','x095','x096','x073','x074','x116','x121','x137','x138','x140','x153','x160','x196','x202','x209','x213','x215','x237','x239','x247','x262','x271','x277','x279','x284','x300','x130','x220','x292']
data.drop(columns=drop_columns,inplace=True)

In [12]:
data.shape

(100000, 207)

## Remove Missing Value Greater Than 25%

In [13]:
drop_columns=['x304','x302','x293','x289','x288','x275','x268','x266','x265','x259','x256','x255','x253','x242','x238','x222',
             'x162','x155','x148','x098','x058','x057','x293','x041']
data.drop(columns=drop_columns,inplace=True)

In [14]:
data.shape

(100000, 184)

In [15]:
data.columns

Index(['x001', 'x002', 'x003', 'x004', 'x005', 'x006', 'x007', 'x008', 'x009',
       'x010',
       ...
       'x285', 'x286', 'x287', 'x291', 'x294', 'x296', 'x298', 'x301', 'x303',
       'y'],
      dtype='object', length=184)

## Remove Unique Values

In [16]:
data.drop(columns=['x001'],inplace = True)

### x002,x003,x004,x005 columns are Positively skewed

In [17]:
data["x002"].fillna(data["x002"].median(),inplace = True)

In [18]:
data["x003"].fillna(data["x003"].median(),inplace = True)

In [19]:
data["x004"].fillna(data["x004"].median(),inplace = True)

In [20]:
data["x005"].fillna(data["x005"].median(),inplace = True)

In [21]:
data["x044"].fillna(data["x044"].mean(),inplace = True)

In [22]:
data["x045"].fillna(data["x045"].median(),inplace = True)

In [23]:
data["x234"].fillna(data["x234"].mean(),inplace = True)

In [24]:
data["x235"].fillna(data["x235"].median(),inplace = True)

In [25]:
data["x272"].fillna(data["x272"].mean(),inplace = True)

In [26]:
data["x287"].fillna(data["x287"].mode()[0],inplace = True)

In [27]:
data.info(verbose=True,null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 19999
Data columns (total 183 columns):
x002    100000 non-null float64
x003    100000 non-null float64
x004    100000 non-null float64
x005    100000 non-null float64
x006    100000 non-null int64
x007    100000 non-null int64
x008    100000 non-null int64
x009    100000 non-null int64
x010    100000 non-null int64
x011    100000 non-null int64
x012    100000 non-null int64
x013    100000 non-null int64
x014    100000 non-null int64
x015    100000 non-null int64
x016    100000 non-null int64
x018    100000 non-null int64
x019    100000 non-null int64
x020    100000 non-null int64
x021    100000 non-null int64
x022    100000 non-null int64
x023    100000 non-null int64
x024    100000 non-null int64
x025    100000 non-null int64
x026    100000 non-null int64
x027    100000 non-null int64
x028    100000 non-null int64
x029    100000 non-null int64
x031    100000 non-null int64
x032    100000 non-null int64
x033    100

## Explorartory Data Analysis(EDA)

In [28]:
pd.options.display.float_format = '{:,.3f}'.format
corr_result=pd.DataFrame(data[data.columns[1:]].corr()['y'][:]).T

In [29]:
for i in data.columns[0:]:
    print(i, data[i].corr(data['y']))

x002 0.4689602247914201
x003 0.16795639856268593
x004 0.41818507241409303
x005 0.5729257045894965
x006 0.09214381702297549
x007 0.008622644375465796
x008 -0.02192000049984312
x009 -0.0015218572363849916
x010 0.04365350422033073
x011 0.11311063173901759
x012 0.11755835818544343
x013 0.1781628123399855
x014 0.47434247497310805
x015 0.30884224553895484
x016 0.19417175079948346
x018 0.41046217683451697
x019 0.4132837884376316
x020 0.4156879660656005
x021 0.24280364327524007
x022 0.5688841954308335
x023 0.48063920134477306
x024 -0.059220687575074794
x025 0.4873764961489122
x026 0.2125082722263131
x027 0.42420611785102147
x028 0.4147597589645361
x029 0.2178318476037858
x031 -0.2800987212723916
x032 -0.07771866302967714
x033 -0.11286844474446285
x034 -0.1916940682258981
x035 -0.25484206741708915
x036 -0.31205465151322226
x037 -0.09127003239733891
x038 -0.12503725781646266
x039 -0.2165530122050227
x040 -0.2917141169098679
x042 0.29336312577587936
x043 0.3684458764992495
x044 0.1893824496321845

In [30]:
data.corr()

Unnamed: 0,x002,x003,x004,x005,x006,x007,x008,x009,x010,x011,...,x285,x286,x287,x291,x294,x296,x298,x301,x303,y
x002,1.000,0.308,0.833,0.718,0.107,-0.099,-0.120,-0.052,0.003,0.066,...,-0.007,-0.048,-0.184,0.059,-0.004,0.072,-0.072,-0.072,0.021,0.469
x003,0.308,1.000,0.711,0.248,0.212,-0.206,-0.226,-0.155,-0.119,-0.097,...,-0.111,-0.002,-0.033,-0.089,-0.005,-0.101,-0.185,-0.010,-0.038,0.168
x004,0.833,0.711,1.000,0.612,0.202,-0.228,-0.263,-0.171,-0.105,-0.038,...,-0.068,-0.029,-0.143,-0.023,-0.001,-0.026,-0.179,-0.049,-0.004,0.418
x005,0.718,0.248,0.612,1.000,0.131,-0.048,-0.062,-0.015,0.034,0.106,...,-0.048,-0.063,-0.212,0.050,-0.015,0.067,-0.051,-0.092,-0.009,0.573
x006,0.107,0.212,0.202,0.131,1.000,-0.129,-0.112,-0.007,0.026,0.056,...,-0.015,0.027,0.032,-0.012,0.014,-0.022,-0.008,-0.004,-0.009,0.092
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
x296,0.072,-0.101,-0.026,0.067,-0.022,0.160,0.188,0.198,0.201,0.241,...,0.452,-0.015,-0.020,0.886,0.066,1.000,0.334,0.088,0.609,0.110
x298,-0.072,-0.185,-0.179,-0.051,-0.008,0.215,0.269,0.282,0.277,0.287,...,0.461,0.141,0.312,0.353,0.145,0.334,1.000,0.147,0.242,-0.044
x301,-0.072,-0.010,-0.049,-0.092,-0.004,0.003,0.008,0.016,0.022,0.076,...,0.237,0.261,0.351,0.151,0.163,0.088,0.147,1.000,0.260,-0.224
x303,0.021,-0.038,-0.004,-0.009,-0.009,0.091,0.105,0.126,0.143,0.213,...,0.578,0.122,0.108,0.688,0.360,0.609,0.242,0.260,1.000,-0.035


## Remove unrelated and weak relationship 05 to -0.5

In [31]:
removeValue = [];
for i in data.columns[0:]:
    if (data[i].corr(data['y']) <= 0.1 and data[i].corr(data['y']) >= -0.1):
        removeValue.append(i)

In [32]:
##plt.figure(figsize=(25,10))
##sns.heatmap(data.corr(),annot=True)

In [33]:
data.drop(columns=removeValue,inplace=True)

In [34]:
data.shape

(100000, 139)

In [35]:
##sns.heatmap(data.corr(),annot=True)

In [36]:
data.corr()

Unnamed: 0,x002,x003,x004,x005,x011,x012,x013,x014,x015,x016,...,x263,x264,x274,x282,x286,x287,x294,x296,x301,y
x002,1.000,0.308,0.833,0.718,0.066,0.098,0.158,0.518,0.276,0.235,...,0.021,0.042,0.009,0.077,-0.048,-0.184,-0.004,0.072,-0.072,0.469
x003,0.308,1.000,0.711,0.248,-0.097,-0.056,-0.025,0.073,-0.092,-0.151,...,-0.177,-0.154,0.033,0.074,-0.002,-0.033,-0.005,-0.101,-0.010,0.168
x004,0.833,0.711,1.000,0.612,-0.038,0.021,0.087,0.396,0.102,0.040,...,-0.112,-0.090,0.023,0.073,-0.029,-0.143,-0.001,-0.026,-0.049,0.418
x005,0.718,0.248,0.612,1.000,0.106,0.132,0.200,0.613,0.367,0.202,...,0.076,0.076,0.020,0.167,-0.063,-0.212,-0.015,0.067,-0.092,0.573
x011,0.066,-0.097,-0.038,0.106,1.000,0.480,0.320,0.245,0.664,0.451,...,0.237,0.220,0.034,0.222,0.098,0.108,0.077,0.241,0.076,0.113
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
x287,-0.184,-0.033,-0.143,-0.212,0.108,0.141,0.090,-0.116,0.046,-0.045,...,0.023,-0.009,0.135,0.078,0.195,1.000,0.155,-0.020,0.351,-0.432
x294,-0.004,-0.005,-0.001,-0.015,0.077,0.112,0.103,0.059,0.111,0.146,...,0.095,0.095,0.134,0.062,0.225,0.155,1.000,0.066,0.163,-0.112
x296,0.072,-0.101,-0.026,0.067,0.241,0.238,0.236,0.220,0.357,0.414,...,0.318,0.458,0.003,0.144,-0.015,-0.020,0.066,1.000,0.088,0.110
x301,-0.072,-0.010,-0.049,-0.092,0.076,0.210,0.203,0.019,0.123,0.076,...,0.002,-0.013,0.066,0.012,0.261,0.351,0.163,0.088,1.000,-0.224


## Modeling

In [None]:
data.to_excel('Credit_Score_Prediction.xlsx',index=False)

In [38]:

x=data.drop(columns=['y'])

In [39]:
y=data['y']

In [40]:
x.shape

(100000, 138)

In [41]:
y.shape

(100000,)

In [42]:
x.head()

Unnamed: 0,x002,x003,x004,x005,x011,x012,x013,x014,x015,x016,...,x260,x263,x264,x274,x282,x286,x287,x294,x296,x301
0,426.0,39.0,128.0,426.0,2,4,4,9,19,5,...,1,0,0,0,1,0,1.0,0,0,0
1,160.0,2.0,64.0,160.0,3,3,7,5,21,5,...,1,1,17318,0,1,0,1.0,0,17318,0
2,163.0,16.0,104.0,239.0,1,0,0,6,8,3,...,0,0,0,0,1,0,1.0,0,0,0
3,100.0,8.0,48.0,102.0,0,2,2,0,4,0,...,0,0,0,619,1,0,9.0,0,0,0
4,3.0,2.0,2.0,62.0,1,2,0,0,5,2,...,0,1,17413,21424,1,0,9.0,0,17413,0


In [43]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=1)

In [44]:
x_train.shape

(70000, 138)

In [45]:
x_test.shape

(30000, 138)

In [46]:
y_train.shape

(70000,)

In [47]:
y_test.shape

(30000,)

In [48]:
rmse_train=[];
rmse_test=[];
model_name=[];
def modeling(model,x_train,x_test,y_train,y_test,name):
    model.fit(x_train,y_train)
    y_train_predicated=model.predict(x_train);
    train_error=np.sqrt(mean_squared_error(y_train,y_train_predicated));
    print(train_error)
    y_test_predicated=model.predict(x_test);
    test_error=np.sqrt(mean_squared_error(y_test,y_test_predicated))
    print(test_error)
    rmse_train.append(train_error)
    rmse_test.append(test_error)
    model_name.append(name)

In [49]:
linear = LinearRegression()

In [50]:
modeling(linear,x_train,x_test,y_train,y_test,'LinearRegression')

51.790786748384875
52.75256365655211


In [51]:
lasso = Lasso(alpha=10,normalize = True)

In [52]:
modeling(lasso,x_train,x_test,y_train,y_test,'Lasso')

118.25614269215681
118.93903155719204


In [53]:
ridge = Ridge()
modeling(ridge,x_train,x_test,y_train,y_test,'Ridge')

51.79078829985354
52.75252473503327


In [54]:
elastic = ElasticNet(alpha=0.4,l1_ratio=0.5)
modeling(elastic,x_train,x_test,y_train,y_test,'elastic')

54.720276806309755
55.734975123185706


In [55]:
adaBoost = AdaBoostRegressor()
modeling(adaBoost,x_train,x_test,y_train,y_test,'AdaBoostRegressor')

50.2266016755228
50.81145822471138


In [56]:
random = RandomForestRegressor()
modeling(random,x_train,x_test,y_train,y_test,'RandomForestRegressor')

13.858063352107745
32.54163957290094


In [57]:
gradientBoosting = GradientBoostingRegressor()
modeling(gradientBoosting,x_train,x_test,y_train,y_test,'GradientBoostingRegressor')

33.77146801643276
34.57225873106589


In [58]:
xgBoost=xgb.XGBRegressor(objective ='reg:squarederror')
modeling(xgBoost,x_train,x_test,y_train,y_test,'xgBoost')

33.75592772667364
34.570071776524934


In [59]:
lgb=LGBMRegressor()
modeling(lgb,x_train,x_test,y_train,y_test,'LGBMRegressor')

28.325486349419112
30.14822476506468


In [60]:
##svm=LinearSVR()
##modeling(svm,x_train,x_test,y_train,y_test,'LinearSVR')

In [61]:
def finalResult():
    result = pd.DataFrame([model_name,rmse_train,rmse_test]).T
    result.columns = ["Model_Name", "Train", "Test"]
    result.to_excel('Model.xlsx',index=False)
    return result;

In [62]:
result = finalResult()

In [63]:
result.sort_values(by='Train', ascending=True, na_position='first')

Unnamed: 0,Model_Name,Train,Test
5,RandomForestRegressor,13.858,32.542
8,LGBMRegressor,28.325,30.148
7,xgBoost,33.756,34.57
6,GradientBoostingRegressor,33.771,34.572
4,AdaBoostRegressor,50.227,50.811
0,LinearRegression,51.791,52.753
2,Ridge,51.791,52.753
3,elastic,54.72,55.735
1,Lasso,118.256,118.939


## HyperTunning

In [63]:
#Randomized Search CV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Various learning rate parameters
learning_rate = ['0.05','0.1', '0.2','0.3','0.5','0.6']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# max_depth.append(None)
#Subssample parameter values
subsample=[0.7,0.6,0.8]
# Minimum child weight parameters
min_child_weight=[3,4,5,6,7]

In [64]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'learning_rate': learning_rate,
               'max_depth': max_depth,
               'subsample': subsample,
               'min_child_weight': min_child_weight}

print(random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200], 'learning_rate': ['0.05', '0.1', '0.2', '0.3', '0.5', '0.6'], 'max_depth': [5, 10, 15, 20, 25, 30], 'subsample': [0.7, 0.6, 0.8], 'min_child_weight': [3, 4, 5, 6, 7]}


In [65]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
regressor=xgb.XGBRegressor()

In [66]:
# search across 100 different combinations
xg_random = RandomizedSearchCV(estimator = regressor, param_distributions = random_grid,scoring='neg_mean_squared_error', n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = 1)

In [67]:
xg_random.fit(x_train,y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] subsample=0.7, n_estimators=1100, min_child_weight=6, max_depth=10, learning_rate=0.3 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  subsample=0.7, n_estimators=1100, min_child_weight=6, max_depth=10, learning_rate=0.3, total=16.9min
[CV] subsample=0.7, n_estimators=1100, min_child_weight=6, max_depth=10, learning_rate=0.3 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 16.9min remaining:    0.0s


[CV]  subsample=0.7, n_estimators=1100, min_child_weight=6, max_depth=10, learning_rate=0.3, total=17.6min
[CV] subsample=0.7, n_estimators=1100, min_child_weight=6, max_depth=10, learning_rate=0.3 
[CV]  subsample=0.7, n_estimators=1100, min_child_weight=6, max_depth=10, learning_rate=0.3, total=27.4min
[CV] subsample=0.7, n_estimators=1100, min_child_weight=6, max_depth=10, learning_rate=0.3 
[CV]  subsample=0.7, n_estimators=1100, min_child_weight=6, max_depth=10, learning_rate=0.3, total=38.9min
[CV] subsample=0.7, n_estimators=1100, min_child_weight=6, max_depth=10, learning_rate=0.3 
[CV]  subsample=0.7, n_estimators=1100, min_child_weight=6, max_depth=10, learning_rate=0.3, total=30.7min
[CV] subsample=0.7, n_estimators=600, min_child_weight=6, max_depth=20, learning_rate=0.3 
[CV]  subsample=0.7, n_estimators=600, min_child_weight=6, max_depth=20, learning_rate=0.3, total=37.5min
[CV] subsample=0.7, n_estimators=600, min_child_weight=6, max_depth=20, learning_rate=0.3 
[CV]  su

[CV]  subsample=0.8, n_estimators=100, min_child_weight=3, max_depth=10, learning_rate=0.3, total= 1.8min
[CV] subsample=0.6, n_estimators=500, min_child_weight=6, max_depth=5, learning_rate=0.3 
[CV]  subsample=0.6, n_estimators=500, min_child_weight=6, max_depth=5, learning_rate=0.3, total= 4.7min
[CV] subsample=0.6, n_estimators=500, min_child_weight=6, max_depth=5, learning_rate=0.3 
[CV]  subsample=0.6, n_estimators=500, min_child_weight=6, max_depth=5, learning_rate=0.3, total= 4.4min
[CV] subsample=0.6, n_estimators=500, min_child_weight=6, max_depth=5, learning_rate=0.3 
[CV]  subsample=0.6, n_estimators=500, min_child_weight=6, max_depth=5, learning_rate=0.3, total= 4.4min
[CV] subsample=0.6, n_estimators=500, min_child_weight=6, max_depth=5, learning_rate=0.3 
[CV]  subsample=0.6, n_estimators=500, min_child_weight=6, max_depth=5, learning_rate=0.3, total= 4.4min
[CV] subsample=0.6, n_estimators=500, min_child_weight=6, max_depth=5, learning_rate=0.3 
[CV]  subsample=0.6, n_e

KeyboardInterrupt: 

In [69]:
xg_random.best_estimator_

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_estimator_'