In [56]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# pd.set_option()


# Let pandas display up to 50 columns before it elides the middle of a frame.
pd.set_option('display.max_columns',50)

# Load the training data; the path is relative to the notebook's working directory.
train = pd.read_csv('./Data/Yes_ml3_Bank_Train.csv')

In [57]:
train.shape

(800, 21)

In [58]:
train.isnull().sum()

serial number       0
account_info        0
duration_month      0
credit_history      0
purpose             0
credit_amount       0
savings_account     0
employment_st       0
poi                 0
personal_status     0
gurantors           0
resident_since      0
property_type       0
age                 0
installment_type    0
housing_type        0
credits_no          0
job_type            0
liables             0
telephone           0
foreigner           0
dtype: int64

In [59]:
train.dtypes.sort_values()

serial number        int64
liables              int64
duration_month       int64
credits_no           int64
credit_amount        int64
poi                  int64
age                  int64
resident_since       int64
job_type            object
housing_type        object
installment_type    object
property_type       object
gurantors           object
personal_status     object
employment_st       object
savings_account     object
purpose             object
credit_history      object
account_info        object
telephone           object
foreigner           object
dtype: object

In [60]:
# For every object-typed column, show a tuple of
# (number of distinct values, sorted list of those distinct values).
train.select_dtypes(include=['object']).apply(lambda x: (x.nunique(),sorted(x.unique())))

account_info                                (4, [A11, A12, A13, A14])
credit_history                         (5, [A30, A31, A32, A33, A34])
purpose             (10, [A40, A41, A410, A42, A43, A44, A45, A46,...
savings_account                        (5, [A61, A62, A63, A64, A65])
employment_st                          (5, [A71, A72, A73, A74, A75])
personal_status                             (4, [A91, A92, A93, A94])
gurantors                                     (3, [A101, A102, A103])
property_type                           (4, [A121, A122, A123, A124])
installment_type                              (3, [A141, A142, A143])
housing_type                                  (3, [A151, A152, A153])
job_type                                (4, [A171, A172, A173, A174])
telephone                                           (2, [A191, A192])
foreigner                                           (2, [A201, A202])
dtype: object

In [61]:
# Row counts per category for each object-typed column, most frequent first.
# Counting the non-null 'serial number' values within each group gives the group size.
categorical_cols = train.select_dtypes(include=['object']).columns
for name in categorical_cols:
    counts = train.groupby(name)['serial number'].count()
    print(counts.sort_values(ascending=False))
    print('-' * 40)


account_info
A14    310
A12    226
A11    209
A13     55
Name: serial number, dtype: int64
----------------------------------------
credit_history
A32    424
A34    235
A33     71
A31     37
A30     33
Name: serial number, dtype: int64
----------------------------------------
purpose
A43     223
A40     184
A42     144
A41      81
A49      77
A46      45
A45      19
A410     10
A44       9
A48       8
Name: serial number, dtype: int64
----------------------------------------
savings_account
A61    476
A65    142
A62     89
A63     51
A64     42
Name: serial number, dtype: int64
----------------------------------------
employment_st
A73    275
A75    203
A74    141
A72    133
A71     48
Name: serial number, dtype: int64
----------------------------------------
personal_status
A93    437
A92    255
A94     70
A91     38
Name: serial number, dtype: int64
----------------------------------------
gurantors
A101    728
A103     42
A102     30
Name: serial number, dtype: int64
---------------

### Treating categorical variables

In [62]:
# Recode the two `foreigner` category codes as integers (A201 -> 1, A202 -> 0)
# and store the column as int16.
foreigner_codes = {'A201': 1, 'A202': 0}
train['foreigner'] = train['foreigner'].replace(foreigner_codes).astype(np.int16)
train['foreigner'].dtype

dtype('int16')

In [63]:
# Recode the two `telephone` category codes as integers (A191 -> 1, A192 -> 0)
# and store the column as int16.
telephone_codes = {'A191': 1, 'A192': 0}
train['telephone'] = train['telephone'].replace(telephone_codes).astype(np.int16)
train['telephone'].dtype

dtype('int16')

In [64]:
# One-hot encode the categorical columns that are still object-typed.
# Each indicator column is named after the category code itself (no prefix) and is
# appended to `train`; the original object columns are kept alongside them.
remaining_cat_variables = train.select_dtypes(include=['object'])
indicator_frames = [
    pd.get_dummies(train[name]) for name in remaining_cat_variables.columns
]
train = pd.concat([train] + indicator_frames, axis=1)

In [65]:
train.columns

Index(['serial number', 'account_info', 'duration_month', 'credit_history',
       'purpose', 'credit_amount', 'savings_account', 'employment_st', 'poi',
       'personal_status', 'gurantors', 'resident_since', 'property_type',
       'age', 'installment_type', 'housing_type', 'credits_no', 'job_type',
       'liables', 'telephone', 'foreigner', 'A11', 'A12', 'A13', 'A14', 'A30',
       'A31', 'A32', 'A33', 'A34', 'A40', 'A41', 'A410', 'A42', 'A43', 'A44',
       'A45', 'A46', 'A48', 'A49', 'A61', 'A62', 'A63', 'A64', 'A65', 'A71',
       'A72', 'A73', 'A74', 'A75', 'A91', 'A92', 'A93', 'A94', 'A101', 'A102',
       'A103', 'A121', 'A122', 'A123', 'A124', 'A141', 'A142', 'A143', 'A151',
       'A152', 'A153', 'A171', 'A172', 'A173', 'A174'],
      dtype='object')

In [98]:
# Use only the non-object columns as model inputs: the columns that were numeric
# to begin with plus the indicator columns added above. The raw category-code
# columns (object dtype) are excluded here.
X = train.select_dtypes(exclude=['object'])
X.head()

Unnamed: 0,serial number,duration_month,credit_amount,poi,resident_since,age,credits_no,liables,telephone,foreigner,A11,A12,A13,A14,A30,A31,A32,A33,A34,A40,A41,A410,A42,A43,A44,...,A72,A73,A74,A75,A91,A92,A93,A94,A101,A102,A103,A121,A122,A123,A124,A141,A142,A143,A151,A152,A153,A171,A172,A173,A174
0,1,6,1169,4,4,67,2,1,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,...,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0
1,2,48,5951,2,2,22,1,1,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0
2,3,12,2096,2,3,49,1,2,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0
3,4,42,7882,2,4,45,1,2,1,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,1,0
4,5,24,4870,3,4,53,2,2,1,1,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0


In [99]:
# Target: the credit amount. It is read from `train`, whose 'credit_amount' column is
# never removed, so this cell still works when re-run after the column has left X.
Y = train['credit_amount']

# Remove the target from the feature frame. Rebinding (instead of inplace=True)
# together with errors='ignore' makes the cell safe to execute more than once.
X = X.drop(columns=['credit_amount'], errors='ignore')

- Our input dataset mixes many binary (0/1) variables with variables that take continuous values.
- We need to apply some normalization to bring these onto the same scale.

In [100]:
X.shape

(800, 59)

In [163]:
# X.set_index('serial number').head()

In [110]:
# Summary statistics for the numeric, non-indicator feature columns.
# NOTE(review): the saved output shows `liables` in the range 0-1, i.e. this cell was
# last executed after the `liables - 1` cell that appears further down (In [109]).
X[[ 'duration_month', 'poi', 'resident_since', 'age', 'credits_no',  'liables', ]].describe()

Unnamed: 0,duration_month,poi,resident_since,age,credits_no,liables
count,800.0,800.0,800.0,800.0,800.0,800.0
mean,20.65125,2.96625,2.84125,35.40625,1.39625,0.1475
std,12.15635,1.128806,1.106833,11.470317,0.569773,0.354825
min,4.0,1.0,1.0,19.0,1.0,0.0
25%,12.0,2.0,2.0,27.0,1.0,0.0
50%,18.0,3.0,3.0,33.0,1.0,0.0
75%,24.0,4.0,4.0,41.0,2.0,0.0
max,72.0,4.0,4.0,75.0,4.0,1.0


In [109]:
# Shift `liables` down by one so it becomes a 0/1 flag (the raw values are 1 and 2).
# The value is computed from the untouched `train` column rather than from X itself,
# so executing this cell again does not subtract 1 a second time.
X['liables'] = train['liables'] - 1

In [81]:
X['duration_month'].unique()

array([ 6, 48, 12, 42, 24, 36, 30, 15,  9, 10,  7, 60, 18, 45, 11, 27,  8,
       54, 20, 14, 33, 21, 16,  4, 47, 13, 22, 39, 28,  5, 26, 72, 40],
      dtype=int64)

In [103]:
X.groupby('liables').count()['serial number']

liables
1    682
2    118
Name: serial number, dtype: int64

In [96]:
# m = X.liables >=2
# # X.liables.where?
# X.liables.where(m, 1, inplace=True)


In [111]:
# Dropping age and duration_month, as these are on a very different scale from the other features; normalization would probably make them usable.

In [113]:
# Remove the two wide-range columns from the features.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# re-runnable: a second execution is a no-op rather than a KeyError.
X = X.drop(columns=['duration_month', 'age'], errors='ignore')

In [114]:
X.head()

Unnamed: 0,serial number,poi,resident_since,credits_no,liables,telephone,foreigner,A11,A12,A13,A14,A30,A31,A32,A33,A34,A40,A41,A410,A42,A43,A44,A45,A46,A48,...,A72,A73,A74,A75,A91,A92,A93,A94,A101,A102,A103,A121,A122,A123,A124,A141,A142,A143,A151,A152,A153,A171,A172,A173,A174
0,1,4,4,2,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0
1,2,2,2,1,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0
2,3,2,3,1,1,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0
3,4,2,4,1,1,1,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,1,0
4,5,3,4,2,1,1,1,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0


In [115]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [143]:
# Hold out 10% of the rows for validation.
# - 'serial number' is moved into the index so it is not scaled or fed to the models
#   as a feature; the scoring data further down is prepared the same way
#   (it also uses 'serial number' as its index).
# - random_state pins the shuffle, so the split and every metric computed from it are
#   reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X.set_index('serial number'),
    Y,
    train_size=0.9,
    test_size=0.1,
    random_state=42,
)

In [165]:
# X_train.set_index('serial number', inplace=True)
# X_test.set_index('serial number', inplace=True)

In [145]:
X_train.head()

Unnamed: 0_level_0,poi,resident_since,credits_no,liables,telephone,foreigner,A11,A12,A13,A14,A30,A31,A32,A33,A34,A40,A41,A410,A42,A43,A44,A45,A46,A48,A49,...,A72,A73,A74,A75,A91,A92,A93,A94,A101,A102,A103,A121,A122,A123,A124,A141,A142,A143,A151,A152,A153,A171,A172,A173,A174
serial number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
351,1,4,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,0
233,3,1,1,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0
105,2,4,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,1,1,0,0,0,0,1,0
382,3,4,1,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1
46,4,4,2,0,1,1,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1


In [146]:
# Scaler that standardizes each feature column; it is fitted in the next cell.
std= StandardScaler()

In [147]:
# Fit the scaler on the training split only; the same fitted scaler is later applied
# to the hold-out split and to the scoring data.
std.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [148]:
# Replace X_train with its standardized version. The result is a NumPy array, so
# DataFrame methods (e.g. .describe()) are no longer available on X_train.
# NOTE: re-running this cell without re-running the split would scale the data twice.
X_train = std.transform(X_train)

In [149]:
# X_train.describe()

AttributeError: 'numpy.ndarray' object has no attribute 'describe'

In [150]:
# Fit PCA on the standardized training data, keeping enough components to explain
# 95% of the variance, then project the training data onto those components.
pca = PCA(n_components=0.95).fit(X_train)
X_train = pca.transform(X_train)


In [173]:
pd.DataFrame(X_test).shape

(80, 40)

In [154]:
# Push the hold-out split through the scaler and PCA that were fitted on the training split.
X_test = pca.transform(std.transform(X_test))


#### Testing different models

In [156]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [157]:
from sklearn import linear_model, svm, tree, ensemble

# Candidate regressors to compare. Estimators that use randomness internally get a
# fixed random_state so their scores are reproducible from one run to the next;
# the remaining estimators are left at their defaults.
models = [
    linear_model.LinearRegression(),
    linear_model.Ridge(),
    linear_model.Lasso(),
    linear_model.ElasticNet(),
    linear_model.BayesianRidge(),
#     linear_model.RANSACRegressor(),
    svm.LinearSVR(random_state=42),
    svm.SVR(),
    tree.DecisionTreeRegressor(random_state=42),
    tree.ExtraTreeRegressor(random_state=42),
    ensemble.RandomForestRegressor(random_state=42),
    ensemble.GradientBoostingRegressor(random_state=42),
]

In [159]:
def _rmse(y_true, y_pred):
    """Return the root of the mean squared error between targets and predictions."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Fit every candidate on the training split and report its RMSE on both the
# training split and the hold-out split.
for model in models:
    model.fit(X_train, Y_train)
    print('-' * 30)
    print(type(model).__name__)

    train_rmse = _rmse(Y_train, model.predict(X_train))
    print("Root mean square error train {}".format(train_rmse))

    rmse = _rmse(Y_test, model.predict(X_test))
    print("Root mean square error test {}".format(rmse))

------------------------------
LinearRegression
Root mean square error train 2171.001205726628
Root mean square error test 2332.7077108608205
------------------------------
Ridge
Root mean square error train 2171.00172808751
Root mean square error test 2332.5731710027253
------------------------------
Lasso
Root mean square error train 2171.0092942750207
Root mean square error test 2331.775971197426
------------------------------
ElasticNet
Root mean square error train 2204.635506688674
Root mean square error test 2325.1429798407103
------------------------------
BayesianRidge
Root mean square error train 2178.3389928019474
Root mean square error test 2322.567550864441
------------------------------
LinearSVR
Root mean square error train 3686.8898389117176
Root mean square error test 3581.9746767360684
------------------------------
SVR
Root mean square error train 2880.260729234919
Root mean square error test 2797.092687627785
------------------------------
DecisionTreeRegressor
Root 

In [209]:
# Load the data to be scored; the path is relative to the notebook's working directory.
test = pd.read_csv('./Data/Yes_ml3_Bank_Test.csv')
# Keep the identifiers in their own one-column frame: `test` is re-indexed and later
# replaced by a transformed array, and the identifiers are needed for the output file.
serial_number = test[['serial number']]

In [211]:
serial_number.columns

Index(['serial number'], dtype='object')

In [212]:
test.isnull().sum()

serial number       0
account_info        0
duration_month      0
credit_history      0
purpose             0
savings_account     0
employment_st       0
poi                 0
personal_status     0
gurantors           0
resident_since      0
property_type       0
age                 0
installment_type    0
housing_type        0
credits_no          0
job_type            0
liables             0
telephone           0
foreigner           0
dtype: int64

In [213]:
# Recode the two `foreigner` category codes as integers (A201 -> 1, A202 -> 0)
# and store the column as int16, as was done for the training data.
foreigner_codes = {'A201': 1, 'A202': 0}
test['foreigner'] = test['foreigner'].replace(foreigner_codes).astype(np.int16)
test['foreigner'].dtype

dtype('int16')

In [214]:
# Recode the two `telephone` category codes as integers (A191 -> 1, A192 -> 0)
# and store the column as int16, as was done for the training data.
telephone_codes = {'A191': 1, 'A192': 0}
test['telephone'] = test['telephone'].replace(telephone_codes).astype(np.int16)
test['telephone'].dtype

dtype('int16')

In [215]:
# Shift `liables` down by one, as was done for the training features.
test['liables'] = test['liables'] - 1

In [216]:
# Use the identifier as the row index so it is not treated as a feature column.
test = test.set_index('serial number')

In [217]:
# One-hot encode the object-typed columns of the scoring data. Indicator columns are
# named after the category codes themselves and appended to `test`.
remaining_cat_variables = test.select_dtypes(include=['object'])
indicator_frames = [
    pd.get_dummies(test[name]) for name in remaining_cat_variables.columns
]
test = pd.concat([test] + indicator_frames, axis=1)

In [218]:
# Keep only the non-object columns, discarding the raw category-code columns now that
# their indicator columns have been added.
test = test.select_dtypes(exclude=['object'])

In [219]:
# Remove the two columns that were also removed from the training features.
test = test.drop(columns=['duration_month', 'age'])

# Line the scoring frame up with the training feature columns, in the same order.
# get_dummies only creates a column for category codes that actually occur, so a code
# that is absent from the test file would otherwise leave the frame one column short
# (or shifted) relative to what the scaler and PCA were fitted on. Missing indicator
# columns are added as all zeros; columns unknown to training are discarded.
# 'serial number' is skipped because it is this frame's index, not a feature.
feature_cols = [c for c in X.columns if c != 'serial number']
test = test.reindex(columns=feature_cols, fill_value=0)

In [220]:
# Apply the scaler and PCA fitted on the training split; `test` becomes a NumPy array.
test = pca.transform(std.transform(test))

In [221]:
pd.DataFrame(test).index

RangeIndex(start=0, stop=200, step=1)

In [222]:
# serial_numer = pd.DataFrame(pd.DataFrame(test).index)

In [223]:
# serial_numer.shape

In [224]:
# Score the prepared test matrix with the first entry of `models`.
final_model = models[0]
test_pred = final_model.predict(test)

# Pair each prediction with its identifier by position: both hold one entry per
# test row, in file order.
result = pd.DataFrame(
    {
        'serial number': serial_number['serial number'].values,
        'credit_amount': test_pred,
    },
    columns=['serial number', 'credit_amount'],
)

# to_csv does not create missing folders, so make sure the output directory exists.
out_path = Path('./output/resultsML3_yes_{}.csv'.format(0))
out_path.parent.mkdir(parents=True, exist_ok=True)
result.to_csv(out_path, index=False)