In [122]:
# importing the required packages
import numpy as np
import pandas as pd

In [123]:
# reading a CSV file directly from github  and assigning to a Pandas DataFrame:
credit_df = pd.read_csv("https://github.com/mpourhoma/CS4661/raw/master/Credit.csv")
credit_df[0::10]

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Married,Balance
0,14.891,3606,283,2,34,11,1,333
10,63.095,8117,589,4,30,14,1,1407
20,17.7,2860,235,4,63,16,0,89
30,34.142,5666,413,4,47,5,1,863
40,34.95,3327,253,3,54,14,0,50
50,36.362,5183,376,3,49,15,1,654
60,35.51,5198,364,2,35,20,0,631
70,24.889,3954,318,4,75,12,1,357
80,27.369,3449,288,3,40,9,1,162
90,20.191,5767,431,4,42,16,1,1023


In [124]:
#Creating the feature matrix

#python list of feature names to use
feature_cols=['Income', 'Limit', 'Rating', 'Cards', 'Age', 'Education', 'Married']

#assigning list of selected features above to X
X = credit_df[feature_cols]

X.head()

# Creating the target vector with 'Balance' 
y = credit_df['Balance']
y[0::10]

0       333
10     1407
20       89
30      863
40       50
50      654
60      631
70      357
80      162
90     1023
100     298
110      47
120       0
130     868
140    1425
150     581
160     836
170       0
180     423
190     538
200     829
210      95
220    1246
230     661
240     148
250       0
260     345
270     136
280     541
290     159
300     580
310    1036
320       5
330    1054
340     320
350       0
360     712
370     992
380     661
390    1393
Name: Balance, dtype: int64

In [125]:
#Normalizing/scaling the feature data
from sklearn.preprocessing import scale

X_scale = scale(X)
print(X_scale)


[[-0.86158299 -0.48999879 -0.46553881 ... -1.2576741  -0.78492991
   0.79539491]
 [ 1.72743711  0.82826106  0.82870309 ...  1.5284506   0.49658831
   0.79539491]
 [ 1.68675551  1.01478681  1.02931059 ...  0.88996369 -0.78492991
  -1.25723711]
 ...
 [ 0.35946155 -0.24491264 -0.21963285 ...  0.65778663 -0.46455035
   0.79539491]
 [-0.21280808 -0.95891584 -1.05441888 ... -0.67723146 -0.1441708
   0.79539491]
 [-0.75334493  0.34199278  0.38866085 ...  0.48365384 -2.06644812
  -1.25723711]]


In [126]:
#To split the dataset into testing and training sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split (X_scale, y, test_size=0.24, random_state=9)


In [127]:
#Training a linear regression model on the dataset
from sklearn.linear_model import LinearRegression

#credit_linreg is instantiated as an object of Linear Regression class
credit_linreg = LinearRegression()

#fitting the model to the training data
credit_linreg.fit(X_train, y_train)

#Checking the coefficients
print(credit_linreg.intercept_)
print(credit_linreg.coef_)

#As we can see, the 3 most important features seem to be Limit, Rating and Age
#The least important features are 'Cards' and 'Married'

520.0391034603759
[-264.69553256  360.5601294   252.73642988   24.91707959  -17.23897686
   11.65810466   -9.82859046]


In [128]:
#Predicting 'balance' for the users in the testing set
y_prediction = credit_linreg.predict(X_test)
print(y_prediction)

[ 316.89813114  677.5093907   489.03669272  -49.82989971  706.42913216
  864.96954115  897.65752581  108.20297732 -213.73474279  834.94198677
   11.05109139 -227.34447869  472.01450822    4.54554458 -228.17349827
  956.21537845  896.91570524  821.97824267   18.37144232  857.31517013
 1028.30326279  699.22168497 1176.9426756   666.28578465  643.94064891
  690.63628696  589.35273903  -29.37237394  575.15444917  406.78468032
  845.81453327  828.30862904   82.2376206   953.40592514 -184.74828227
  525.84335889 1045.19205022  535.04104472  107.26753118 -172.8968111
  497.84037664 1155.35533939  429.08674601  403.49872552  143.72087313
  764.78737012  431.24114308 1304.79656588  475.00132057   95.75404622
 -215.69000296    5.62900657  313.88797405  859.9328932   792.27147016
 1029.90210771 1500.40180297 1078.48617624  713.37981473 1072.97948262
  -64.12187718  488.60063387   92.3601712   440.414465    277.13284667
 1075.41508762  258.10181743  773.89657462  278.12447916  409.51029492
  335.5

In [129]:
#To calculate the RMSE:

from sklearn import metrics

#calculating "Mean Square Error" (MSE):
mse = metrics.mean_squared_error(y_test, y_prediction)

#Using the numpy square function to take square root and get the root mean square error (RMSE):
rmse = np.sqrt(mse)

print(rmse)

143.24649739642265


In [130]:
#Using 10-fold cross validation to evaluate the performance of linear regression in predicting the balance
# importing the method:
from sklearn.model_selection import cross_val_score

mse_list = cross_val_score(credit_linreg, X_scale, y, cv=10, scoring='neg_mean_squared_error')

print(mse_list)

[-23646.90415343 -32003.04401232 -35462.64435619 -37327.60719635
 -14341.32205939 -33628.37104224 -31631.99317834 -12491.00334951
 -20749.61212175 -23204.94743459]


In [131]:
#Since the mse is an error, we get negative values. So before we can calculate the rmse, we have to make the values positive
mse_list_pos = -mse_list

#calculating the rmse with sqrt function:
rmse_list = np.sqrt(mse_list_pos)
print(rmse_list)

[153.7754992  178.89394627 188.31527914 193.20353826 119.75525901
 183.38039983 177.85385343 111.76315739 144.04725656 152.331702  ]


In [132]:
#Getting the average rmse for the final result in cross validation:
print(rmse_list.mean())

160.33198910744133
