In [20]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import sklearn.model_selection as ms
from sklearn import linear_model
import math
import sklearn.metrics as sklm
import scipy.stats as ss
import pickle

In [21]:
df = pd.read_csv('Reg_AveMonthSpend.csv')
df.head()

Unnamed: 0,CustomerID,CountryRegionName,Education,Occupation,Gender,MaritalStatus,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,AveMonthSpend,Age,ChildrenOut
0,11000,Australia,Bachelors,Professional,M,M,0,0,2,137947,89,31,2
1,11001,Australia,Bachelors,Professional,M,S,1,3,3,101141,117,32,0
2,11002,Australia,Bachelors,Professional,M,M,1,3,3,91945,123,32,0
3,11003,Australia,Bachelors,Professional,F,S,1,0,0,86688,50,29,0
4,11004,Australia,Bachelors,Professional,F,S,4,5,5,92771,95,29,0


In [22]:
labels = np.array(df['AveMonthSpend'])
def encode_string(cat_features):
    ## First encode the strings to numeric categories
    enc = preprocessing.LabelEncoder()
    enc.fit(cat_features)
    enc_cat_features = enc.transform(cat_features)
    ## Now, apply one hot encoding
    ohe = preprocessing.OneHotEncoder()
    encoded = ohe.fit(enc_cat_features.reshape(-1,1))
    return encoded.transform(enc_cat_features.reshape(-1,1)).toarray()

In [23]:
categorical_columns = ['Education', 'Occupation', 'Gender', 'MaritalStatus']
Features = encode_string(df['CountryRegionName'])
for col in categorical_columns:
    temp = encode_string(df[col])
    Features = np.concatenate([Features, temp], axis = 1)

print(Features.shape)
print(Features[:2, :])

(16404, 20)
[[1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0.]
 [1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1.]]


In [24]:
Features = np.concatenate([Features, np.array(df[['NumberCarsOwned', 
                            'NumberChildrenAtHome', 'YearlyIncome', 'Age', 'ChildrenOut']])], axis = 1)
print(Features.shape)
print(Features[:2, :])

(16404, 25)
[[1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00
  1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 1.37947e+05 3.10000e+01
  2.00000e+00]
 [1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00
  0.00000e+00 1.00000e+00 1.00000e+00 3.00000e+00 1.01141e+05 3.20000e+01
  0.00000e+00]]


In [25]:
indx = range(Features.shape[0])
indx = ms.train_test_split(indx, test_size = int(0.2*df.shape[0]))
print(int(0.2*df.shape[0]))

3280


In [26]:
x_train = Features[indx[0],:]
y_train = np.ravel(labels[indx[0]])
x_test = Features[indx[1],:]
y_test = np.ravel(labels[indx[1]])

scaler = preprocessing.StandardScaler().fit(x_train[:,22:])
x_train[:,22:] = scaler.transform(x_train[:,22:])
x_test[:,22:] = scaler.transform(x_test[:,22:])
x_train[:2,]

array([[ 1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
         2.        ,  5.        ,  0.70792206, -1.02409497, -0.7515763 ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  1.        ,  1.        ,  0.        ,
         4.        ,  3.        ,  1.88957031,  0.31347128, -0.7515763 ]])

In [27]:
lin_mod = linear_model.LinearRegression(fit_intercept = False)
lin_mod.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)

In [28]:
def print_metrics(y_true, y_predicted, n_parameters):
    ## First compute R^2 and the adjusted R^2
    r2 = sklm.r2_score(y_true, y_predicted)
    r2_adj = r2 - (n_parameters - 1)/(y_true.shape[0] - n_parameters) * (1 - r2)
    
    ## Print the usual metrics and the R^2 values
    print('Mean Square Error      = ' + str(sklm.mean_squared_error(y_true, y_predicted)))
    print('Root Mean Square Error = ' + str(math.sqrt(sklm.mean_squared_error(y_true, y_predicted))))
    print('Mean Absolute Error    = ' + str(sklm.mean_absolute_error(y_true, y_predicted)))
    print('Median Absolute Error  = ' + str(sklm.median_absolute_error(y_true, y_predicted)))
    print('R^2                    = ' + str(r2))
    print('Adjusted R^2           = ' + str(r2_adj))
   
y_score = lin_mod.predict(x_test) 
print_metrics(y_test, y_score, 28)

Mean Square Error      = 41.22353063519226
Root Mean Square Error = 6.420555321402679
Mean Absolute Error    = 4.871783484482184
Median Absolute Error  = 3.8228759765625
R^2                    = 0.9463644698312902
Adjusted R^2           = 0.9459191563889301


In [29]:
regmodel = 'regmodel.sav'
pickle.dump(lin_mod, open(regmodel, 'wb'))