## Evaluating Regression - Loss / Cost / Error

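- MAE - Mean Absolute Error
    - average of the absolute errors, $\frac{1}{n}\sum_i |y_i - \hat{y}_i|$
    - weights every error linearly, so it won't highlight large errors (outliers)
- MSE - Mean Squared Error
    - average of the squared errors, $\frac{1}{n}\sum_i (y_i - \hat{y}_i)^2$
    - squaring penalises large errors much more heavily, but the result is in squared units of the target
- RMSE - Root Mean Squared Error
    - square root of the MSE, $\sqrt{\text{MSE}}$; still emphasises large errors, but is expressed in the same units as the target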

---

# Test / Train

In [26]:
import numpy as np
# %pip install scikit-learn

In [20]:
# Toy feature matrix: the integers 0..9 arranged as 5 samples with 2 features each.
x = np.arange(10).reshape(-1, 2)
x

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7],
       [8, 9]])

In [21]:
# Toy target: one integer label (0..4) for each of the 5 rows of x.
y = range(0, 5)
[*y]

[0, 1, 2, 3, 4]

### Importing sklearn 

In [45]:
from sklearn.model_selection import train_test_split # to split data into train and test
from sklearn.linear_model import LinearRegression # for models and linear regression
from sklearn import metrics
import pandas as pd

In [23]:
# Randomly partition the toy data into a training part and a test part.
# No random_state is passed here, so the rows chosen change from run to run.
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [29]:
# Show the training features. train_test_split shuffles before splitting,
# so this prints a randomly ordered subset of the rows of x.
x_train

array([[6, 7],
       [8, 9],
       [0, 1]])

In [30]:
# Training targets, aligned row-for-row with x_train.
y_train

[3, 4, 0]

In [41]:
# Create an unfitted linear regression estimator with default settings.
model = LinearRegression()

In [42]:
# Fit on the training split only; the test split is kept unseen for evaluation.
model.fit(x_train, y_train)

In [48]:
# Predict the target for each held-out test row and display the predictions.
predict = model.predict(x_test)
predict

array([2., 1.])

In [49]:
# True targets of the test rows, to compare by eye against `predict`.
y_test

[2, 1]

In [51]:
# Mean absolute error on the test split. The toy target is an exact linear
# function of the features, so the error is only floating-point noise.
metrics.mean_absolute_error(y_true=y_test, y_pred=predict)

2.220446049250313e-16

# Housing Data 

In [54]:
# Load the USA housing dataset (path is relative to the notebook) and
# summarise its numeric columns.
data = pd.read_csv(filepath_or_buffer='./datasets/USA_Housing.csv')
data.describe()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,68583.108984,5.977222,6.987792,3.98133,36163.516039,1232073.0
std,10657.991214,0.991456,1.005833,1.234137,9925.650114,353117.6
min,17796.63119,2.644304,3.236194,2.0,172.610686,15938.66
25%,61480.562388,5.322283,6.29925,3.14,29403.928702,997577.1
50%,68804.286404,5.970429,7.002902,4.05,36199.406689,1232669.0
75%,75783.338666,6.650808,7.665871,4.49,42861.290769,1471210.0
max,107701.748378,9.519088,10.759588,6.5,69621.713378,2469066.0


In [56]:
# Preview the target column rounded to whole units (display only; `data` is not modified).
data['Price'].round(decimals=0)

0       1059034.0
1       1505891.0
2       1058988.0
3       1260617.0
4        630943.0
          ...    
4995    1060194.0
4996    1482618.0
4997    1030730.0
4998    1198657.0
4999    1298950.0
Name: Price, Length: 5000, dtype: float64

In [57]:
# Check column dtypes and non-null counts before choosing features.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Avg. Area Income              5000 non-null   float64
 1   Avg. Area House Age           5000 non-null   float64
 2   Avg. Area Number of Rooms     5000 non-null   float64
 3   Avg. Area Number of Bedrooms  5000 non-null   float64
 4   Area Population               5000 non-null   float64
 5   Price                         5000 non-null   float64
 6   Address                       5000 non-null   object 
dtypes: float64(6), object(1)
memory usage: 273.6+ KB


### Dividing the Dataset

In [61]:
# Features: the five numeric predictor columns (the text `Address` column is left out).
x = data[[
    'Avg. Area Income',
    'Avg. Area House Age',
    'Avg. Area Number of Rooms',
    'Avg. Area Number of Bedrooms',
    'Area Population',
]]
# Target: kept as a one-column DataFrame (double brackets), not a Series.
y = data[['Price']]

### Splitting the data

In [63]:
# Hold out part of the housing data for testing. random_state pins the shuffle
# so that Restart & Run All reproduces the same split and therefore the same
# metrics; outputs recorded from an earlier unseeded run will differ until the
# notebook is re-executed.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [73]:
# Inspect the training features; the shuffled index shows which original rows were selected.
x_train

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population
3651,64617.438159,7.618949,6.948013,4.44,27995.215129
1494,57925.044700,3.214868,6.988818,2.33,43867.844545
874,45493.208426,7.381581,7.266475,3.20,38938.271201
4786,67043.212293,4.876857,7.555253,3.19,62402.742391
3408,65417.891826,6.222731,7.701180,5.25,39446.674858
...,...,...,...,...,...
2589,76808.344277,7.422467,6.739909,4.21,31129.642484
4856,60850.702061,3.690574,7.415056,4.41,56303.305988
4323,69179.700409,5.424774,6.083480,3.25,36323.694451
3311,75766.124332,6.405793,8.574791,5.23,30776.526728


### Training

In [66]:
# Fresh estimator for the housing data (rebinds `model`, discarding the toy fit).
model = LinearRegression()

In [67]:
# Learn one coefficient per feature plus an intercept from the training split.
model.fit(x_train, y_train)

### Predicting

In [82]:
# Predicted prices for the held-out rows; rounding is for display only,
# `predict` itself keeps full precision for the metrics.
predict = model.predict(x_test)
np.round(predict, 2)

array([[ 945716.69],
       [1518356.3 ],
       [1372718.35],
       ...,
       [1232417.92],
       [1476535.91],
       [1472941.59]])

###  MAE

In [83]:
# Mean absolute error of the price predictions on the test split (same units as Price).
metrics.mean_absolute_error(y_true=y_test, y_pred=predict)

83987.29494980416

###  MSE

In [84]:
# Mean squared error on the test split; kept in `mse` so the RMSE can be derived from it.
mse = metrics.mean_squared_error(y_true=y_test, y_pred=predict)
mse

10875692549.905588

###  RMSE

In [85]:
# RMSE is the square root of the MSE, which brings the error back to the units of Price.
rmse = np.sqrt(mse)
rmse

104286.5885428495

### Model Evaluation - Fitted Intercept and Coefficients

In [86]:
# Fitted intercept: the model's prediction when every feature is zero.
model.intercept_

array([-2628924.45297443])

In [87]:
# Fitted coefficients, one per feature in the column order of x: the change in
# predicted Price for a one-unit increase in that feature, others held fixed.
model.coef_

array([[2.15298652e+01, 1.65730808e+05, 1.20157414e+05, 4.13402112e+02,
        1.52486639e+01]])

---

# Ecommerce Data - Task

In [89]:
# Load the e-commerce customers dataset (the file has no extension) and
# summarise its numeric columns. This rebinds `data`, replacing the housing frame.
data = pd.read_csv(filepath_or_buffer='./datasets/Ecommerce Customers')
data.describe()

Unnamed: 0,Avg. Session Length,Time on App,Time on Website,Length of Membership,Yearly Amount Spent
count,500.0,500.0,500.0,500.0,500.0
mean,33.053194,12.052488,37.060445,3.533462,499.314038
std,0.992563,0.994216,1.010489,0.999278,79.314782
min,29.532429,8.508152,33.913847,0.269901,256.670582
25%,32.341822,11.388153,36.349257,2.93045,445.038277
50%,33.082008,11.983231,37.069367,3.533975,498.887875
75%,33.711985,12.75385,37.716432,4.126502,549.313828
max,36.139662,15.126994,40.005182,6.922689,765.518462


In [93]:
# List all column names to pick the numeric features and the target.
data.columns

Index(['Email', 'Address', 'Avatar', 'Avg. Session Length', 'Time on App',
       'Time on Website', 'Length of Membership', 'Yearly Amount Spent'],
      dtype='object')

### Dividing the Dataset

In [94]:
# Features: the four numeric behaviour columns (Email, Address and Avatar are left out).
x = data[[
    'Avg. Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]]
# Target: yearly spend, kept as a one-column DataFrame (double brackets).
y = data[['Yearly Amount Spent']]

### Splitting the data

In [95]:
# Hold out part of the e-commerce data for testing. random_state pins the
# shuffle so that Restart & Run All reproduces the same split and therefore
# the same metrics; outputs recorded from an earlier unseeded run will differ
# until the notebook is re-executed.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [96]:
# Inspect the training features selected by the split.
x_train

Unnamed: 0,Avg. Session Length,Time on App,Time on Website,Length of Membership
230,32.351478,13.105159,35.574842,3.641497
436,35.433165,11.912210,36.089644,4.000964
43,32.893981,11.529878,36.888086,4.643259
133,32.455176,12.759169,36.599112,4.131277
143,32.115119,11.919242,39.294043,1.443515
...,...,...,...,...
179,33.075703,12.319845,37.819155,3.442799
32,32.175501,13.387492,35.694175,4.343063
401,33.247322,11.956426,36.517346,3.451751
97,34.183821,13.349913,37.827394,4.252006


### Training

In [97]:
# Fresh estimator for the e-commerce data (rebinds `model`, discarding the housing fit).
model = LinearRegression()

In [98]:
# Fit yearly spend against the four behaviour features using the training split.
model.fit(x_train, y_train)

### Predicting

In [99]:
# Predicted yearly spend for the held-out customers; rounding is for display
# only, `predict` itself keeps full precision for the metrics.
predict = model.predict(x_test)
np.round(predict, 2)

array([[461.15],
       [614.96],
       [524.77],
       [412.5 ],
       [702.53],
       [615.59],
       [542.82],
       [416.55],
       [449.75],
       [518.02],
       [537.63],
       [497.05],
       [305.66],
       [478.18],
       [448.71],
       [444.56],
       [437.54],
       [432.16],
       [501.59],
       [576.81],
       [513.03],
       [389.35],
       [574.91],
       [280.  ],
       [501.74],
       [313.82],
       [480.27],
       [585.11],
       [412.92],
       [566.61],
       [515.21],
       [542.75],
       [471.71],
       [467.9 ],
       [563.8 ],
       [420.96],
       [520.23],
       [557.32],
       [587.91],
       [481.25],
       [468.44],
       [608.96],
       [398.91],
       [506.7 ],
       [561.83],
       [525.41],
       [445.26],
       [484.37],
       [556.5 ],
       [420.85],
       [499.07],
       [441.85],
       [396.72],
       [512.08],
       [393.98],
       [593.75],
       [482.27],
       [395.82],
       [516.03

In [100]:
# MAE: average absolute gap between predicted and actual yearly spend on the test split.
metrics.mean_absolute_error(y_true=y_test, y_pred=predict)

8.218442024004839

In [101]:
# MSE on the test split; stored in `mse` so the RMSE can be derived from it.
mse = metrics.mean_squared_error(y_true=y_test, y_pred=predict)
mse

109.82550057376571

In [102]:
# RMSE: square root of the MSE, expressed in the same units as the yearly amount spent.
rmse = np.sqrt(mse)
rmse

10.479766246141452

### Model Evaluation - Fitted Intercept and Coefficients

In [103]:
# Fitted intercept: the model's prediction when every feature is zero.
model.intercept_

array([-1042.80988957])

In [106]:
# Fitted coefficients in the column order of x (session length, time on app,
# time on website, length of membership).
model.coef_

array([[25.77917047, 38.47629813,  0.23196372, 61.51211187]])

---