<a href="https://colab.research.google.com/github/isam007/AssetManagementSystem_MVC/blob/master/PM5_Team6_W21_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set-Up and Data Preparation

In [None]:
import numpy as np # for numeric computation
import pandas as pd # for handling data in table format
import requests # for retrieving web addresses
import io # for storing data
import plotly.express as px # for visualization
import plotly.graph_objects as go #for time series
import matplotlib.pyplot as plt


print("Loading Data ...")
# A more robust data storage solution
p1g_data = 'https://codytischler.com/exportdata/Plant_1_Generation_Data.csv'
p1w_data = 'https://codytischler.com/exportdata/Plant_1_Weather_Sensor_Data.csv'
p2g_data = 'https://codytischler.com/exportdata/Plant_2_Generation_Data.csv'
p2w_data = 'https://codytischler.com/exportdata/Plant_2_Weather_Sensor_Data.csv'

requests.get(p1g_data)
df_p1g = pd.read_csv(p1g_data)
df_p1g.name = 'Plant 1 Power Generation Dataframe'
print(df_p1g.name+' loaded successfully')

requests.get(p1w_data)
df_p1w = pd.read_csv(p1w_data)
df_p1w.name = 'Plant 1 Weather Dataframe'
print(df_p1w.name+' loaded successfully')

requests.get(p2g_data)
df_p2g = pd.read_csv(p2g_data)
df_p2g.name = 'Plant 2 Power Generation Dataframe'
print(df_p2g.name+' loaded successfully')

requests.get(p2w_data)
df_p2w = pd.read_csv(p2w_data)
df_p2w.name = 'Plant 2 Weather Dataframe'
print(df_p2w.name+' loaded successfully')

# create a list of the data frames for easy looping
data_frames = [df_p1g, df_p2g, df_p1w, df_p2w]
print("All data has been loaded")

# correct the data types
print("Changing feature data types...")
categorical_types = ['PLANT_ID', 'SOURCE_KEY']
for df in data_frames:
  for c in categorical_types:
    df[c] = df[c].astype(pd.CategoricalDtype(categories=df[c].unique()))
c = 'DATE_TIME'
for df in data_frames:
  df[c] = pd.to_datetime(df[c], infer_datetime_format=True)
print("Feature Data Types are changed")
for df in data_frames:  
  df['MONTH_DAY'] = df['DATE_TIME'].dt.strftime('%m-%d')
  df = df[['MONTH_DAY'] + df.columns[:-1].tolist()].copy()

# Clean Plant 1 Data
print("Cleaning Plant 1 Data ...")
df_p1g_clean = df_p1g.copy()
df_p1g_clean = df_p1g_clean.set_index('DATE_TIME')
df_p1g_clean = df_p1g_clean.between_time('06:00','19:00') 
df_p1g_clean.drop(columns = ['PLANT_ID', 'AC_POWER', 'TOTAL_YIELD', 'DAILY_YIELD'], inplace=True )
df_p1g_clean = df_p1g_clean.loc[(df_p1g_clean.index.day != 26) & 
                              (df_p1g_clean.index.day != 2) & 
                              (df_p1g_clean.index.day != 3) &
                              (df_p1g_clean.index.day != 7)]
df_p1g_clean.drop(pd.Timestamp('2020-05-19 11:30:00')) # This line is fine for dropping specific timestamps
print("Plant 1 Data Cleaned")

# Clean Plant 2 Data
print("Cleaning Plant 2 Data ... ")
df_p2g_clean = df_p2g.copy()
df_p2g_clean = df_p2g_clean.set_index('DATE_TIME')
df_p2g_clean = df_p2g_clean.between_time('06:00','19:00') 
df_p2g_clean.drop(columns = ['PLANT_ID', 'AC_POWER', 'TOTAL_YIELD', 'DAILY_YIELD'], inplace=True )
## Remove 8 of 22 inverters
df_p2g_clean = df_p2g_clean[(df_p2g_clean.SOURCE_KEY != '4UPUqMRk7TRMgml') & 
             (df_p2g_clean.SOURCE_KEY != 'IQ2d7wF4YD8zU1Q') &
             (df_p2g_clean.SOURCE_KEY != 'xMbIugepa2P7lBB') &
             (df_p2g_clean.SOURCE_KEY != 'mqwcsP2rE7J0TFp') & 
             (df_p2g_clean.SOURCE_KEY != 'NgDl19wMapZy17u') &
             (df_p2g_clean.SOURCE_KEY != 'xoJJ8DcxJEcupym') &
             (df_p2g_clean.SOURCE_KEY != 'PeE6FRyGXUgsRhN') &
             (df_p2g_clean.SOURCE_KEY != 'Quc1TzYxW2pYoWX')]
print("Plant 1 Data Cleaned")


# Join Plant 1 Weather and Power Data
print("Joining Plant 1 Data ...")
## concatenate the data sets
df_p1w = df_p1w.set_index('DATE_TIME')
df_p1w = df_p1w.drop(columns = ['PLANT_ID','SOURCE_KEY'])
df_p1wg = pd.concat([df_p1w, df_p1g_clean], axis=0)
## reformat datatime
format = '%Y-%m-%d %H:%M:%S'
df_p1wg.index = pd.to_datetime(df_p1wg.index, format=format)
df_p1wg = df_p1wg.set_index(pd.DatetimeIndex(df_p1wg.index))
df_p1wg = df_p1wg.sort_index()
## interpolate weather data into power data
df_p1wg["AMBIENT_TEMPERATURE"].interpolate(method='index', inplace=True)
df_p1wg["MODULE_TEMPERATURE"].interpolate(method='index', inplace=True)
df_p1wg["IRRADIATION"].interpolate(method='index', inplace=True)
## drop Nans
df_p1wg = df_p1wg.dropna()
print("Plant 1 Data Joined")

# Join Plant 2 Weather and Power Data
print("Joining Plant 2 Data ...")
## concatenate the data sets
df_p2w = df_p2w.set_index('DATE_TIME')
df_p2w = df_p2w.drop(columns = ['PLANT_ID','SOURCE_KEY'])
df_p2wg = pd.concat([df_p2w, df_p2g_clean], axis=0)
## reformat datatime
format = '%Y-%m-%d %H:%M:%S'
df_p2wg.index = pd.to_datetime(df_p2wg.index, format=format)
df_p2wg = df_p2wg.set_index(pd.DatetimeIndex(df_p2wg.index))
df_p2wg = df_p2wg.sort_index()
## interpolate weather data into power data
df_p2wg["AMBIENT_TEMPERATURE"].interpolate(method='index', inplace=True)
df_p2wg["MODULE_TEMPERATURE"].interpolate(method='index', inplace=True)
df_p2wg["IRRADIATION"].interpolate(method='index', inplace=True)
## drop Nans
df_p2wg = df_p2wg.dropna()
print("Plant 2 Data Joined")

Loading Data ...
Plant 1 Power Generation Dataframe loaded successfully
Plant 1 Weather Dataframe loaded successfully
Plant 2 Power Generation Dataframe loaded successfully
Plant 2 Weather Dataframe loaded successfully
All data has been loaded
Changing feature data types...
Feature Data Types are changed
Cleaning Plant 1 Data ...
Plant 1 Data Cleaned
Cleaning Plant 2 Data ... 
Plant 1 Data Cleaned
Joining Plant 1 Data ...
Plant 1 Data Joined
Joining Plant 2 Data ...
Plant 2 Data Joined


# Feature Engineering

In [None]:
plant = "1"
print ("Calculating Engineered Features for Plant "+plant+" ...")
df_copy = df_p1wg.copy()
df_copy.reset_index(inplace=True)
#make a new column to hold the sum of the DC_POWER
df_copy['TOTAL_DC_POWER'] = 0.0
df_copy['OPERATING_INVERTERS'] = 0.0

# get number of rows
n_rows = df_copy.shape[0]

#create a new dataframe to save redundant calculations
df_check = pd.DataFrame(columns= ["DATE_TIME", "TOTAL_DC_POWER", "OPERATING_INVERTERS"])
last_print = -1
# loop through all rows.
for ii in range(0,n_rows):
  percent_complete = round(100*ii/n_rows)
  if (percent_complete % 10 == 0):
    if percent_complete != last_print:
      print(str(percent_complete)+"% Complete")
      last_print = percent_complete
  ii_date_time = df_copy["DATE_TIME"].loc[ii] #get the date time for the current row ii
  check_mask = (df_check["DATE_TIME"] == ii_date_time) # make a mask to see if we have already encountered this datetime before
  already_calculated = np.dot(check_mask,check_mask) #use this mask to return a value of 1 or zero that we can put in an if statement
  if (already_calculated > 0.5): # use 0.5 not 1 or zero because floating points are evil i.e 0.9999999 != 1
    # well then we have already calulated this date_time value so we just get it drom df_check and log it in
    df_copy.loc[ii,"TOTAL_DC_POWER"] = np.dot(check_mask,df_check["TOTAL_DC_POWER"])
    df_copy.loc[ii,"OPERATING_INVERTERS"] = np.dot(check_mask,df_check["OPERATING_INVERTERS"])
  else:
    # we have not calculated this before
    # get a boolean mask of all the rows which have the same date time
    mask = (df_copy["DATE_TIME"] == ii_date_time).astype('float') # This gives us all the inverters operating at this datetime which we need to sum to get the total power
    total_power = np.dot(mask,df_copy['DC_POWER']) # take the dot product of the mask with all the rows for DC_POWER which will give us the sum 
    df_copy.loc[ii,"TOTAL_DC_POWER"] = total_power # assign it to the row ii
    total_inverters = np.dot(mask,mask) # take a dot product of the mask with itselft to count all the operating inverters
    df_copy.loc[ii,"OPERATING_INVERTERS"] = total_inverters # assign it to the row ii
    #now we shall add this information to our df_check so that we can use it if we need to
    temp = pd.DataFrame([[ii_date_time, total_power, total_inverters]], columns = ["DATE_TIME", "TOTAL_DC_POWER", "OPERATING_INVERTERS"])
    df_check = df_check.append(temp)

# now each each inverter at the same date_time will have the 'TOTAL_DC_POWER'
# drop the source keys
df_copy.drop("SOURCE_KEY",axis=1, inplace=True)
df_copy.drop("DC_POWER",axis=1, inplace=True)
df_copy.drop("MONTH_DAY",axis=1, inplace=True)

# Now each date_time has many duplicates. drop all the duplicates
df_copy = df_copy.drop_duplicates().copy()
#now df_power_only has only the sum powers
# calculate avg power per inverter
df_copy["DC_POWER_PER_INVERTER"] = df_copy["TOTAL_DC_POWER"] / df_copy["OPERATING_INVERTERS"]
df_copy.drop("OPERATING_INVERTERS",axis=1, inplace=True)
df_copy.drop("TOTAL_DC_POWER",axis=1, inplace=True)

df_p1 = df_copy.copy()

print("Finished calculating new features for Plant "+plant)


plant = "2"
print ("Calculating Engineered Features for Plant "+plant+" ...")
df_copy = df_p2wg.copy()
df_copy.reset_index(inplace=True)
#make a new column to hold the sum of the DC_POWER
df_copy['TOTAL_DC_POWER'] = 0.0
df_copy['OPERATING_INVERTERS'] = 0.0

# get number of rows
n_rows = df_copy.shape[0]

#create a new dataframe to save redundant calculations
df_check = pd.DataFrame(columns= ["DATE_TIME", "TOTAL_DC_POWER", "OPERATING_INVERTERS"])
last_print = -1
# loop through all rows.
for ii in range(0,n_rows):
  percent_complete = round(100*ii/n_rows)
  if (percent_complete % 10 == 0):
    if percent_complete != last_print:
      print(str(percent_complete)+"% Complete")
      last_print = percent_complete
  ii_date_time = df_copy["DATE_TIME"].loc[ii] #get the date time for the current row ii
  check_mask = (df_check["DATE_TIME"] == ii_date_time) # make a mask to see if we have already encountered this datetime before
  already_calculated = np.dot(check_mask,check_mask) #use this mask to return a value of 1 or zero that we can put in an if statement
  if (already_calculated > 0.5): # use 0.5 not 1 or zero because floating points are evil i.e 0.9999999 != 1
    # well then we have already calulated this date_time value so we just get it drom df_check and log it in
    df_copy.loc[ii,"TOTAL_DC_POWER"] = np.dot(check_mask,df_check["TOTAL_DC_POWER"])
    df_copy.loc[ii,"OPERATING_INVERTERS"] = np.dot(check_mask,df_check["OPERATING_INVERTERS"])
  else:
    # we have not calculated this before
    # get a boolean mask of all the rows which have the same date time
    mask = (df_copy["DATE_TIME"] == ii_date_time).astype('float') # This gives us all the inverters operating at this datetime which we need to sum to get the total power
    total_power = np.dot(mask,df_copy['DC_POWER']) # take the dot product of the mask with all the rows for DC_POWER which will give us the sum 
    df_copy.loc[ii,"TOTAL_DC_POWER"] = total_power # assign it to the row ii
    total_inverters = np.dot(mask,mask) # take a dot product of the mask with itselft to count all the operating inverters
    df_copy.loc[ii,"OPERATING_INVERTERS"] = total_inverters # assign it to the row ii
    #now we shall add this information to our df_check so that we can use it if we need to
    temp = pd.DataFrame([[ii_date_time, total_power, total_inverters]], columns = ["DATE_TIME", "TOTAL_DC_POWER", "OPERATING_INVERTERS"])
    df_check = df_check.append(temp)

# now each each inverter at the same date_time will have the 'TOTAL_DC_POWER'
# drop the source keys
df_copy.drop("SOURCE_KEY",axis=1, inplace=True)
df_copy.drop("DC_POWER",axis=1, inplace=True)
df_copy.drop("MONTH_DAY",axis=1, inplace=True)

# Now each date_time has many duplicates. drop all the duplicates
df_copy = df_copy.drop_duplicates().copy()
#now df_power_only has only the sum powers
# calculate avg power per inverter
df_copy["DC_POWER_PER_INVERTER"] = df_copy["TOTAL_DC_POWER"] / df_copy["OPERATING_INVERTERS"]
df_copy.drop("OPERATING_INVERTERS",axis=1, inplace=True)
df_copy.drop("TOTAL_DC_POWER",axis=1, inplace=True)

df_p2 = df_copy.copy()

print("Finished calculating new features for Plant "+plant)


Calculating Engineered Features for Plant 1 ...
0% Complete
10% Complete
20% Complete
30% Complete
40% Complete
50% Complete
60% Complete
70% Complete
80% Complete
90% Complete
100% Complete
Finished calculating new features for Plant 1
Calculating Engineered Features for Plant 2 ...
0% Complete
10% Complete
20% Complete
30% Complete
40% Complete
50% Complete
60% Complete
70% Complete
80% Complete
90% Complete
100% Complete
Finished calculating new features for Plant 2


In [None]:
display(df_p1)

Unnamed: 0,DATE_TIME,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,DC_POWER_PER_INVERTER
0,2020-05-15 06:00:00,24.088446,22.206757,0.005887,44.864229
21,2020-05-15 06:15:00,24.011635,22.353459,0.022282,292.481009
42,2020-05-15 06:30:00,23.976731,22.893282,0.049410,694.143398
64,2020-05-15 06:45:00,24.218990,24.442444,0.095394,1301.208604
86,2020-05-15 07:00:00,24.537398,27.185653,0.141940,1879.359740
...,...,...,...,...,...
33874,2020-06-17 18:00:00,24.130349,25.080925,0.041940,605.090909
33896,2020-06-17 18:15:00,24.038157,24.068250,0.023446,338.664773
33918,2020-06-17 18:30:00,23.840239,22.968658,0.007007,78.116883
33940,2020-06-17 18:45:00,23.583049,22.460372,0.000039,0.000000


#PM5. Project Assignment: Linear Regression

The objective of this assignment is for you to perform a complete implementation of linear
regression using your group’s chosen dataset.

## Prior to QuAM Building:




In [None]:
df = df_p1.copy()
display(df)

Unnamed: 0,DATE_TIME,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,DC_POWER_PER_INVERTER
0,2020-05-15 06:00:00,24.088446,22.206757,0.005887,44.864229
21,2020-05-15 06:15:00,24.011635,22.353459,0.022282,292.481009
42,2020-05-15 06:30:00,23.976731,22.893282,0.049410,694.143398
64,2020-05-15 06:45:00,24.218990,24.442444,0.095394,1301.208604
86,2020-05-15 07:00:00,24.537398,27.185653,0.141940,1879.359740
...,...,...,...,...,...
33874,2020-06-17 18:00:00,24.130349,25.080925,0.041940,605.090909
33896,2020-06-17 18:15:00,24.038157,24.068250,0.023446,338.664773
33918,2020-06-17 18:30:00,23.840239,22.968658,0.007007,78.116883
33940,2020-06-17 18:45:00,23.583049,22.460372,0.000039,0.000000


In [None]:
#Convert Date_Time into numeric values

df['NUMERIC_TIME'] = 0.0
df['NUMERIC_TIME'] =  (df['DATE_TIME'] - pd.to_datetime(df['DATE_TIME']).dt.floor('d')).astype('timedelta64[m]').astype(float)
df = df[['NUMERIC_TIME'] + df.columns[:-1].tolist()].copy()


In [None]:
#These variables are to be used for the plot labels later
inputs = df.drop('DC_POWER_PER_INVERTER',axis='columns')
feature_names = inputs.columns.tolist()
target = df['DC_POWER_PER_INVERTER']
target_name = 'DCPower'

In [None]:
df.drop('DATE_TIME',axis=1, inplace=True)
df.drop('MODULE_TEMPERATURE',axis=1, inplace=True)
#df.drop('DC_POWER_PER_INVERTER',axis=1, inplace=True)

In [None]:
display(df)

Unnamed: 0,NUMERIC_TIME,AMBIENT_TEMPERATURE,IRRADIATION,DC_POWER_PER_INVERTER
0,360.0,24.088446,0.005887,44.864229
21,375.0,24.011635,0.022282,292.481009
42,390.0,23.976731,0.049410,694.143398
64,405.0,24.218990,0.095394,1301.208604
86,420.0,24.537398,0.141940,1879.359740
...,...,...,...,...
33874,1080.0,24.130349,0.041940,605.090909
33896,1095.0,24.038157,0.023446,338.664773
33918,1110.0,23.840239,0.007007,78.116883
33940,1125.0,23.583049,0.000039,0.000000


In [None]:
target

0          44.864229
21        292.481009
42        694.143398
64       1301.208604
86       1879.359740
            ...     
33874     605.090909
33896     338.664773
33918      78.116883
33940       0.000000
33962       0.000000
Name: DC_POWER_PER_INVERTER, Length: 1558, dtype: float64


## Splitting data into testing and training.

In [None]:
#Split Dataset into training and test data
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, random_state=0)

X_train = df_train.iloc[:, :-1].values #all but last column
y_train = df_train.iloc[:, -1].values  #last column

X_test = df_test.iloc[:, :-1].values
y_test = df_test.iloc[:, -1].values

X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [None]:
display(X_train)

array([[8.40000000e+02, 2.99116638e+01, 6.44802365e-01],
       [1.05000000e+03, 2.59601472e+01, 1.01433166e-01],
       [9.30000000e+02, 3.20721654e+01, 5.63544357e-01],
       ...,
       [7.65000000e+02, 2.76670817e+01, 3.84930638e-01],
       [4.35000000e+02, 2.47110291e+01, 1.54212341e-01],
       [7.35000000e+02, 2.99806071e+01, 7.11932499e-01]])

In [None]:
y_train

array([ 9034.46103895,  1488.56590909,  7443.25487009, ...,
        5811.92613636,  1957.140422  , 10191.84415591])

## Linear Regression QuAM





###sklearn.linear_model.LinearRegression

1. Use scikit-learn’s sklearn.linear_model.LinearRegression to implement alinear surface for your dataset. 


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [None]:
linear_regressor = LinearRegression()

linear_regressor.fit(X, y)

print("b : ", linear_regressor.intercept_)
print("w :", linear_regressor.coef_)

yhat_train = linear_regressor.predict(X_train)
yhat_test = linear_regressor.predict(X_test)

print('MAE:', metrics.mean_absolute_error(y_test, yhat_test))
print('MSE:', metrics.mean_squared_error(y_test, yhat_test))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, yhat_test)))
print('R2:', metrics.r2_score(y_test, yhat_test))

NameError: ignored

Let's visualize the surface:


In [None]:
#Lets figure out the shape of our data
m = X.shape[0]
n = X.shape[1]

X_b = np.c_[np.ones((m, 1)), X]
n_b = n + 1

#Now, we can implement the gradient descent algorithm. Let's set values for constants involved in GD:

tolerance = 0.01 # the desired level of precision to stop at
alpha = 0.1 # step size (or learning rate)

#Let's take a random starting point:
np.random.seed(42) # for experiment control purposes
w_b_0 = np.random.normal(0.0, 1.0, (n_b, 1))

#Let's use that to initialize our weights:
w_b = w_b_0
w_b

#Let's calculate our error and loss with this random starting point. We have to first predict the label for the points:
yhat = X_b @ w_b
#Then we can use that prediction to calculate the difference between  y^  and  y  and the loss:
y_diff = yhat - y
loss = (y_diff.T @ y_diff)[0, 0] / m

#We also need to define a loss before the current step in order to have a valid loop condition for the main potimization loop
old_loss = np.infty

#Main Loop
while np.abs(old_loss - loss) > tolerance: # stops when the change in losses in
                                           # two consecutive steps is less than
                                           # the predefined tolerance.
  dJdw = 2 * (X_b.T @ y_diff) / m # the gradient, using the formula found
  #w_b -= alpha * dJdw # gradient descent
  old_loss = loss # updating old loss
  yhat = X_b @ w_b # updating the prediction
  y_diff = yhat - y
  loss = (y_diff.T @ y_diff)[0, 0] / m # new loss

#Let's visualize the linear surface:

x_vis_1_range = np.linspace(-1.0, 1.0, 100)
x_vis_2_range = np.linspace(-1.0, 1.0, 100)
#x_vis_3_range = np.linspace(-1.0, 1.0, 100) #IS added to fix dimension mismatch

XX_vis_1, XX_vis_2 = np.meshgrid(x_vis_1_range, x_vis_2_range)
#XX_vis_1, XX_vis_2, XX_vis_3 = np.meshgrid(x_vis_1_range, x_vis_2_range, x_vis_3_range) #IS added to fix dimension mismatch
X_vis = np.c_[XX_vis_1.flatten(), XX_vis_2.flatten()]
#X_vis = np.c_[XX_vis_1.flatten(), XX_vis_2.flatten(), XX_vis_3.flatten()] #IS added to fix dimension mismatch
grid_shape = XX_vis_2.shape
column_shape = XX_vis_1.flatten().shape

X_vis_b = np.c_[np.ones(column_shape), X_vis]

In [None]:
grid_shape

(100, 100)

In [None]:
X_vis


array([[-1.        , -1.        ],
       [-0.97979798, -1.        ],
       [-0.95959596, -1.        ],
       ...,
       [ 0.95959596,  1.        ],
       [ 0.97979798,  1.        ],
       [ 1.        ,  1.        ]])

In [None]:

#_______________________
#Rakesh's code below

y_vis = linear_regressor.predict(X_vis)
YY_vis = y_vis.reshape(grid_shape)



fig = go.Figure()

fig.add_trace(go.Scatter3d(x=X_b[:, 1],
                          y=X_b[:, 2],
                          z=y[:, 0],
                          mode='markers',
                          marker=dict(color=(y_diff ** 2)[:, 0],
                                      colorscale='Teal',
                                      opacity=0.75
                                     )
                         )
            )
fig.add_trace(go.Surface(x=x_vis_2_range,
                        y=x_vis_3_range,
                        z=YY_vis, 
                        showscale=False
                       )
            )

fig.update_layout(scene=dict(xaxis_title="x_1",
                            yaxis_title="x_2",
                            zaxis_title="y"
                           ),
                 showlegend=False
                )

fig.show()

###sklearn.linear_model.Ridge

2. Use scikit-learn’s sklearn.linear_model.Ridge to implement linear least squares with L2 regularization for your dataset using the default parameters.


In [None]:
from sklearn.linear_model import Ridge
ridge_regressor = Ridge()
ridge_regressor.fit(X, y)

print("b :", ridge_regressor.intercept_)  #IS: got rid of 0
print("w :", ridge_regressor.coef_)

from sklearn import metrics
yhat_train = ridge_regressor.predict(X_train)
yhat_test = ridge_regressor.predict(X_test)
print('MAE:', metrics.mean_absolute_error(y_test, yhat_test))
print('MSE:', metrics.mean_squared_error(y_test, yhat_test))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, yhat_test)))
print('R2:', metrics.r2_score(y_test, yhat_test))

b : -217.96471419132013
w : [-1.74138957e-01  2.48566994e+01  1.27350894e+04]
MAE: 302.8342670067443
MSE: 193896.24348369596
RMSE: 440.33651164046796
R2: 0.9865718101107295


3. Use scikit-learn’s sklearn.linear_model.Ridge to implement linear least squares with L2 regularization for your dataset tweak the value of alpha and report your findings.

In [None]:
from sklearn.linear_model import Ridge

In [None]:
lambda_ridge = 0.1
ridge_regressor = Ridge(alpha=lambda_ridge)
ridge_regressor.fit(X, y)

print("b :", ridge_regressor.intercept_)  #IS: got rid of 0
print("w :", ridge_regressor.coef_)

b : -1.4452683225899818
w : [-3.78220490e-02  1.05703659e+01  1.29182949e+04]


In [None]:
from sklearn import metrics
yhat_train = ridge_regressor.predict(X_train)
yhat_test = ridge_regressor.predict(X_test)
print('MAE:', metrics.mean_absolute_error(y_test, yhat_test))
print('MSE:', metrics.mean_squared_error(y_test, yhat_test))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, yhat_test)))
print('R2:', metrics.r2_score(y_test, yhat_test))

MAE: 295.36534091398653
MSE: 195055.36485626874
RMSE: 441.6507272226196
R2: 0.9864915357247178


Let's visualize that as well:


In [None]:
y_vis = ridge_regressor.predict(X_vis)
YY_vis = y_vis.reshape(grid_shape)

fig = go.Figure()

fig.add_trace(go.Scatter3d(x=X_b[:, 1],
                          y=X_b[:, 2],
                          z=y[:, 0],
                          mode='markers',
                          marker=dict(color=(y_diff ** 2)[:, 0],
                                      colorscale='Teal',
                                      opacity=0.75
                                     )
                         )
            )
fig.add_trace(go.Surface(x=x_vis_1_range,
                        y=x_vis_2_range,
                        z=YY_vis, 
                        showscale=False
                       )
            )

fig.update_layout(scene=dict(xaxis_title="x_1",
                            yaxis_title="x_2",
                            zaxis_title="y"
                           ),
                 showlegend=False
                )

fig.show()

###sklearn.linear_model.Lasso

4. Use scikit-learn’s sklearn.linear_model.Lasso to implement linear least squares with L1 regularization for your dataset using the default parameters.

In [None]:
from sklearn.linear_model import Lasso

In [None]:
lasso_regressor = Lasso()
lasso_regressor.fit(X, y)

print("b :", lasso_regressor.intercept_) #IS: removed [0] - invalid index of non-iterable variable
print("w :", lasso_regressor.coef_)

b : -1.3202312314415394
w : [-3.86678045e-02  1.06239062e+01  1.29159263e+04]


In [None]:
from sklearn import metrics
yhat_train = lasso_regressor.predict(X_train)
yhat_test = lasso_regressor.predict(X_test)
print('MAE:', metrics.mean_absolute_error(y_test, yhat_test))
print('MSE:', metrics.mean_squared_error(y_test, yhat_test))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, yhat_test)))
print('R2:', metrics.r2_score(y_test, yhat_test))

MAE: 295.50340835020745
MSE: 195028.26090745203
RMSE: 441.62004133355634
R2: 0.9864934127954899


5. Use scikit-learn’s sklearn.linear_model.Lasso to implement linear least squares with L1 regularization for your dataset tweak the value of alpha and report your findings.

In [None]:
from sklearn.linear_model import Lasso

In [None]:
lambda_lasso = 0.1

lasso_regressor = Lasso(alpha=lambda_lasso)
lasso_regressor.fit(X, y)

print("b :", lasso_regressor.intercept_) #IS: removed [0] - invalid index of non-iterable variable
print("w :", lasso_regressor.coef_)

b : 20.5708839250683
w : [-2.40543560e-02  9.12393926e+00  1.29366738e+04]


In [None]:
from sklearn import metrics
yhat_train = lasso_regressor.predict(X_train)
yhat_test = lasso_regressor.predict(X_test)
print('MAE:', metrics.mean_absolute_error(y_test, yhat_test))
print('MSE:', metrics.mean_squared_error(y_test, yhat_test))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, yhat_test)))
print('R2:', metrics.r2_score(y_test, yhat_test))

MAE: 294.8065475955234
MSE: 195315.54904429673
RMSE: 441.94518782796666
R2: 0.9864735167955202


###sklearn.linear_model.GridSearchCV

Grid Search Review 


In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
lasso_search = GridSearchCV(estimator=Lasso(), \
                            cv=5, \
                            param_grid=dict(alpha=np.logspace(-3, 0, 50)), \
                            scoring='r2')

In [None]:
lasso_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': array([0.001     , 0.0011514 , 0.00132571, 0.00152642, 0.00175751,
       0.00202359, 0.00232995, 0.00...
       0.01676833, 0.01930698, 0.02222996, 0.02559548, 0.02947052,
       0.03393222, 0.0390694 , 0.04498433, 0.05179475, 0.05963623,
       0.06866488, 0.07906043, 0.09102982, 0.10481131, 0.12067926,
       0.13894955, 0.15998587, 0.184207  , 0.21209509, 0.24420531,
       0.28117687, 0.32374575, 0.37275937, 0.42919343, 0.49417134,
       0.5689866 , 0.65512856, 0.75431201, 0.86851137, 1.        ])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
   

In [None]:
from sklearn import metrics
yhat_train = lasso_search.predict(X_train)
yhat_test = lasso_search.predict(X_test)
print('MAE:', metrics.mean_absolute_error(y_test, yhat_test))
print('MSE:', metrics.mean_squared_error(y_test, yhat_test))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, yhat_test)))
print('R2:', metrics.r2_score(y_test, yhat_test))

MAE: 294.8106947902112
MSE: 197293.95948674114
RMSE: 444.1778466861457
R2: 0.986336502944077


In [None]:
lasso_search.best_estimator_

Lasso(alpha=0.18420699693267165, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

## Evaluating the Algorithm

1. To evaluate the performance of your regressors, the commonly used metrics are mean absolute error, mean squared error, root mean squared error and r2 score.The Scikit-Learn library contains functions that can help calculate these values for us. To do so, use this code from the metrics package:

from sklearn import metrics <br>
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred)) <br>
print('Mean Squared Error:',metrics.mean_squared_error(y_test,y_pred)) <br>
print('Root Mean Squared Error:', <br>
np.sqrt(metrics.mean_squared_error(y_test, y_pred))) <br>
print('Coefficient of determination: %.2f' % r2_score (y_test,y_pred)) <br>

In [None]:
from sklearn import metrics

###RMSE


In [None]:
rmse_train = metrics.mean_squared_error(y, yhat)
print("RMSE on training data:", rmse_train)
rmse_test = metrics.mean_squared_error(y_test, yhat_test)
print("RMSE on test data:", rmse_test)

RMSE on training data: 46283860.73878691
RMSE on test data: 47314579.78652875


###MAE

In [None]:
mae_train = metrics.mean_absolute_error(y, yhat)
print("MAE on training data:", mae_train)
mae_test = metrics.mean_absolute_error(y_test, yhat_test)
print("MAE on test data:", mae_test)

MAE on training data: 5606.055928611738
MAE on test data: 5736.57659920749


###R2  score

In [None]:
r_squared_train = metrics.r2_score(y, yhat)
print("R-squared on training data:", r_squared_train)
r_squared_test = metrics.r2_score(y_test, yhat_test)
print("R-squared on test data:", r_squared_test)

R-squared on training data: -2.1059494071232314
R-squared on test data: -2.276748174638943


## QuAM Comparison

Compare your Regressors: provide a comparison table of the performance for all of the QuAMs you’ve built using the different losses. This is a practice you should get in the habit of doing as a technician to quickly evaluate which QuAMs work well and which don’t. It will also serve as a communication tool to summarize to stakeholders what you’ve tried, what worked best, and why.


<table>
<caption><strong><h2>Team 6 QuAM comparison
	<tr>
		<th>Algorithm</th>
		<th>MAE</th>
		<th>MSE</th>
    <th>RMSE</th>
    <th>R2 Score</th>
	</tr>
	<tr>
		<td> sklearn.linear_model.LinearRegression
		<td> 294.73262
		<td> 195349.53508 
    <td> 441.98364
		<td> 0.98647
	</tr>
		<tr>
		<td> sklearn.linear_model.Ridge
		<td> 295.36534
    <td> 195055.36486
    <td> 441.65073
    <td> 0.98650
	</tr>
  	<tr>
		<td> sklearn.linear_model.Lasso
		<td> 294.80655
    <td> 195315.54904
    <td> 441.94519
    <td> 0.98647
	</tr>
  	<tr>
		<td> sklearn.linear_model.GridSearchCV
		<td> 294.81069
    <td> 197293.95949
    <td> 444.17785
    <td> 0.98633
	</tr>

</table>