In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.linear_model
import sklearn.preprocessing
import sklearn.pipeline
import sklearn.metrics
import skillsnetwork
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

df = pd.read_csv("./automobile.csv", header=0)
df = df.replace('?', np.nan)
df = df.dropna()
df['price'] = df['price'].astype(float)
df['peak-rpm'] = df['peak-rpm'].astype(int)
df['horsepower'] = df['horsepower'].astype(int)
# print(df.dtypes)
# print(df.head())

#SECTION - Simple Linear Regression 
"""NOTE - Simple Linear Regression 
Simple Linear Regression is a method to help us understand the relationship between two variables:  
- the predictor/independent variable (X)
- The response/dependent variable (that we want to predict) (Y)

The result of Linear Regression is a linear function that predicts the response (dependent) variable as a function of the predictor (independent) variable.

Linear Function: 
Y = a + bX

- 'a' refers to the intercept of the regression line, in others words: the value of Y when X is 0
- 'b' refers to the slope of the regression line, in others words: the value with which Y changes when X changes by 1 unit
"""
# SECTION - LM
lm = sklearn.linear_model.LinearRegression()


# Create a linear function with 'highway-mpg' as the predictor and 'price' as the response.
X = df[['highway-mpg']]
Y = df['price']

lm.fit(X,Y)

Yhat = lm.predict(X)

#STUB - LM Print Statements
print("Yhat = ", Yhat)
# print(Yhat[0:5])
# print(lm.intercept_)
# print(lm.coef_)
# print("Equation of the line: \nPrice = ", lm1.coef_, "* highway-mpg + ", lm1.intercept_)
#!SECTION

# SECTION - LM1
lm1 = sklearn.linear_model.LinearRegression()
X = df[['engine-size']]
Y = df['price']

lm1.fit(X,Y)

Yhat1 = lm1.predict(X)

#STUB - ML1 Print Statements
# print(Yhat[0:5])
# print("lm1 Y intercept: ", lm1.intercept_)
# print("lm1 coefficients/slope:", lm1.coef_)
# print("Equation of the line: \nPrice = ", lm1.coef_, "* Engine Size + ", lm1.intercept_)
#!SECTION
#!SECTION

#SECTION - Multiple Linear Regression
"""NOTE - Multiple Linear Regression
What if we want to predict care price using more than one variable?

Multiple Linear Regression is very similar to Simple Linear Regression (SLR) but this method is used to explain the relationship between one continuous response (dependent) variable and two or more predictor (independent) variables. Most of the real-world regression models involve multiple predictors.

Equation:
Yhat = a + b1X1 + b2X2 + b3X3 + b4X4...
"""


#SECTION - MLR

mlr = sklearn.linear_model.LinearRegression()

Z = df[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']]
mlr.fit(Z, df['price'])
Yhat = mlr.predict(Z)

#STUB - MLR Example Print Statements
# print(mlr.intercept_)
# print(mlr.coef_)
# print("Equation of the line: \nPrice = ", mlr.coef_[0], "* horsepower + ", mlr.coef_[1], "* curb-weight + ", mlr.coef_[2], "* engine-size + ", mlr.coef_[3], "* highway-mpg + ", mlr.intercept_)
#!SECTION

#SECTION - MLR2

mlr2 = sklearn.linear_model.LinearRegression()

Z = df[['normalized-losses', 'highway-mpg']]
mlr2.fit(Z, df['price'])
Yhat = mlr2.predict(Z)
#STUB - MLR2 Print Statements
# print(mlr2.intercept_)
# print(mlr2.coef_)
# print("Equation of the live given by: \nPrice = ", mlr2.coef_[0], "* normalized-losses + ", mlr2.coef_[1], "* highway-mpg + ", mlr2.intercept_)
#!SECTION
#!SECTION

#SECTION - Model Evaluation Using Visualization
#SECTION - Regression Plot
"""NOTE - Regression Plot
When it comes to simple linear regression, an excellent way to visualize the fit of our model is by using regression plots.

This plot will show a combination of scattered data points (a scatter-plot), as well as the fitted linear regression line going through the data. This will give us a reasonable estimate of the relationship between the two variables, the strength of the correlation, as well as the direction (positive or negative correlation).
"""

# Let's visualize highway-ppg as the potential predictor variable of price:

width = 12
height = 10
plt.figure(figsize=(width, height))
sns.regplot(x="highway-mpg", y="price", data=df)
plt.ylim(0,)
plt.savefig("./Plots/RegPlot/highway-mpgVprice")
plt.clf()

# One thing to keep in mind when looking at a regression plot is to pay attention to howe scattered the data points are around the regression line. This will give you a good indication of the variance of the data and whether a linear model would be the best fit or not. If the data is too far off from the line, this linear model might not be the best model for this data.


# Let's compare this plot to the regression plot of "peak-rpm"

plt.figure(figsize=(width, height))
sns.regplot(x="peak-rpm", y="price", data=df)
plt.ylim(0,)
plt.savefig("./Plots/RegPlot/peak-rpmVprice")
plt.clf()

#STUB - Correlation Print: Highway-mpg, peak-rpm, price
# print(df[['highway-mpg', 'peak-rpm', 'price']].corr())

#!SECTION
#SECTION - Residual Plot
"""NOTE - Residual Plot
A good way to visualize the variance of the data is to use a residual plot.

What is a residual?

The difference between the observed value (y) and the predicted value (Yhat) is called the residual (e). When we look at a regression plot, the residual is the distance from the data point to the fitted regression line.

So what is a residual plot?

A residual plot is a graph that shows the residuals on the vertical y-axis and the independent variable on the horizontal x-axis.

What do we pay attention to when looking at a residual plot?

We look at the spread of the residuals:
- If the points in a residual plot are randomly spread out around the x-axis, then a linear model is appropriate for the data.

Why is that? Randomly spread out residuals means that the variance is constant, and thus the linear model a good fit for this data.
"""

width = 12
height = 10
plt.figure(figsize=(width, height))
sns.residplot(x=df['highway-mpg'], y=df['price'])
plt.savefig("./Plots/ResidPlot/highway-mpgVprice")
plt.clf()

# From this residual plot, we can see that the residuals are not randomly spread around the x-axis, leading us to believe that maybe a non-linear model is more appropriate for this data.
#!SECTION

#SECTION - Visualizing Multiple Linear Regression
"""NOTE - Multiple Linear Regression
How do we visualize a model for Multiple Linear Regression? This gets a bit more complicated because you can't visualize it with regression or residual plot.

One way to look at the fit of the model is by looking at the distribution plot. We can look at the distribution of the fitted values that result from the model and compare it to the distribution of the actual values.

First, let's make a prediction:
"""
#TODO - Fix this:
# sklearn.set_config(transform_output="pandas")
# Z = df[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']]

# Y_hat = lm.predict(df[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']])
# plt.figure(figsize=(width, height))


# ax1 = sns.displot(df['price'], hist=False, color="r", label="Actual Value")
# sns.displot(Y_hat, hist=False, color="b", label="Fitted Values", ax=ax1)

# plt.title('Actual vs Fitted Values for Price')
# plt.xlabel('Price (in dollars)')
# plt.ylabel('Proportion of Cars')

# plt.show()
# plt.close()

#!SECTION

#SECTION - Polynomial Regression and Pipelines
"""NOTE - Polynomial Regression

Polynomial regression is a particular case of the general linear regression model or multiple linear regression models.

We get non-linear relationships by squaring or setting higher-order terms of the predictor variables.

There are different orders of polynomial regression:
- Quadratic 2nd Order
Yhat = a + b1X + b2X^2

- Cubic 3rd Order
Yhat = a + b1X + b2X^2 + b3X^3

- Higher-Order
Y = a + b1X + b2X^2 + b3X^3 + ... + bnX^n

We saw earlier that a linear model did not provide the best fit while using "highway-mpg" as the predictor variable. Let's see if we can try fitting a polynomial model to the data instead.

We will use the following function to plot the data:
"""

def PlotPolly(model, independent_variable, dependent_variable, Name):
    x_new = np.linspace(15, 55, 100)
    y_new = model(x_new)

    plt.plot(independent_variable, dependent_variable, '.', x_new, y_new, '-')
    plt.title('Polynomial Fit with Matplotlib for Price ~ Length')
    ax = plt.gca()
    ax.set_facecolor((0.898, 0.898, 0.898))
    fig = plt.gcf()
    plt.xlabel(Name)
    plt.ylabel('Price of Cars')
    
    plt.savefig('./Plots/PolyReg/Polynomial Fit with Matplotlib for Price ~ Length' + Name)
    plt.show()
    plt.close()
    

x = df['highway-mpg']
y = df['price']

# Here we use a polynomial of the 3rd order (cubic)
f = np.polyfit(x, y, 3)
p = np.poly1d(f)
print(p)
#STUB – 3rd Order Function Call
# PlotPolly(p, x, y, 'highway-mpg')

np.polyfit(x, y, 3)

f1 = np.polyfit(x, y, 11)
p1 = np.poly1d(f1)
print(p)
#STUB - 11th Order Function Call
# PlotPolly(p1, x, y, 'highway-mpg 11th Order')


#We can perform a polynomial transform on multiple features. First, we import the module: Polynomial Features.

PolynomialFeatures = sklearn.preprocessing.PolynomialFeatures

pr = PolynomialFeatures(degree=2)
# print(pr)

Z_pr = pr.fit_transform(Z)
# print(Z.shape)
# print(Z_pr.shape)

#SECTION - Pipelines

#Data Pipelines simplify the steps of processing the data. We use the module Pipeline to create a pipeline. We also use StandardScaler as a step in our pipeline.

StandardScaler = sklearn.preprocessing.StandardScaler
Pipeline = sklearn.pipeline.Pipeline
LinearRegression = sklearn.linear_model.LinearRegression

Input = [('scale', StandardScaler()), ('polynomial', PolynomialFeatures(include_bias=False)), ('model', LinearRegression())]

pipe = Pipeline(Input)
# print(pipe)

Z = Z.astype(float)
pipe.fit(Z, y)

ypipe = pipe.predict(Z)
# print(ypipe[0:4])

Input = [('scale',StandardScaler()), ('model', LinearRegression())]

pipe = Pipeline(Input)
pipe.fit(Z, y)
ypipe = pipe.predict(Z)
# print(ypipe[0:10]

#!SECTION
#!SECTION
#!SECTION

#SECTION - Measures for In-Sample Evaluation
"""NOTE  – Measures for In-Sample Evaluation

When evaluating our models, not only do we want to visualize the results, but we also want a quantitative measure to determine how accurate the model is.

Two very important measures that are often used in Statistics to determine the accuracy of a model are:
- R^2/R-squared
- Mean Squared Error (MSE)

R-Squared

Also known as the coefficient of determination, is a measure to indicate how close the data is to the fitted regression line.

The value of the R-squared is the percentage of variation of the response variable (y) that is explained by a linear model.

Mean Squared Error (MSE)

The Mean Squared Error measures the average of the squares of errors. That is, the difference between actual value (y) and the estimated value (ŷ).
"""

#SECTION - Model 1: Simple Linear Regression

#Let's calculate the R^2:
lm.fit(X, Y)
# Find the R^2
print('The R-square is: ', lm.score(X, Y))

Yhat = lm.predict(X)
print('The output of the first four predicted value is: ', Yhat[0:4])

mean_squared_error = sklearn.metrics.mean_squared_error()

mse = mean_squared_error(df['price'], Yhat)

print('The mean square error of price and predicted value is: ', mse)
#!SECTION


#SECTION - Model 2: Multiple Linear Regression

# fit the model
lm.fit(Z, df['price'])
# Find the R^2
print('The R-square is: ', lm.score(Z, df['price']))

Y_predict_multifit = lm.predict(Z)

print('The mean square error of price and predicted value using multifit is: ', mean_squared_error(df['price'], Y_predict_multifit))

#!SECTION


#SECTION - Model 3: Polynomial Fit


r2_score = sklearn.metrics.r2_score
r_squared = r2_score(y, p(x))
print('The R-square value is: ', r_squared)

#!SECTION

#SECTION - MSE

mean_squared_error(df['price'], p(x))

#!SECTION

#SECTION - Prediction and Decision Making

new_input = np.arange(1, 100, 1).reshape(-1, 1)

lm.fit(X, Y)

yhat = lm.predict(new_input)
yhat[0:5]

plt.plot(new_input, yhat)
plt.show()
#!SECTION

Yhat =  [12809.86873425 18052.11958005 16086.27551287 19362.6822915
 13465.15008997 13465.15008997 14120.4314457  14120.4314457
 -2261.60244743  4291.21110982  4291.21110982  5601.77382127
  7567.61788844 12809.86873425  7567.61788844  7567.61788844
  7567.61788844 12809.86873425 16741.5568686  -2916.88380316
  7567.61788844  4946.49246554 10188.74331134 10188.74331134
 10188.74331134 10188.74331134 10844.02466707 10844.02466707
 10844.02466707 10844.02466707 14120.4314457  12154.58737852
 20017.96364722 12154.58737852  7567.61788844  7567.61788844
  7567.61788844  7567.61788844 11499.3060228  11499.3060228
 11499.3060228  11499.3060228  11499.3060228  14775.71280142
 16086.27551287 16086.27551287 16086.27551287 16086.27551287
 20673.24500295  5601.77382127  7567.61788844  7567.61788844
 12809.86873425 12809.86873425 11499.3060228  11499.3060228
 11499.3060228  12809.86873425 12809.86873425  8222.89924417
  -295.75838026  8222.89924417  8222.89924417  8222.89924417
  8222.89924417  822

TypeError: missing a required argument: 'y_true'

<Figure size 1200x1000 with 0 Axes>

<Figure size 1200x1000 with 0 Axes>

<Figure size 1200x1000 with 0 Axes>