## Import Libraries

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

## Load the Dataset


In [None]:
# Read the dataset from a CSV path given relative to the current working directory
df = pd.read_csv("data/linear.csv")
# Preview the first rows of the dataframe
df.head()

## Remove or Impute Missing Values


In [None]:
# Shape of the dataframe as (number of rows, number of columns), before any rows are removed
df.shape

In [None]:
# Count the missing values in each column; the y column (our prediction target) has some
df.isna().sum()

In [None]:
# Drop every row that has a NaN in at least one column (how='any');
# how='all' would instead drop only the rows whose values are all NaN.
# Dropping rows may not be the best strategy for our dataset (imputing is the alternative).
df = df.dropna(how = 'any')

In [None]:
# Re-count after the drop: every column should now report zero missing values
df.isna().sum()

In [None]:
# Shape after dropping the rows with missing values; compare with the shape shown before the drop
df.shape

## Arrange Data into Features Matrix and Target Vector

In [None]:
# Features matrix: select 'x' with a list so the result stays 2-D, then convert it to a numpy array
X = df[['x']].to_numpy()

In [None]:
# The features matrix needs to be 2-dimensional: (n_samples, n_features)
X.shape

In [None]:
# Target vector: the 'y' column as a 1-D numpy array
y = df['y'].to_numpy()

In [None]:
# The target vector is 1-dimensional: one value per row of X
y.shape

## Linear Regression


In [None]:
# Make a linear regression instance; fit_intercept=True asks the model to fit an intercept term as well
reg = LinearRegression(fit_intercept=True)

In [None]:
# Fit the model on the features matrix X and the target vector y
reg.fit(X,y)

Predict for One Observation

In [None]:
# A single observation must be passed as a 2-D array of shape (1, n_features).
# reshape(1, -1) produces that for any number of feature columns, whereas
# reshape(-1, 1) would turn every feature value into a separate sample.
reg.predict(X[0].reshape(1, -1))

Predict for Multiple Observations at Once

In [None]:
# A 2-D slice of X predicts for the first ten rows in a single call
reg.predict(X[:10])

## Measuring Model Performance

In [None]:
# Score the model on the same X, y it was fitted on and print the result
# (this value is reported as R2 in the plot titles further down)
score = reg.score(X, y)
print(score)

## What is the equation of the line for the regression?



In [None]:
# Fitted coefficients of the model; with the single feature 'x' the first entry is the slope
reg.coef_

In [None]:
# Fitted intercept of the model
reg.intercept_

In [None]:
# Slope: the coefficient of the single feature column in X
m = reg.coef_[0]

# Intercept of the fitted line
b = reg.intercept_

# Slope-intercept form. The intercept's sign is printed explicitly so that a
# negative intercept reads "- 1.23" instead of "+ -1.23".
print("formula: y = {:.2f}x {} {:.2f}".format(m, '+' if b >= 0 else '-', abs(b)))

## Plotting the Best Fit Linear Regression Line in Red

In [None]:
# Scatter the observations in black and overlay the fitted regression line in red.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 7))

ax.scatter(X, y, color='black')
ax.plot(X, reg.predict(X), color='red', linewidth=3)
ax.grid(True,
        axis='both',
        zorder=0,
        linestyle=':',
        color='k')
ax.tick_params(labelsize=18)
ax.set_xlabel('x', fontsize=24)
ax.set_ylabel('y', fontsize=24)

# The title reuses m, b and score computed in the earlier cells. The intercept's
# sign is rendered explicitly so a negative value reads "- 1.23", not "+ -1.23".
ax.set_title(
    "Linear Regression Line with Intercept y = {:.2f}x {} {:.2f} (R2 = {:.2f})".format(
        m, '+' if b >= 0 else '-', abs(b), score),
    fontsize=16,
)
fig.tight_layout()
# Uncomment to save the figure to disk:
#fig.savefig('images/linearregression', dpi = 300)

### Plotting Models With or Without Intercept
In this section, you will see how changing a hyperparameter value (here, `fit_intercept`) can have a drastic impact on the R2 score.

In [None]:
# Baseline for the comparison: a model that fits an intercept, configured like `reg` earlier in the notebook.
# fit() returns the estimator itself, so construction and fitting can be chained.
reg_inter = LinearRegression(fit_intercept=True).fit(X, y)
score_inter = reg_inter.score(X, y)
predictions_inter = reg_inter.predict(X)

In [None]:

# Fit one model per `fit_intercept` setting and draw the two fits side by side.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 7))

for panel, fit_intercept in zip(ax, (True, False)):
    model = LinearRegression(fit_intercept=fit_intercept)
    model.fit(X, y)
    predictions = model.predict(X)

    # Loop-local names: the notebook-level `m`, `b` and `score` from the earlier
    # cells are left untouched, so re-running those cells stays consistent.
    r2 = model.score(X, y)
    slope = model.coef_[0]
    intercept = model.intercept_

    panel.scatter(X, y, color='black')
    # Reuse the predictions computed above instead of calling predict() twice.
    panel.plot(X, predictions, color='red', linewidth=3)

    panel.tick_params(labelsize=18)
    panel.set_xlabel('x', fontsize=18)
    panel.set_ylabel('y', fontsize=18)
    # Both panels get the same fixed x-range and a y-axis starting at 0.
    panel.set_xlim(left=0, right=150)
    panel.set_ylim(bottom=0)

    # `{:+.2f}` always prints the intercept's sign, so a negative intercept
    # reads "x-1.23" rather than "x+-1.23".
    panel.text(50, 10, " y={:.2f}x{:+.2f} (R2={:.2f})".format(slope, intercept, r2), fontsize=12)
    panel.set_title('fit_intercept = {}'.format(fit_intercept), fontsize=20)

fig.tight_layout()