# Multiple Linear Regression

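Multiple linear regression predicts the target from more than one feature variable:
$Y_i = \text{Bias}_0 + \text{Weight}_1 \cdot \text{Feature}_1 + \text{Weight}_2 \cdot \text{Feature}_2 + \ldots + \text{Weight}_p \cdot \text{Feature}_p$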

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as st
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# Read the IMDb export and keep only the columns this analysis works with,
# then preview the first few rows.
selected_columns = ["year", "genre", "duration", "director", "budget", "rating_class", "median_vote"]
movies = pd.read_csv("Resources/imdb_final.csv")[selected_columns]
movies.head()

### Identify if there is a linear relationship between any of the variables

In [None]:
# Median vote on the x-axis against genre on the y-axis, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.scatter(movies["median_vote"], movies["genre"])
ax.set(xlabel='Rating', ylabel='Genre')
plt.show()

In [None]:
# Median vote on the x-axis against duration on the y-axis, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.scatter(movies["median_vote"], movies["duration"])
ax.set(xlabel='Rating', ylabel='Duration')
plt.show()

In [None]:
# Median vote on the x-axis against director on the y-axis, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.scatter(movies["median_vote"], movies["director"])
ax.set(xlabel='Rating', ylabel='Director')
plt.show()

In [None]:
# Median vote on the x-axis against budget on the y-axis, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.scatter(movies["median_vote"], movies["budget"])
ax.set(xlabel='Rating', ylabel='Budget')
plt.show()

### There is no clear linear relationship between the independent variables and the dependent variable; therefore, multiple linear regression is not expected to fit this data well.

In [None]:
# Encode the rating classes as integers (Bad=0, Good=1, Excellent=2).
# The replacement is applied to the rating_class column only, so identical
# strings that might occur in other text columns (genre, director) are left
# untouched. One mapping pass replaces three frame-wide in-place calls, and
# values not listed in the mapping are kept as they are.
rating_class_codes = {"Bad": 0, "Good": 1, "Excellent": 2}
movies["rating_class"] = movies["rating_class"].replace(rating_class_codes)
movies.tail()

### Shape the data

In [None]:
# Predictors go into X; the target (median_vote) is reshaped into a
# single-column 2-D array. Print both shapes as a sanity check.
feature_columns = ["year", "genre", "duration", "director", "budget"]
X = movies[feature_columns]
y = movies["median_vote"].values.reshape(-1, 1)
print(X.shape, y.shape)

In [None]:
# Take an independent (deep) copy of the feature frame and display it.
data = X.copy(deep=True)
data

### Preprocessing Data and Scaling
- Dummy encoding of data to translate it from categorical to numerical.
- Encode the data using pandas get_dummies
- Split the data into training and test data using train_test_split
- Scale the features and the target with StandardScaler (fitted on the training data)

In [None]:
# One-hot encode the two categorical columns and preview the widened frame.
categorical_columns = ["genre", "director"]
data_binary_encoded = pd.get_dummies(data=data, columns=categorical_columns)
data_binary_encoded.head()

In [None]:
# Split features and target into train/test partitions; the fixed seed makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data_binary_encoded,
    y,
    random_state=42,
)

In [None]:
# Fit one scaler on the training features and one on the training target.
# The test split is never used for fitting.
X_scaler = StandardScaler()
X_scaler.fit(X_train)
y_scaler = StandardScaler()
y_scaler.fit(y_train)

# Standardize both splits with the training-fitted scalers. The right-hand
# sides are evaluated before any name is rebound, so each transform sees the
# unscaled input.
X_train, X_test = X_scaler.transform(X_train), X_scaler.transform(X_test)
y_train, y_test = y_scaler.transform(y_train), y_scaler.transform(y_test)

### Create model using Linear Regression
- Fit the model to the training data and calculate the scores for the training and testing data
- Review metrics (MSE and R2) of model

In [None]:
# Ordinary least-squares regressor; fit_intercept=True is scikit-learn's default,
# spelled out here for the reader.
model = LinearRegression(fit_intercept=True)

In [None]:
# Fit the regression on the training split.
model.fit(X_train, y_train)

# R^2 on each split; comparing the two shows how well the fit generalizes.
training_score = model.score(X_train, y_train)
testing_score = model.score(X_test, y_test)

print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

# y_test was standardized by y_scaler earlier in the notebook, so this MSE is
# expressed in standardized units rather than in raw vote units.
predictions = model.predict(X_test)
MSE = mean_squared_error(y_test, predictions)
# model.score(X_test, y_test) was already evaluated above; reuse that value
# instead of running an identical second scoring pass over the test set.
r2 = testing_score

print(f"MSE: {MSE}, R2: {r2}")

### Plot the residuals for the training and testing data

In [None]:
# Predict once per split and reuse the arrays for both the x-values and the residuals.
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)

# Residual = actual - predicted, plotted against the predicted value.
plt.scatter(train_predictions, y_train - train_predictions, c="blue", label="Training Data")
plt.scatter(test_predictions, y_test - test_predictions, c="orange", label="Testing Data")
plt.legend()

# The x-axis shows predictions, so the zero-residual reference line has to span
# the range of the predictions (the range of y_test can differ from it).
x_min = min(train_predictions.min(), test_predictions.min())
x_max = max(train_predictions.max(), test_predictions.max())
plt.hlines(y=0, xmin=x_min, xmax=x_max)
plt.xlabel("Predicted value")
plt.ylabel("Residual")
plt.title("Residual Plot")
plt.show()
