In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Multiple Linear Regression

Use of more than one feature variable:
$Y_i = Bias_0 + Weight_1 Feature_1 + Weight_2 Feature_2 + \ldots + Weight_p Feature_p$

In [None]:
# Read the movie CSV (path is relative to the notebook's working directory).
movies_csv = "Resources/imdb_final.csv"
movies = pd.read_csv(movies_csv)

# Show the first rows as a quick check of what was loaded.
movies.head()

# Preprocessing 
### Label Encoding and One-Hot Encoding
Some of the data is categorical and must be converted to numeric arrays before it can be used in a model. This is done in two steps:

1. **Label Encoding**. First, we map each class to an integer label, for every categorical column that needs to be preprocessed.
   - From the iris example: convert the three possible classes to integer labels. E.g., `iris-setosa` will be `0`; `iris-versicolour`, `1`; and `iris-virginica`, `2` (`LabelEncoder` numbers classes starting from `0`).
2. **One-Hot Encoding**. Then, we set each row's `class` value to an _array_. This array will have a `1` in whichever slot corresponds to the integer label. 

In [None]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import LabelEncoder

# Step 1: Reformat data (this is from the Iris example, there were three iris categories)
# Build `df` here so the cell runs on a fresh kernel: four numeric feature
# columns followed by a fifth column holding the class name as a string.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df["class"] = iris.target_names[iris.target]

data = df.values
X = data[:, 0:4]  # first four columns: the features
y = data[:, 4]    # fifth column: the class name
# NOTE: X and y are demo values only; the "Shape the data" cell reassigns
# both from `movies` before any model is trained.

# Step 2: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y)
encoded_y = label_encoder.transform(y)

# Print the mapping once per class rather than once per sample.
# `classes_` is ordered so that a class's position is its encoded label.
for label, original_class in enumerate(label_encoder.classes_):
    print('Original Class: ' + str(original_class))
    print('Encoded Label: ' + str(label))
    print('-' * 12)

In [None]:
from keras.utils import to_categorical

# Step 3: One-hot encoding
# Each integer label becomes an array with a 1 in the slot that corresponds
# to that label.
one_hot_y = to_categorical(encoded_y)

# Preview the shape and the first few rows instead of echoing the whole array.
print(one_hot_y.shape)
one_hot_y[:5]

### Shape the data

In [None]:
# Predictor columns taken from `movies`.
feature_columns = [
    "year",
    "genre",
    "duration",
    "country",
    "director",
    "production_company",
    "budget",
]

# LinearRegression needs numeric inputs. get_dummies one-hot encodes
# string/categorical columns and passes numeric columns through unchanged,
# so this changes nothing if the CSV is already fully numeric.
# NOTE(review): columns with many distinct values (presumably `director` and
# `production_company`) will expand into many indicator columns -- confirm
# this is acceptable for the model.
X = pd.get_dummies(movies[feature_columns], dtype=float)

# Target reshaped to a single-column 2-D array.
y = movies["median_vote"].values.reshape(-1, 1)
print(X.shape, y.shape)

### Use train_test_split to create training data and testing data

In [None]:
from sklearn.model_selection import train_test_split

# Split features and target into train/test subsets; the fixed random_state
# keeps the split the same on every run.
split_parts = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

### Create model using Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Create the estimator with its default settings; it is fitted in a later cell.
model = LinearRegression()

### Fit the model to the training data and calculate the scores for the training and testing data

In [None]:
# Fit on the training split only.
model.fit(X_train, y_train)

# Score the fitted model on the split it was trained on and on the held-out split.
training_score, testing_score = (
    model.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

print(
    f"Training Score: {training_score}",
    f"Testing Score: {testing_score}",
    sep="\n",
)

### Plot the residuals for the training and testing data

In [None]:
# Predict once per split and reuse the result for both plot axes.
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)

# Residual here is (predicted - actual), plotted against the predicted value.
plt.scatter(train_predictions, train_predictions - y_train, c="blue", label="Training Data")
plt.scatter(test_predictions, test_predictions - y_test, c="orange", label="Testing Data")
plt.legend()

# Draw the zero-residual reference line across the plotted x-range (the
# predictions), not the range of the target values.
x_low = min(train_predictions.min(), test_predictions.min())
x_high = max(train_predictions.max(), test_predictions.max())
plt.hlines(y=0, xmin=x_low, xmax=x_high)

plt.xlabel("Predicted value")
plt.ylabel("Residual (predicted - actual)")
plt.title("Residual Plot");