In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical

# Multiple-output Multiple Linear Regression

Multiple linear regression predicts the target from more than one feature variable:
$Y_i = \text{Bias}_0 + \text{Weight}_1 \text{Feature}_1 + \text{Weight}_2 \text{Feature}_2 + \ldots + \text{Weight}_p \text{Feature}_p$

In [None]:
# Read the IMDB export and keep only the five feature columns plus the target.
movies = pd.read_csv("Resources/imdb_final.csv")[
    ["year", "genre", "duration", "director", "budget", "rating_class"]
]
movies.head()

In [None]:
# Encode the rating labels as integers.
# The mapping is applied to the rating_class column only: a frame-wide
# replace would also rewrite any genre/director cell that happens to equal
# one of these strings. Rebinding `movies` (instead of mutating it in place)
# keeps the cell idempotent when it is re-run.
rating_codes = {"Bad": 0, "Good": 1, "Excellent": 2}
movies = movies.assign(rating_class=movies["rating_class"].replace(rating_codes))
movies.tail()

### Shape the data

In [None]:
# Feature matrix: every column except the target.
X = movies.loc[:, ["year", "genre", "duration", "director", "budget"]]
# Target as an (n, 1) column vector.
y = movies["rating_class"].to_numpy().reshape(-1, 1)
print(X.shape, y.shape)

In [None]:
# Work on a copy so the encoding steps below never touch X itself.
data = X.copy()
# Preview only the first rows rather than rendering the whole frame.
data.head()

### Preprocessing Data, Label Encoding, and Scaling
1. Dummy-encode (one-hot encode) the categorical feature columns to translate them from categorical to numerical values.
2. Encode the target labels using `LabelEncoder`.
3. Scale the data using `StandardScaler`.

In [None]:
# One-hot encode the two categorical columns; only the listed columns are expanded.
data_binary_encoded = pd.get_dummies(
    data=data,
    columns=["genre", "director"],
)
data_binary_encoded.head(5)

### Use train_test_split to create training data and testing data

In [None]:
# Hold out a test split; the fixed random_state keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data_binary_encoded,
    y,
    random_state=42,
)

In [None]:
# Label encoding.
# y_train / y_test are (n, 1) column vectors, but LabelEncoder works on 1-D
# arrays and warns (DataConversionWarning) when handed a column vector, so
# flatten before fit/transform and restore the column shape afterwards.
label_encoder = LabelEncoder()
label_encoder.fit(y_train.ravel())
encoded_y_train = label_encoder.transform(y_train.ravel()).reshape(-1, 1)
encoded_y_test = label_encoder.transform(y_test.ravel()).reshape(-1, 1)
# Preview a few rows instead of rendering the whole array.
encoded_y_train[:5]

In [None]:
# Fit one StandardScaler per array, using the training split only, so the
# test split is transformed with statistics learned from the training data.
X_scaler = StandardScaler()
X_scaler.fit(X_train)
y_scaler = StandardScaler()
y_scaler.fit(encoded_y_train)

# Apply the fitted scalers to both splits.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)
y_train_scaled, y_test_scaled = (
    y_scaler.transform(split) for split in (encoded_y_train, encoded_y_test)
)

print(X_train_scaled[0])

In [None]:
# One-hot encode the integer class labels.
# num_classes is pinned to the number of classes the encoder saw during fit:
# left unset, to_categorical infers the width from the largest label in each
# array, so a test split that lacks the highest class would get fewer columns
# than the training matrix.
n_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(encoded_y_train, num_classes=n_classes)
y_test_categorical = to_categorical(encoded_y_test, num_classes=n_classes)
y_train_categorical[0]

### Create model using Linear Regression

In [None]:
# Instantiate an unfitted scikit-learn LinearRegression estimator with its default settings.
model = LinearRegression()

### Fit the model to the training data and calculate the scores for the training and testing data

In [None]:
# Fit on the scaled training features against the one-hot targets.
model.fit(X=X_train_scaled, y=y_train_categorical)

# Score the fitted model on each split.
training_score = model.score(X=X_train_scaled, y=y_train_categorical)
testing_score = model.score(X=X_test_scaled, y=y_test_categorical)

print(
    f"Training Score: {training_score}",
    f"Testing Score: {testing_score}",
    sep="\n",
)

### Plot the residuals for the training and testing data

In [None]:
# Predict once per split and reuse the result for both the x-axis and the residuals.
train_predictions = model.predict(X_train_scaled)
test_predictions = model.predict(X_test_scaled)
train_residuals = train_predictions - y_train_categorical
test_residuals = test_predictions - y_test_categorical

fig, ax = plt.subplots()
# The arrays have one column per output; flatten them so every
# (prediction, residual) pair becomes one point.
ax.scatter(train_predictions.ravel(), train_residuals.ravel(), c="blue", label="Training Data")
ax.scatter(test_predictions.ravel(), test_residuals.ravel(), c="orange", label="Testing Data")
ax.legend()

# The x-axis shows predicted values, so the zero-residual reference line must
# span the range of the predictions rather than the range of the raw labels.
x_min = min(train_predictions.min(), test_predictions.min())
x_max = max(train_predictions.max(), test_predictions.max())
ax.hlines(y=0, xmin=x_min, xmax=x_max)

ax.set_title("Residual Plot")
ax.set_xlabel("Predicted value")
ax.set_ylabel("Residual (predicted - actual)")
plt.show()