# Import the Necessary Libraries/Packages

In [None]:
# Import the necessary libraries/packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier

# Load the Necessary Data Sets

In [None]:
# Read the three MovieLens files (none has a header row) into DataFrames
rating_columns = ["user_id", "item_id", "rating", "timestamp"]
genre_columns = ["unknown", "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
                 "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery",
                 "Romance", "Sci-Fi", "Thriller", "War", "Western"]
item_columns = ["movie_id", "movie_title", "release_date", "video_release_date", "IMDb_url"] + genre_columns
user_columns = ["u_id", "age", "gender", "occupation", "zip_code"]

# u.data is tab-separated; u.item and u.user are pipe-separated and read as latin-1
full_data = pd.read_csv("u.data", sep="\t", header=None, names=rating_columns)
item_data = pd.read_csv("u.item", sep="|", header=None, names=item_columns, encoding="latin-1")
user_data = pd.read_csv("u.user", sep="|", header=None, names=user_columns, encoding="latin-1")

# Preview the first rows of each frame
print(f"The first 5 rows of u.data:\n {full_data.head()}")
print(" ")
print(f"The first 5 rows of u.item:\n {item_data.head()}")
print(" ")
print(f"The first 5 rows of u.user:\n {user_data.head()}")

# Merge the Data Sets

In [None]:
# Attach the movie metadata to every rating (item_id <-> movie_id) and
# discard the now-duplicated key column
full_item_merged = (
    full_data
    .merge(item_data, left_on="item_id", right_on="movie_id")
    .drop(columns="movie_id")
)

# Attach the user attributes (user_id <-> u_id) and discard the duplicated key column
final_merged = (
    full_item_merged
    .merge(user_data, left_on="user_id", right_on="u_id")
    .drop(columns="u_id")
)

print(final_merged.head())

# Drop the user_id and item_id columns

In [None]:
# Build a copy of the merged frame without the two identifier columns
id_columns = ["user_id", "item_id"]
final_merged_clean = final_merged.drop(columns=id_columns)
print(final_merged_clean.columns)

# Identify Features with Missing Data, and Drop Those Features

In [None]:
# Count the missing values in every column
missing_counts = final_merged_clean.isna().sum()
print(missing_counts)
print(" ")

# Remove release_date, video_release_date, and IMDb_url
incomplete_columns = ["release_date", "video_release_date", "IMDb_url"]
final_merged_clean = final_merged_clean.drop(columns=incomplete_columns)
print(final_merged_clean.columns)

My thought process is that we don't want to sacrifice any ratings/movies/users from the dataset. However, when predicting the ratings using all the input features, we also don't want features with missing values, because those missing values would have to be deleted, imputed, or otherwise dealt with, which could compromise the integrity of the features themselves.

# Add an Intercept/Bias Column

In [None]:
# Append a constant column of ones named "bias" to act as the intercept term
final_merged_clean = final_merged_clean.assign(bias=1)
final_merged_clean.head()

# Gauge Feature Data Types

In [None]:
# Gauge the features' data types: print the dtype pandas holds for every
# column of the cleaned frame
print(final_merged_clean.dtypes)

The one feature I went back and forth on, in terms of keeping it categorical vs. numerical, was zip_code. After doing some research, I decided to keep it categorical because the digits in a zip code are symbolic labels and carry no meaningful numerical value.

Besides, some zip codes contain letters.

# Perform One-Hot Encoding

In [None]:
# Replace each categorical column with integer 0/1 indicator columns,
# leaving out the first level of every category
onehot_movie = pd.get_dummies(
    data=final_merged_clean,
    dtype=int,
    drop_first=True,
)
onehot_movie.head()

# Transform the Data into a NumPy Array

In [None]:
# Turn the encoded data into a NumPy array for ML purposes.
# This cell rebinds onehot_movie from a DataFrame to an ndarray. np.asarray
# accepts either type, so the cell can be executed again without failing
# (an ndarray has no .to_numpy() method).
onehot_movie = np.asarray(onehot_movie)
onehot_movie

# Perform Min-Max Scaling

In [None]:
# Scale all features except rating (column 0) to [0, 1] using Min-Max Scaling
scaler = MinMaxScaler()
features_raw = onehot_movie[:, 1:]
onehot_movie_scale = scaler.fit_transform(features_raw).round(2)

# Min-max scaling maps a constant column to 0, which would turn the
# intercept/bias column of ones into zeros. Restore every constant column
# (zero data range) to its unscaled value.
constant_cols = scaler.data_range_ == 0
onehot_movie_scale[:, constant_cols] = features_raw[:, constant_cols]

# Append the target as the last column. It is taken from column 0 of the same
# array the features were sliced from, so each label stays on the row of its
# own features; full_data is in pre-merge row order, which is not guaranteed
# to match the row order of the merged data.
onehot_movie_scale_final = np.column_stack((onehot_movie_scale, onehot_movie[:, 0]))
onehot_movie_scale_final

# Separate the Data into its Input Features and Target

In [None]:
# The last column of the combined array is the target; every column before it is an input feature
X, y = onehot_movie_scale_final[:, :-1], onehot_movie_scale_final[:, -1]

# Implement a Multilayer Perceptron (MLP)

## Implement a Random Seed

In [None]:
# Fix NumPy's global random state so the randomly drawn weight matrices
# are the same on every run
RANDOM_SEED = 8
np.random.seed(RANDOM_SEED)

## Define the Features, Target, Step Size, Weight Matrices

In [None]:
# X (input features) and y (target) are the arrays produced by the
# feature/target split; they are used as-is.

# Step size used by the gradient-descent updates
eta = 0.1

# Randomly initialise the weight matrices: W1 connects the inputs to 10 hidden
# units and W2 connects the hidden units to the single output. The row count
# of W1 is read from X rather than hard-coded, so the shapes stay consistent
# with however many columns the encoded data has.
W1 = np.random.randn(X.shape[1], 10)
W2 = np.random.randn(10, 1)

## Define the Activation Functions

In [None]:
# Forward pass of the network: ReLU on the hidden layer, sigmoid on the output
def f(x):
    """Return the network output for the input column(s) ``x``.

    ``x`` is multiplied by ``W1.T``, so its first axis must match the rows of
    ``W1``. The module-level ``W1`` and ``W2`` are read at call time, so the
    result always reflects their current values.
    """
    hidden = np.maximum(0, np.dot(W1.T, x))   # ReLU activation
    logits = np.dot(W2.T, hidden)
    return 1 / (1 + np.exp(-logits))          # sigmoid activation

## Monitor Convergence

In [None]:
# NOTE: Best to run on Northeastern Cluster (takes a while to run)
# Keep track of the error after every epoch to monitor convergence
errors = []

# Number of iterations/epochs to run the algorithm for
epochs = 500

# Number of training samples (rows of X); the gradients and the error are
# averaged over it
n = X.shape[0]

# f expects one sample per column, so work with the transposed inputs
X_T = X.T

# NOTE(review): f ends in a sigmoid, so its output lies in (0, 1), while y is
# the rating column. The error below is the cross-entropy form, which assumes
# targets in [0, 1] -- confirm this is the intended objective.

# Each epoch uses whole-matrix products instead of looping over the samples
# one at a time; the sums computed are the same, only far faster.
for epoch in range(epochs):

    # Hidden-layer (ReLU) activations of every sample, one column per sample.
    # W1 only changes at the end of the epoch, so they are computed once.
    H = np.maximum(0, W1.T.dot(X_T))

    # Derivative with respect to w^(2): mean over samples of (f(x) - y) * h
    residual = f(X_T) - y
    dW2 = H.dot(residual.T) / n

    # Update w^(2) using the old value of w^(2) and the current values of h
    W2 = W2 - eta * dW2

    # Derivative with respect to W^(1), evaluated with the updated w^(2):
    # mean over samples of x * ((f(x) - y) * w^(2) * 1[h > 0])^T
    residual = f(X_T) - y
    delta = (residual * W2) * np.heaviside(H, 0)
    dW1 = X_T.dot(delta.T) / n

    # Update W^(1) using the old value of W^(1) and the current values of h
    W1 = W1 - eta * dW1

    # Calculate the error with the updated weights (one forward pass, reused
    # for both terms)
    y_hat = f(X_T)
    e = (1/n) * np.sum((-y*np.log(y_hat)) - ((1 - y)*np.log(1 - y_hat)))
    errors.append(e)

## Display the Final Weight Estimates and Predicted Output

In [None]:
# Display the final estimate of W^(1), the weight matrix applied to the
# inputs to form the hidden layer
print(W1)

In [None]:
# Display the final estimate of w^(2), the weight vector applied to the
# hidden layer to form the output
print(W2)

In [None]:
# Display the final predicted output: the value of f for every row of X
# (X is transposed so that each sample is passed to f as a column)
print(f(X.T))

## Visualize the Gradient Descent Errors vs. Number of Epochs

In [None]:
# Plot a line plot visualizing the gradient descent errors as a function of the number of epochs.
# The x-axis is derived from the number of recorded errors instead of a fixed
# 42: plt.plot raises a ValueError when x and y differ in length, and the
# training loop records one error per epoch.
n_recorded = len(errors)
plt.plot(range(n_recorded), errors, label="line")
plt.xlim(0, n_recorded)
plt.xlabel("Epochs")
plt.ylim(0, 1.5)
plt.ylabel("Gradient Descent Errors")
plt.title("Multilayer Perceptron (MLP) Convergence")
plt.show()

## Predict the Ratings

In [None]:
# Fit scikit-learn's MLP classifier (a single hidden layer of 10 ReLU units)
# on X and y, then predict on the same X
clf = MLPClassifier(
    hidden_layer_sizes=(10,),
    activation="relu",
    learning_rate_init=0.01,
    max_iter=500,
    random_state=12,
)
clf.fit(X, y)
preds = clf.predict(X)
print(preds)

## Evaluate the MLP's Accuracy

In [None]:
# Evaluate the accuracy of the algorithm.
# NOTE: X and y are the same arrays the classifier was fitted on, so this is
# the accuracy on the training data rather than a held-out estimate.
clf.score(X, y)

# Gather the Final Results

In [None]:
# Pair each user id / item id from the merged frame with the rating predicted for that row
data = dict(
    user_id=final_merged["user_id"],
    item_id=final_merged["item_id"],
    predicted_rating=preds,
)

final_df = pd.DataFrame(data)

print(final_df.head())

In [None]:
# Save the resulting DataFrame as a .csv file named mlp_results.csv
# (index=False leaves the row index out of the file)
final_df.to_csv("mlp_results.csv", index=False)

In [None]:
# Read in the .csv file of predicted ratings.
# The path is the one the notebook writes the results to ("mlp_results.csv");
# reading from "data/mlp_results.csv" would load a different file than the one
# produced by this run, or fail if no such file exists.
pred_ratings = pd.read_csv("mlp_results.csv")
pred_ratings.head()

In [None]:
# List the distinct predicted rating values, so they can be compared by eye
# with the rating values that occur in the original MovieLens data
print(pred_ratings["predicted_rating"].unique())

In [None]:
# Plot the predicted ratings vs. original ratings.
# The true ratings are read from final_merged, the frame whose row order the
# predictions follow (the features and the saved user/item ids both derive
# from it). full_data is in pre-merge row order, which is not guaranteed to
# line up row-for-row with the predictions.
plt.figure()
plt.scatter(pred_ratings["predicted_rating"], final_merged["rating"])
plt.xlabel("MLP Predicted Movie Rating")
plt.ylabel("True Movie Rating")
plt.title("MLP Predictions vs. True Values for Movie Rating")
plt.show()