# Question 3
 Pick 30 movies in the middle of the COD range, as identified by question 1 (that were not used in question 2). Now build a regularized regression model with the ratings from 10 other movies (picked randomly, or deliberately by you) as an input. Please use ridge regression, and make sure to do suitable hyperparameter tuning. Also make sure to report the RMSE for each of these 30 movies in a table, after doing an 80/20 train/test split. Comment on the hyperparameters you use and betas you find by doing so.

In [8]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
import random

# Load the movie ratings dataset
data = pd.read_csv('/Users/lobster/Desktop/Movie Replication Set.csv')

# Load the results_df from the CSV file
results_df = pd.read_csv('/Users/lobster/Desktop/results_df.csv')

# Select 30 movies from the middle of the COD range
mid_range_movies = results_df.sort_values('COD').iloc[185:215]['Target Movie']

# Initialize the DataFrame to store RMSE for each movie
rmse_df = pd.DataFrame(columns=['Movie', 'RMSE', 'Best Alpha'])

# Select 10 random movies to use as predictors (outside the loop)
all_movies = list(set(data.columns))
predictor_movies = random.sample(all_movies, 10)

# Iterate over the 30 movies
for movie in mid_range_movies:
    # Prepare the data with the fixed predictor movies
    X = data[predictor_movies]
    y = data[movie]

    # Handle missing values in X
    imputer = SimpleImputer(strategy='mean') # or 'median', 'most_frequent', etc.
    X_imputed = imputer.fit_transform(X)

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

    # Ridge regression with hyperparameter tuning
    parameters = {'alpha': [0.01, 0.1, 1, 10, 100]}
    ridge = Ridge()
    clf = GridSearchCV(ridge, parameters, scoring='neg_mean_squared_error')
    clf.fit(X_train, y_train)

    # Best model
    best_alpha = clf.best_params_['alpha']
    best_model = clf.best_estimator_

    # Predict and calculate RMSE
    y_pred = best_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Append to DataFrame
    rmse_df = pd.concat([rmse_df, pd.DataFrame({'Movie': [movie], 'RMSE': [rmse], 'Best Alpha': [best_alpha]})], ignore_index=True)

# Print the results
print(rmse_df)


                                     Movie      RMSE Best Alpha
0                             Rocky (1976)  0.561674         10
1                          Big Fish (2003)  0.359195         10
2                       Taxi Driver (1976)  0.400831         10
3                        Braveheart (1995)  0.429668        100
4               Lost in Translation (2003)  0.446424          1
5                        Crossroads (2002)  0.343732         10
6                          Magnolia (1999)  0.399788         10
7                  Just Like Heaven (2005)  0.357410         10
8             Gone in Sixty Seconds (2000)  0.336190         10
9                        Armageddon (1998)  0.407875        100
10                       The Others (2001)  0.347105         10
11                        Cast Away (2000)  0.387006        100
12                         The Mist (2007)  0.392115         10
13                            Honey (2003)  0.437530         10
14                    Baby Geniuses (199