# Question 4
Repeat question 3) with LASSO regression. Again, make sure to comment on the hyperparameters you use and betas you find by doing so.

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
import random

# Load the movie ratings dataset
data = pd.read_csv('/Users/lobster/Desktop/Movie Replication Set.csv')

# Load the results_df from the CSV file
results_df = pd.read_csv('/Users/lobster/Desktop/results_df.csv')

# Select 30 movies from the middle of the COD range
mid_range_movies = results_df.sort_values('COD').iloc[185:215]['Target Movie']

# Initialize the DataFrame to store RMSE for each movie
lasso_rmse_df = pd.DataFrame(columns=['Movie', 'RMSE', 'Best Alpha'])

# Select 10 random movies to use as predictors (outside the loop)
all_movies = list(set(data.columns) - set(mid_range_movies))
predictor_movies = random.sample(all_movies, 10)

# Iterate over the 30 movies
for movie in mid_range_movies:
    # Prepare the data with the fixed predictor movies
    X = data[predictor_movies]
    y = data[movie]

    # Handle missing values in X
    imputer = SimpleImputer(strategy='mean')
    X_imputed = imputer.fit_transform(X)

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

    # LASSO regression with hyperparameter tuning
    parameters = {'alpha': [0.01, 0.1, 1, 10, 100]}
    lasso = Lasso()
    clf = GridSearchCV(lasso, parameters, scoring='neg_mean_squared_error')
    clf.fit(X_train, y_train)

    # Best model
    best_alpha = clf.best_params_['alpha']
    best_model = clf.best_estimator_

    # Predict and calculate RMSE
    y_pred = best_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Append to DataFrame
    lasso_rmse_df = pd.concat([lasso_rmse_df, pd.DataFrame({'Movie': [movie], 'RMSE': [rmse], 'Best Alpha': [best_alpha]})], ignore_index=True)

# Print the results
print(lasso_rmse_df)


                                     Movie      RMSE  Best Alpha
0                             Rocky (1976)  0.537241        0.01
1                          Big Fish (2003)  0.357772        0.01
2                       Taxi Driver (1976)  0.348372        0.01
3                        Braveheart (1995)  0.386293        0.01
4               Lost in Translation (2003)  0.396276        0.01
5                        Crossroads (2002)  0.305125        0.01
6                          Magnolia (1999)  0.380668        0.01
7                  Just Like Heaven (2005)  0.352385        0.01
8             Gone in Sixty Seconds (2000)  0.320425        0.01
9                        Armageddon (1998)  0.400121        0.01
10                       The Others (2001)  0.398531        0.01
11                        Cast Away (2000)  0.372031        0.01
12                         The Mist (2007)  0.353578        0.01
13                            Honey (2003)  0.399347        0.01
14                    Bab