In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

# -------------------- DATASET --------------------
# Hand-written sample data, one row per film:
# (title, genre label, duration, rating).
# NOTE(review): duration is presumably in minutes -- confirm with the data source.
_movie_rows = [
    ("Interstellar", "Sci-Fi", 169, 8.6),
    ("Titanic", "Romance", 195, 7.8),
    ("Avengers", "Action", 181, 8.4),
    ("Inception", "Sci-Fi", 148, 8.8),
    ("Joker", "Drama", 122, 8.5),
    ("Avatar", "Sci-Fi", 162, 7.9),
    ("Gladiator", "Action", 155, 8.5),
    ("Notebook", "Romance", 123, 7.9),
    ("Matrix", "Sci-Fi", 136, 8.7),
    ("Gravity", "Sci-Fi", 91, 7.7),
    ("The Dark Knight", "Action", 152, 9.0),
    ("Iron Man", "Action", 126, 7.9),
    ("Forrest Gump", "Drama", 142, 8.8),
    ("Parasite", "Drama", 132, 8.6),
    ("La La Land", "Romance", 128, 8.0),
    ("Mad Max Fury Road", "Action", 120, 8.1),
    ("The Godfather", "Drama", 175, 9.2),
    ("Shutter Island", "Thriller", 138, 8.2),
    ("The Martian", "Sci-Fi", 144, 8.0),
    ("John Wick", "Action", 101, 7.9),
]
_column_names = ("Movie", "Genre", "Duration", "Rating")

# Transpose the row tuples into the column -> list-of-values mapping that
# the rest of the cell (and pd.DataFrame) works with.
data = {
    column: list(values)
    for column, values in zip(_column_names, zip(*_movie_rows))
}

df = pd.DataFrame(data)

# -------------------- ENCODING --------------------
# Expand the categorical "Genre" column into one indicator column per
# distinct genre (named "Genre_<value>"); all other columns pass through.
df_encoded = pd.get_dummies(df, columns=["Genre"])

# Target: the rating the model should learn to predict.
y = df_encoded["Rating"]

# Features: everything except the title and the target itself, i.e.
# "Duration" plus the genre indicator columns.
X = df_encoded.drop(columns=["Movie", "Rating"])

# -------------------- TRAIN-TEST SPLIT --------------------
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# shuffle, and therefore the reported error, repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# -------------------- MODEL TRAINING --------------------
# fit() returns the estimator itself, so construction and training chain.
model = LinearRegression().fit(X_train, y_train)

# -------------------- MODEL EVALUATION --------------------
# Score the held-out rows only; none of them were seen during fitting.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

print("Mean Squared Error:", mse)

# -------------------- NEW MOVIE PREDICTION --------------------
# Description of the film to score with the fitted model. "Movie" is only a
# label; "Genre" and "Duration" are the inputs the model was trained on.
new_movie = {
    "Movie": "Neo Horizon",
    "Genre": "Sci-Fi",
    "Duration": 150
}

# A genre that never appeared in the training frame has no indicator column
# in X. The reindex below would then silently zero out every genre flag and
# the prediction would ignore the requested genre, so fail loudly instead.
genre_column = f"Genre_{new_movie['Genre']}"
if genre_column not in X.columns:
    known_genres = sorted(
        column[len("Genre_"):]
        for column in X.columns
        if column.startswith("Genre_")
    )
    raise ValueError(
        f"Unknown genre {new_movie['Genre']!r}; "
        f"the model was trained on: {known_genres}"
    )

new_movie_df = pd.DataFrame([new_movie])

# Encode only "Genre", the same way the training frame was encoded, so the
# title is not turned into a throwaway "Movie_<title>" indicator column.
new_movie_encoded = pd.get_dummies(new_movie_df, columns=["Genre"])

# Align with the training feature layout: keep exactly X's columns in X's
# order (this drops "Movie") and fill the other genres' indicators with 0.
new_movie_encoded = new_movie_encoded.reindex(
    columns=X.columns,
    fill_value=0
)

# predict() returns one value per input row; there is a single row here.
new_movie_rating = model.predict(new_movie_encoded)

print(
    "Predicted Rating for New Movie:",
    round(new_movie_rating[0], 2)
)

# -------------------- RECOMMENDATION SYSTEM --------------------
# Score every film in the dataset with the fitted model (this includes the
# rows it was trained on) and store the result, rounded to two decimals,
# as a new column on df.
all_predictions = model.predict(X)
df["Predicted_Rating"] = np.round(all_predictions, 2)

# Highest predicted rating first.
recommended_movies = df.sort_values("Predicted_Rating", ascending=False)

# The first five rows of the ranking are the recommendations; the bare
# expression on the last line makes the notebook render the table.
top_5_movies = recommended_movies.iloc[:5]
top_5_movies


Mean Squared Error: 0.1688576002303194
Predicted Rating for New Movie: 8.35


Unnamed: 0,Movie,Genre,Duration,Rating,Predicted_Rating
16,The Godfather,Drama,175,9.2,9.08
12,Forrest Gump,Drama,142,8.8,8.77
2,Avengers,Action,181,8.4,8.7
13,Parasite,Drama,132,8.6,8.67
1,Titanic,Romance,195,7.8,8.61
