In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
import numpy as np
import re # For cleaning text data

# --- 1. Load the Dataset ---
# The file is decoded as ISO-8859-1 (presumably because it is not valid
# UTF-8 -- confirm against the actual CSV).
try:
    # Keep only the read inside `try` so the handler cannot mask other errors.
    df = pd.read_csv('IMDb Movies India.csv', encoding='ISO-8859-1')
except FileNotFoundError:
    print("Error: 'IMDb Movies India.csv' not found. Please make sure the file is in the correct directory.")
    # `exit()` is a convenience injected by the `site` module for interactive
    # sessions and, called without arguments, reports success (status 0).
    # Raising SystemExit works everywhere and signals the failure to callers.
    raise SystemExit(1)
else:
    print("Dataset loaded successfully!")


# --- 2. Clean and Preprocess the Data ---
print("\n--- Data Cleaning and Preprocessing ---")

# Drop rows where any column the model relies on is missing.
# A model can't predict a rating if it doesn't have one to learn from.
# 'Votes' is part of the subset because it is cast to int below, and a
# missing value cannot be converted to an integer.
df.dropna(
    subset=['Rating', 'Director', 'Genre', 'Actor 1', 'Actor 2', 'Actor 3', 'Duration', 'Year', 'Votes'],
    inplace=True,
)

# Clean the 'Year' column: pull out the four-digit year and convert to a number.
# e.g., '(2019)' becomes 2019
# expand=False makes `extract` return a Series rather than a one-column DataFrame.
df['Year'] = df['Year'].astype(str).str.extract(r'(\d{4})', expand=False).astype(int)

# Clean the 'Duration' column: remove 'min' and convert to a number.
# e.g., '109 min' becomes 109
# regex=False pins literal matching regardless of the installed pandas default.
df['Duration'] = df['Duration'].str.replace(' min', '', regex=False).astype(int)

# Clean the 'Votes' column: remove thousands separators and convert to a number.
# e.g., '1,086' becomes 1086
df['Votes'] = df['Votes'].str.replace(',', '', regex=False).astype(int)


# One LabelEncoder per categorical column. They stay as separate module-level
# objects because the example prediction at the end of the script reuses them
# to encode its input.
le_genre, le_director, le_actor1, le_actor2, le_actor3 = (LabelEncoder() for _ in range(5))

# (source column, encoded column, encoder) -- processed in this order so the
# new columns are appended to the frame in a stable sequence.
for source_column, encoded_column, encoder in (
    ('Genre', 'Genre_Encoded', le_genre),
    ('Director', 'Director_Encoded', le_director),
    ('Actor 1', 'Actor1_Encoded', le_actor1),
    ('Actor 2', 'Actor2_Encoded', le_actor2),
    ('Actor 3', 'Actor3_Encoded', le_actor3),
):
    # fit_transform learns the distinct labels and maps each to an integer code.
    df[encoded_column] = encoder.fit_transform(df[source_column])


print("Data cleaning and preprocessing complete.")
print("Processed data preview:")
# Show the cleaned numeric columns next to a sample of the encoded ones.
print(
    df[
        ['Year', 'Duration', 'Votes', 'Rating', 'Genre_Encoded', 'Director_Encoded', 'Actor1_Encoded']
    ].head()
)


# --- 3. Define Features (X) and Target (y) ---
# `features` lists the model inputs: the cleaned numeric columns followed by
# the integer-encoded categorical columns. `target` is the column to predict.
features = [
    'Year',
    'Duration',
    'Votes',
    'Genre_Encoded',
    'Director_Encoded',
    'Actor1_Encoded',
    'Actor2_Encoded',
    'Actor3_Encoded',
]
target = 'Rating'

X, y = df[features], df[target]


# --- 4. Split the Data into Training and Testing Sets ---
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"\nData split into training ({len(X_train)} rows) and testing ({len(X_test)} rows) sets.")


# --- 5. Build and Train the Model ---
# A random forest regressor is used here as a reasonable default that needs
# little tuning.
print("\n--- Model Training ---")

# n_estimators: number of trees in the forest.
# random_state: fixed seed so repeated runs give the same forest.
# n_jobs=-1:    let scikit-learn use all available processors.
# `fit` returns the estimator itself, so construction and training are chained.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)
print("Model training complete!")


# --- 6. Evaluate the Model ---
print("\n--- Model Evaluation ---")
# Make predictions on the held-out test set
y_pred = model.predict(X_test)

# MAE: average absolute gap between predicted and actual rating.
# R²:  share of rating variance captured by the model; it is <= 0 when the
#      model is no better than always predicting the mean rating.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R-squared (R²): {r2:.2f}")
print(f"\nInterpretation: On average, the model's rating prediction is off by about {mae:.2f} stars.")

# Describe the fit based on the score actually obtained instead of always
# declaring it strong. The 0.7 / 0.4 cut-offs are rule-of-thumb thresholds,
# not a statistical standard.
if r2 <= 0:
    # A non-positive R² cannot be read as a percentage of variance explained.
    print("The model explains none of the variance in movie ratings; "
          "it does no better than always predicting the mean rating.")
else:
    if r2 >= 0.7:
        verdict = "a strong result!"
    elif r2 >= 0.4:
        verdict = "a moderate result."
    else:
        verdict = "a weak result."
    print(f"The model explains {r2:.0%} of the variance in movie ratings, which is {verdict}")


# --- 7. Demonstrate with a Prediction ---
# Score one hypothetical movie. Its categorical fields are encoded with the
# encoders fitted earlier on the full cleaned dataset, so every label used
# here must already exist in that data; an unseen label makes `transform`
# raise ValueError, which is reported below instead of crashing the script.
print("\n--- Example Prediction ---")
try:
    # Raw details of the movie, one single-element list per column.
    hypothetical_movie = {
        'Year': [2023],
        'Duration': [150],
        'Votes': [50000],
        'Genre': ['Action, Adventure, Sci-Fi'],
        'Director': ['S.S. Rajamouli'],
        'Actor 1': ['Prabhas'],
        'Actor 2': ['Deepika Padukone'],
        'Actor 3': ['Amitabh Bachchan']
    }
    movie_df = pd.DataFrame(hypothetical_movie)

    # Map each categorical column to its integer code using the matching
    # already-fitted encoder.
    for raw_column, encoded_column, fitted_encoder in (
        ('Genre', 'Genre_Encoded', le_genre),
        ('Director', 'Director_Encoded', le_director),
        ('Actor 1', 'Actor1_Encoded', le_actor1),
        ('Actor 2', 'Actor2_Encoded', le_actor2),
        ('Actor 3', 'Actor3_Encoded', le_actor3),
    ):
        movie_df[encoded_column] = fitted_encoder.transform(movie_df[raw_column])

    # Keep only the model's input columns, in the order used for training.
    movie_features = movie_df[features]

    # The model returns one prediction per row; there is a single row here.
    predicted_rating = model.predict(movie_features)
    print(f"Predicted Rating for the hypothetical movie: {predicted_rating[0]:.1f}/10")

except ValueError as err:
    print("Could not make a prediction for the hypothetical movie.")
    print("This is likely because one of the actors, the director, or the genre in the example is not in the training data.")
    print(f"Error details: {err}")