In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings("ignore")

# ---------------------------------------------------------------------------
# 1. Load the raw movie table (file is read with ISO-8859-1 encoding).
# ---------------------------------------------------------------------------
df = pd.read_csv("IMDb Movies India.csv", encoding="ISO-8859-1")

# ---------------------------------------------------------------------------
# 2. Clean: keep only rows where the target and every raw feature is present.
# ---------------------------------------------------------------------------
required_columns = ['Rating', 'Genre', 'Director', 'Votes', 'Actor 1', 'Actor 2', 'Actor 3']
df.dropna(subset=required_columns, inplace=True)

# 'Votes' may contain thousands separators; strip them, then parse.
# Anything that still cannot be parsed becomes NaN and the row is discarded.
votes_without_commas = df['Votes'].astype(str).str.replace(',', '')
df['Votes'] = pd.to_numeric(votes_without_commas, errors='coerce')
df.dropna(subset=['Votes'], inplace=True)

# ---------------------------------------------------------------------------
# 3. Encode the categorical columns as integer codes.
#    One encoder per column; they stay at module level so that new inputs can
#    be transformed with the same mapping later on.
# ---------------------------------------------------------------------------
le_genre, le_director, le_actor1, le_actor2, le_actor3 = (
    LabelEncoder() for _ in range(5)
)

encoding_plan = (
    (le_genre, 'Genre', 'Genre_enc'),
    (le_director, 'Director', 'Director_enc'),
    (le_actor1, 'Actor 1', 'Actor1_enc'),
    (le_actor2, 'Actor 2', 'Actor2_enc'),
    (le_actor3, 'Actor 3', 'Actor3_enc'),
)
for encoder, raw_column, encoded_column in encoding_plan:
    df[encoded_column] = encoder.fit_transform(df[raw_column])

# ---------------------------------------------------------------------------
# 4. Assemble features / target and hold out 20% of the rows for testing.
# ---------------------------------------------------------------------------
feature_columns = ['Genre_enc', 'Director_enc', 'Actor1_enc', 'Actor2_enc', 'Actor3_enc', 'Votes']
X = df[feature_columns]
y = df['Rating']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# ---------------------------------------------------------------------------
# 5. Fit a random forest (fixed seed so results are reproducible).
# ---------------------------------------------------------------------------
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# ---------------------------------------------------------------------------
# 6. Report error and explained variance on the held-out rows.
# ---------------------------------------------------------------------------
y_pred = model.predict(X_test)
print("✅ Model Evaluation")
mse = mean_squared_error(y_test, y_pred)
print("MSE:", round(mse, 2))
r2 = r2_score(y_test, y_pred)
print("R² Score:", round(r2, 2))

# Predict rating based on new inputs
def predict_rating(genre, director, actor1, actor2, actor3, votes):
    """Predict, print and return the rating for one movie.

    The five categorical values are converted to integer codes with the
    module-level encoders (``le_genre``, ``le_director``, ``le_actor1``,
    ``le_actor2``, ``le_actor3``) and, together with ``votes``, passed to the
    module-level ``model``.

    Args:
        genre: Genre label, encoded with ``le_genre``.
        director: Director name, encoded with ``le_director``.
        actor1: First actor name, encoded with ``le_actor1``.
        actor2: Second actor name, encoded with ``le_actor2``.
        actor3: Third actor name, encoded with ``le_actor3``.
        votes: Vote count, used unchanged as the last feature.

    Returns:
        The predicted rating, or ``None`` if encoding or prediction failed.
        Failures are reported on stdout rather than raised, so this helper
        never interrupts an interactive session.
    """
    categorical_inputs = (
        (le_genre, genre),
        (le_director, director),
        (le_actor1, actor1),
        (le_actor2, actor2),
        (le_actor3, actor3),
    )
    try:
        # Feature order must match training: five encoded columns, then votes.
        encoded = [encoder.transform([value])[0] for encoder, value in categorical_inputs]
        row = [encoded + [votes]]

        # If the model recorded column names while fitting, hand the row back
        # with those same names so the estimator can validate them, instead of
        # passing an anonymous nested list.
        feature_names = getattr(model, "feature_names_in_", None)
        if feature_names is not None:
            row = pd.DataFrame(row, columns=feature_names)

        prediction = model.predict(row)[0]
    except Exception as e:
        # Deliberately broad: this is a best-effort, user-facing helper.
        print("❌ Error:", e)
        print("Ensure genre, director, and actor names exist in the dataset.")
        return None

    print(f"🎬 Predicted Rating: {round(prediction, 2)}")
    return prediction

# Example usage: one prediction for a hand-picked cast and crew.
predict_rating(
    genre="Drama",
    director="Anurag Kashyap",
    actor1="Nawazuddin Siddiqui",
    actor2="Radhika Apte",
    actor3="Rajkummar Rao",
    votes=1000,
)

✅ Model Evaluation
MSE: 1.57
R² Score: 0.15
🎬 Predicted Rating: 7.11
