In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings("ignore")

# Read the movie table; the file is decoded as ISO-8859-1.
df = pd.read_csv("IMDb Movies India.csv", encoding="ISO-8859-1")

# Discard rows missing the target or any column later used as a feature.
df.dropna(subset=['Rating', 'Genre', 'Director', 'Votes'], inplace=True)

# Strip commas from 'Votes' and parse it as a number in a single step.
# Values that still fail to parse become NaN and those rows are dropped.
df['Votes'] = pd.to_numeric(
    df['Votes'].astype(str).str.replace(',', ''),
    errors='coerce',
)
df.dropna(subset=['Votes'], inplace=True)

# Fit one label encoder per categorical column. The fitted encoders stay at
# module level because predict_rating() reuses them to encode its inputs.
genre_encoder = LabelEncoder().fit(df['Genre'])
director_encoder = LabelEncoder().fit(df['Director'])

# Store the integer codes alongside the original text columns.
df['Genre_enc'] = genre_encoder.transform(df['Genre'])
df['Director_enc'] = director_encoder.transform(df['Director'])

# Feature matrix (column order matters for prediction) and regression target.
X = df.loc[:, ['Genre_enc', 'Director_enc', 'Votes']]
y = df.loc[:, 'Rating']

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a 100-tree random forest and fit it on the training portion.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model = model.fit(X_train, y_train)

# Score the held-out rows and report error and explained variance.
y_pred = model.predict(X_test)
print("✅ Model Evaluation")
mse_value = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", round(mse_value, 3))
r2_value = r2_score(y_test, y_pred)
print("R² Score:", round(r2_value, 3))

# Report each feature's importance, labelled in the same order as X's columns.
feature_importance = model.feature_importances_
print("\n📊 Feature Importance:")
for position, label in enumerate(['Genre', 'Director', 'Votes']):
    print(f"{label}: {round(feature_importance[position], 3)}")

# === Predict by Name === #
def predict_rating(genre_name, director_name, votes):
    """Print and return the predicted rating for a genre/director/votes triple.

    Relies on the module-level ``genre_encoder``, ``director_encoder`` and
    ``model`` objects fitted earlier in this file.

    Args:
        genre_name: Genre label, encoded with ``genre_encoder``.
        director_name: Director label, encoded with ``director_encoder``.
        votes: Vote count supplied to the model as the ``Votes`` feature.

    Returns:
        The predicted rating as a ``float``, or ``None`` when the inputs could
        not be encoded or scored (an explanatory message is printed instead).
    """
    try:
        genre_code = genre_encoder.transform([genre_name])[0]
        director_code = director_encoder.transform([director_name])[0]
        # The model was fitted on a DataFrame with these column names, so
        # score a one-row DataFrame with the same names and order instead of
        # a bare nested list (which triggers a feature-name mismatch warning).
        features = pd.DataFrame(
            [[genre_code, director_code, votes]],
            columns=['Genre_enc', 'Director_enc', 'Votes'],
        )
        prediction = model.predict(features)[0]
    except (ValueError, TypeError) as e:
        # Labels the encoders have not seen and unusable vote values are
        # expected to land here; other exception types (e.g. a NameError
        # from a missing model) propagate instead of being reported as
        # bad input.
        print("\n⚠️ Error:", e)
        print("Ensure the genre and director exist in the dataset.")
        return None

    print(f"\n🎬 Predicted rating for {genre_name} movie by {director_name} with {votes} votes: {round(prediction, 2)}")
    return float(prediction)

# Demo call: a Drama title directed by Karan Johar with 1500 votes.
predict_rating(genre_name="Drama", director_name="Karan Johar", votes=1500)

✅ Model Evaluation
Mean Squared Error: 1.653
R² Score: 0.14

📊 Feature Importance:
Genre: 0.267
Director: 0.381
Votes: 0.353

🎬 Predicted rating for Drama movie by Karan Johar with 1500 votes: 6.32
