In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Load dataset (replace 'file_path' with actual path in Colab)
file_path = "IMDb Movies India.csv"
df = pd.read_csv(file_path, encoding="latin1")

In [None]:
# Display basic info and first few rows
df.info(), df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


(None,
                                  Name    Year Duration            Genre  \
 0                                         NaN      NaN            Drama   
 1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
 2                         #Homecoming  (2021)   90 min   Drama, Musical   
 3                             #Yaaram  (2019)  110 min  Comedy, Romance   
 4                   ...And Once Again  (2010)  105 min            Drama   
 
    Rating Votes            Director       Actor 1             Actor 2  \
 0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
 1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
 2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
 3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
 4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   
 
            Actor 3  
 0  Rajendra Bhatia  
 1    Arvind Jangid  
 2       Roy Angana  
 

In [None]:
# Data Cleaning & Preprocessing
# Extract year as numeric

df["Year"] = df["Year"].str.extract("(\\d+)").astype("float")
# Convert 'Votes' and 'Duration' to numeric
df["Votes"] = pd.to_numeric(df["Votes"], errors="coerce")
df["Duration"] = df["Duration"].str.replace(" min", "", regex=False).astype("float")

In [None]:
# Fill missing categorical values with 'Unknown'
categorical_cols = ["Genre", "Director", "Actor 1", "Actor 2", "Actor 3"]
df[categorical_cols] = df[categorical_cols].fillna("Unknown")

In [None]:
# Fill missing numeric values with median
numeric_cols = ["Year", "Duration", "Rating", "Votes"]
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

In [None]:
# Display cleaned data info
df.info(), df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      15509 non-null  float64
 2   Duration  15509 non-null  float64
 3   Genre     15509 non-null  object 
 4   Rating    15509 non-null  float64
 5   Votes     15509 non-null  float64
 6   Director  15509 non-null  object 
 7   Actor 1   15509 non-null  object 
 8   Actor 2   15509 non-null  object 
 9   Actor 3   15509 non-null  object 
dtypes: float64(4), object(6)
memory usage: 1.2+ MB


(None,
                                  Name    Year  Duration            Genre  \
 0                                      1991.0     131.0            Drama   
 1  #Gadhvi (He thought he was Gandhi)  2019.0     109.0            Drama   
 2                         #Homecoming  2021.0      90.0   Drama, Musical   
 3                             #Yaaram  2019.0     110.0  Comedy, Romance   
 4                   ...And Once Again  2010.0     105.0            Drama   
 
    Rating  Votes            Director       Actor 1             Actor 2  \
 0     6.0   35.0       J.S. Randhawa      Manmauji              Birbal   
 1     7.0    8.0       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
 2     6.0   35.0  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
 3     4.4   35.0          Ovais Khan       Prateik          Ishita Raj   
 4     6.0   35.0        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   
 
            Actor 3  
 0  Rajendra Bhatia  
 1    Arvind Jangid  
 2       Ro

In [None]:
# Feature Engineering
# Compute Director Success Rate (average rating of director's previous movies)
director_avg_rating = df.groupby("Director")["Rating"].mean().to_dict()
df["Director_Success_Rate"] = df["Director"].map(director_avg_rating)


In [None]:
# Compute Average Rating of Similar Movies (based on Genre)
genre_avg_rating = df.groupby("Genre")["Rating"].mean().to_dict()
df["Genre_Avg_Rating"] = df["Genre"].map(genre_avg_rating)

In [None]:
# Encode categorical features
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [None]:
# Define features and target variable
X = df.drop(columns=["Name", "Rating"])  # Exclude 'Name' and target variable
y = df["Rating"]

In [None]:
# Display updated dataset
df.info(), df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Name                   15509 non-null  object 
 1   Year                   15509 non-null  float64
 2   Duration               15509 non-null  float64
 3   Genre                  15509 non-null  int64  
 4   Rating                 15509 non-null  float64
 5   Votes                  15509 non-null  float64
 6   Director               15509 non-null  int64  
 7   Actor 1                15509 non-null  int64  
 8   Actor 2                15509 non-null  int64  
 9   Actor 3                15509 non-null  int64  
 10  Director_Success_Rate  15509 non-null  float64
 11  Genre_Avg_Rating       15509 non-null  float64
dtypes: float64(6), int64(5), object(1)
memory usage: 1.4+ MB


(None,
                                  Name    Year  Duration  Genre  Rating  Votes  \
 0                                      1991.0     131.0    299     6.0   35.0   
 1  #Gadhvi (He thought he was Gandhi)  2019.0     109.0    299     7.0    8.0   
 2                         #Homecoming  2021.0      90.0    351     6.0   35.0   
 3                             #Yaaram  2019.0     110.0    228     4.4   35.0   
 4                   ...And Once Again  2010.0     105.0    299     6.0   35.0   
 
    Director  Actor 1  Actor 2  Actor 3  Director_Success_Rate  \
 0      1926     2250      800     3108               5.850000   
 1      1548     3280     4791      527               7.000000   
 2      5123     3713     2866     3450               6.000000   
 3      3319     2917     1504     4020               4.400000   
 4       385     3112     3462      405               6.285714   
 
    Genre_Avg_Rating  
 0          6.149065  
 1          6.149065  
 2          6.366667  
 3       

In [None]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Train Initial Model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [None]:
# Make predictions
y_pred = model.predict(X_test)

In [None]:
# Evaluate Initial Model
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
r2 = r2_score(y_test, y_pred)
print(f"Initial Model - MAE: {mae:.3f}, RMSE: {rmse:.3f}, R2: {r2:.3f}")

Initial Model - MAE: 0.329, RMSE: 0.603, R2: 0.626


In [None]:
# Hyperparameter Tuning
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=3, n_jobs=-1, scoring="r2")
grid_search.fit(X_train, y_train)

In [None]:
# Train Best Model
best_params = grid_search.best_params_
best_model = RandomForestRegressor(**best_params, random_state=42)
best_model.fit(X_train, y_train)

In [None]:
# Make Predictions with Best Model
y_pred_best = best_model.predict(X_test)

In [None]:
# Evaluate Improved Model
mae_best = mean_absolute_error(y_test, y_pred_best)
rmse_best = mean_squared_error(y_test, y_pred_best) ** 0.5
r2_best = r2_score(y_test, y_pred_best)
print(f"Tuned Model - MAE: {mae_best:.3f}, RMSE: {rmse_best:.3f}, R2: {r2_best:.3f}")

Tuned Model - MAE: 0.325, RMSE: 0.600, R2: 0.630
