In [2]:
file_path = "/content/IMDbMoviesIndia.csv"

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import scipy.sparse

df = pd.read_csv('IMDbMoviesIndia.csv', encoding='latin1')

print("Original DataFrame head:")
display(df.head())
print("\nOriginal DataFrame info:")
df.info()

df['Genre'].fillna('', inplace=True)
df['Director'].fillna('', inplace=True)
df['Actor 1'].fillna('', inplace=True)
df['Actor 2'].fillna('', inplace=True)
df['Actor 3'].fillna('', inplace=True)

df.dropna(subset=['Rating'], inplace=True)

print(f"\nDataFrame after handling missing values. Shape: {df.shape}")

df['cleaned_genre'] = df['Genre'].apply(lambda x: ' '.join(genre.strip() for genre in x.split(',')))
genre_vectorizer = CountVectorizer(tokenizer=lambda x: x.split(' '), token_pattern=None)
genre_features = genre_vectorizer.fit_transform(df['cleaned_genre'])
print(f"\nShape of genre features: {genre_features.shape}")

director_encoder = LabelEncoder()
df['Director_encoded'] = director_encoder.fit_transform(df['Director'])
director_features_sparse = scipy.sparse.csr_matrix(df[['Director_encoded']].values)
print(f"Shape of director features: {director_features_sparse.shape}")

df['all_actors'] = df['Actor 1'] + ',' + df['Actor 2'] + ',' + df['Actor 3']
df['all_actors'] = df['all_actors'].apply(lambda x: ','.join(filter(None, [a.strip() for a in x.split(',')])))
df['all_actors'] = df['all_actors'].apply(lambda x: ' '.join(actor.strip() for actor in x.split(',')))

actor_vectorizer = CountVectorizer(tokenizer=lambda x: x.split(' '), token_pattern=None)
actor_features = actor_vectorizer.fit_transform(df['all_actors'])
print(f"Shape of actor features: {actor_features.shape}")

X_combined = scipy.sparse.hstack([genre_features, director_features_sparse, actor_features])
y = df['Rating']

print(f"\nShape of combined feature matrix (X_combined): {X_combined.shape}")
print(f"Shape of target variable (y): {y.shape}")

X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

print(f"\nTraining set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

print("\nTraining RandomForestRegressor...")
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)
print("Model training complete.")

y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("\n--- Model Evaluation ---")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R2): {r2:.4f}")

print("\nThis R-squared value indicates how well the model explains the variability of the target variable. A higher R-squared (closer to 1) suggests a better fit.")


Original DataFrame head:


Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali



Original DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB

DataFrame after handling missing values. Shape: (7919, 10)

Shape of genre features: (7919, 23)
Shape of director features: (7919, 1)
Shape of actor features: (7919, 5581)

Shape of combined feature matrix (X_combined): (7919, 5605)
Shape of target variable (y): (7919,)

Training set size: 6335 samples
Testing set size: 1584 samples

Training

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Genre'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Director'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves a

Model training complete.

--- Model Evaluation ---
Mean Absolute Error (MAE): 0.9251
Mean Squared Error (MSE): 1.4338
Root Mean Squared Error (RMSE): 1.1974
R-squared (R2): 0.2288

This R-squared value indicates how well the model explains the variability of the target variable. A higher R-squared (closer to 1) suggests a better fit.
