In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from imblearn.under_sampling import RandomUnderSampler


In [2]:
df = pd.read_csv(r"C:\Users\koush\Downloads\IMDb Movies India.csv\IMDb Movies India.csv", encoding='latin1')

In [3]:
print(f"The dataset has {df.shape[0]} rows and {df.shape[1]} columns.")
print("Columns:", df.columns.tolist())

The dataset has 15509 rows and 10 columns.
Columns: ['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']


In [4]:
df = df[['Genre', 'Director', 'Actor 1', 'Year', 'Rating']]

In [5]:
df.dropna(inplace=True)

In [6]:
df = df[df['Rating'].apply(lambda x: str(x).replace('.', '', 1).isdigit())]
df['Rating'] = df['Rating'].astype(float)

In [7]:
df['rating_class'] = df['Rating'].round().astype(int)

In [8]:
print("Distribution of rounded ratings:\n", df['rating_class'].value_counts())

Distribution of rounded ratings:
 6     2285
7     1789
5     1389
4     1014
8      700
3      350
2       92
9       83
10       3
1        2
Name: rating_class, dtype: int64


In [9]:
X = df[['Genre', 'Director', 'Actor 1', 'Year']]
y = df['Rating']
rating_class = df['rating_class']

In [13]:
df['Year'] = df['Year'].astype(str).str.extract(r'(\d{4})').astype(int)

In [14]:
X = df[['Genre', 'Director', 'Actor 1', 'Year']]
X_processed = preprocessor.fit_transform(X)

In [15]:
categorical_cols = ['Genre', 'Director', 'Actor 1']
numeric_cols = ['Year']

preprocessor = ColumnTransformer([
    ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('numeric', SimpleImputer(strategy='mean'), numeric_cols)
])

X_processed = preprocessor.fit_transform(X)


In [17]:
rus = RandomUnderSampler(random_state=42)

In [18]:
X_resampled, rating_class_resampled = rus.fit_resample(X_processed, rating_class)

In [20]:
X_resampled, rating_class_resampled = rus.fit_resample(X_processed, rating_class)

resample_indices = rus.sample_indices_
y_resampled = y.iloc[resample_indices].reset_index(drop=True)


In [21]:


X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42)


In [22]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


RandomForestRegressor(random_state=42)

In [23]:
y_pred = model.predict(X_test)


In [24]:
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

In [25]:
print("\nModel Performance:")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")


Model Performance:
Mean Absolute Error: 3.63
Root Mean Squared Error: 3.98
R² Score: -0.27


In [28]:
print("\nModel Performance:")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")


Model Performance:
Mean Absolute Error: 3.63
Root Mean Squared Error: 3.98
R² Score: -0.27
