In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load dataset
# The file is decoded as ISO-8859-1 instead of pandas' default UTF-8.
csv_path = "IMDb Movies India.csv"
df = pd.read_csv(csv_path, encoding="ISO-8859-1")

# Drop rows where 'Rating' is missing (target variable)
df = df.dropna(subset=['Rating'])

# Convert 'Votes' column to numeric.
# Thousands separators are removed first so a value like "1,086" parses as
# 1086 instead of being coerced to NaN (and later replaced by the median).
# Anything still unparseable becomes NaN via errors='coerce'.
df['Votes'] = pd.to_numeric(
    df['Votes'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
)

# Fill missing values for numerical columns.
# The filled Series is assigned back to the frame: calling
# `df[col].fillna(..., inplace=True)` operates on an intermediate object,
# raises a FutureWarning, and stops updating `df` in pandas 3.0.
df['Votes'] = df['Votes'].fillna(df['Votes'].median())

# Strip the literal " min" suffix before casting the duration to float.
df['Duration'] = df['Duration'].str.replace(' min', '', regex=False).astype(float)
df['Duration'] = df['Duration'].fillna(df['Duration'].median())

# Drop columns that are not used as model features.
df = df.drop(columns=['Name', 'Year'])

# Encode categorical variables
# Each column is encoded with its own LabelEncoder, so the integer codes are
# only meaningful within a single column.
for col in ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']:
    # Missing entries get an explicit 'Unknown' category. The result is
    # assigned back instead of using `df[col].fillna(..., inplace=True)`,
    # which acts on an intermediate object and no longer updates `df` in
    # pandas 3.0.
    df[col] = df[col].fillna('Unknown')
    df[col] = LabelEncoder().fit_transform(df[col])

# Standardise the two numeric feature columns in place on `df`.
# NOTE(review): the scaler is fitted on every row before the train/test
# split, so held-out rows contribute to the fitted statistics. Consider
# fitting on the training split only if `df` is not needed in scaled form.
numeric_features = ['Votes', 'Duration']
scaler = StandardScaler().fit(df[numeric_features])
df[numeric_features] = scaler.transform(df[numeric_features])

# Separate the target from the features, then hold out 20% of the rows for
# evaluation (fixed seed so the split is reproducible).
y = df['Rating']
X = df.drop(columns=['Rating'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training split (seeded for
# reproducibility); `fit` returns the estimator itself.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict ratings for the held-out rows.
y_pred = model.predict(X_test)

# Score the predictions against the true held-out ratings.
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Report each metric to four decimal places.
for label, value in (
    ("Mean Absolute Error", mae),
    ("Mean Squared Error", mse),
    ("R-squared Score", r2),
):
    print(f"{label}: {value:.4f}")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Votes'].fillna(df['Votes'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Duration'].fillna(df['Duration'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

Mean Absolute Error: 0.9917
Mean Squared Error: 1.5559
R-squared Score: 0.1631
