In [85]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import re

In [111]:
try:
    df = pd.read_csv('C:/Users/Akash/OneDrive - Erin.N.Nagarvala Day School/Desktop/jupyter notebook/IMDb Movies India.csv', encoding='latin1')
    print("Dataset loaded successfully!")
except FileNotFoundError:
    print("Error: 'IMDb Movies India.csv' not found. Please ensure the file is uploaded.")
    exit()

Dataset loaded successfully!


In [87]:
print("\n--- Original Dataset Head ---")
print(df.head())
print("\n--- Dataset Info ---")
df.info()
print("\n--- Missing Values Before Preprocessing ---")
print(df.isnull().sum())


--- Original Dataset Head ---
                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhatia  
1    Arvind Jangid  
2       Roy Anga

In [88]:
df.dropna(subset=['Rating'], inplace=True)
print(f"\nRows after dropping NaN 'Rating': {df.shape[0]}")


Rows after dropping NaN 'Rating': 7919


In [89]:
df['Director'] = df['Director'].fillna('Unknown')
df['Actor 1'] = df['Actor 1'].fillna('Unknown')
df['Actor 2'] = df['Actor 2'].fillna('Unknown')
df['Actor 3'] = df['Actor 3'].fillna('Unknown')
df['Genre'] = df['Genre'].fillna('Unknown')

In [90]:
df['Year'] = df['Year'].astype(str).str.extract(r'\((\d{4})\)').astype(float)
df.dropna(subset=['Year'], inplace=True) 
df['Year'] = df['Year'].astype(int)

In [91]:
df['Duration'] = df['Duration'].astype(str).str.replace(' min', '', regex=False)
df['Duration'] = pd.to_numeric(df['Duration'], errors='coerce')
df.dropna(subset=['Duration'], inplace=True) 
df['Duration'] = df['Duration'].astype(int)

In [92]:
df['Votes'] = df['Votes'].astype(str).str.replace(',', '', regex=False)
df['Votes'] = pd.to_numeric(df['Votes'], errors='coerce')
df.dropna(subset=['Votes'], inplace=True)
df['Votes'] = df['Votes'].astype(int)

In [93]:
all_genres = set()
for genres_str in df['Genre']:
    
    for genre in genres_str.split(','):
        all_genres.add(genre.strip())


In [94]:
for genre in sorted(list(all_genres)): # Sort for consistent column order
    df[f'Genre_{genre}'] = df['Genre'].apply(lambda x: 1 if genre in x.split(',') else 0)


In [70]:
print("\n--- Missing Values After Initial Preprocessing ---")
print(df.isnull().sum())
print("\n--- Preprocessed Dataset Head ---")
print(df.head())
print("\n--- Preprocessed Dataset Info ---")
df.info()


--- Missing Values After Initial Preprocessing ---
Year                 0
Duration             0
Rating               0
Votes                0
Director             0
Actor 1              0
Actor 2              0
Actor 3              0
Genre_Action         0
Genre_Adventure      0
Genre_Animation      0
Genre_Biography      0
Genre_Comedy         0
Genre_Crime          0
Genre_Documentary    0
Genre_Drama          0
Genre_Family         0
Genre_Fantasy        0
Genre_History        0
Genre_Horror         0
Genre_Music          0
Genre_Musical        0
Genre_Mystery        0
Genre_News           0
Genre_Romance        0
Genre_Sci-Fi         0
Genre_Sport          0
Genre_Thriller       0
Genre_Unknown        0
Genre_War            0
Genre_Western        0
dtype: int64

--- Preprocessed Dataset Head ---
   Year  Duration  Rating  Votes        Director          Actor 1  \
1  2019       109     7.0      8   Gaurav Bakshi     Rasika Dugal   
3  2019       110     4.4     35      Ovais Khan 

In [95]:
df.drop('Genre', axis=1, inplace=True)
df.drop('Name', axis=1, inplace=True)

print("\n--- Missing Values After Initial Preprocessing ---")
print(df.isnull().sum())
print("\n--- Preprocessed Dataset Head ---")
print(df.head())
print("\n--- Preprocessed Dataset Info ---")
df.info()


--- Missing Values After Initial Preprocessing ---
Year                 0
Duration             0
Rating               0
Votes                0
Director             0
Actor 1              0
Actor 2              0
Actor 3              0
Genre_Action         0
Genre_Adventure      0
Genre_Animation      0
Genre_Biography      0
Genre_Comedy         0
Genre_Crime          0
Genre_Documentary    0
Genre_Drama          0
Genre_Family         0
Genre_Fantasy        0
Genre_History        0
Genre_Horror         0
Genre_Music          0
Genre_Musical        0
Genre_Mystery        0
Genre_News           0
Genre_Romance        0
Genre_Sci-Fi         0
Genre_Sport          0
Genre_Thriller       0
Genre_Unknown        0
Genre_War            0
Genre_Western        0
dtype: int64

--- Preprocessed Dataset Head ---
   Year  Duration  Rating  Votes        Director          Actor 1  \
1  2019       109     7.0      8   Gaurav Bakshi     Rasika Dugal   
3  2019       110     4.4     35      Ovais Khan 

In [96]:
X = df.drop('Rating', axis=1)
y = df['Rating']

numerical_features = ['Year', 'Duration', 'Votes']
categorical_label_encode_features = ['Director', 'Actor 1', 'Actor 2', 'Actor 3']
genre_features = [col for col in X.columns if col.startswith('Genre_')]

In [97]:
preprocessor = ColumnTransformer(
    transformers=[
        ('label_encode', Pipeline([
            ('encoder', LabelEncoder())
        ]), categorical_label_encode_features),
        ('pass_through_num', 'passthrough', numerical_features),
        ('pass_through_genre', 'passthrough', genre_features)
    ],
    remainder='passthrough' 
)

In [99]:
for col in categorical_label_encode_features:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])


In [100]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTraining features shape: {X_train.shape}")
print(f"Testing features shape: {X_test.shape}")
print(f"Training target shape: {y_train.shape}")
print(f"Testing target shape: {y_test.shape}")



Training features shape: (4680, 30)
Testing features shape: (1171, 30)
Training target shape: (4680,)
Testing target shape: (1171,)


In [102]:
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1) # n_jobs=-1 uses all available cores
model.fit(X_train, y_train)

print("\n--- Model Training Complete ---")
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\nMean Absolute Error (MAE): {mae:.4f}")
print(f"R-squared (R2) Score: {r2:.4f}")



--- Model Training Complete ---

Mean Absolute Error (MAE): 0.8488
R-squared (R2) Score: 0.3143


In [103]:
print("\n--- Example Prediction for a New Movie ---")
director_le = LabelEncoder()
actor1_le = LabelEncoder()
actor2_le = LabelEncoder()
actor3_le = LabelEncoder()

director_le.fit(df['Director'])
actor1_le.fit(df['Actor 1'])
actor2_le.fit(df['Actor 2'])
actor3_le.fit(df['Actor 3'])



--- Example Prediction for a New Movie ---


In [104]:
new_movie_values = {
    'Year': 2023,
    'Duration': 150,
    'Votes': 1000,
    'Director': director_le.transform(['Rajkumar Hirani'])[0] if 'Rajkumar Hirani' in director_le.classes_ else director_le.transform(['Unknown'])[0],
    'Actor 1': actor1_le.transform(['Shah Rukh Khan'])[0] if 'Shah Rukh Khan' in actor1_le.classes_ else actor1_le.transform(['Unknown'])[0],
    'Actor 2': actor2_le.transform(['Kareena Kapoor'])[0] if 'Kareena Kapoor' in actor2_le.classes_ else actor2_le.transform(['Unknown'])[0],
    'Actor 3': actor3_le.transform(['Boman Irani'])[0] if 'Boman Irani' in actor3_le.classes_ else actor3_le.transform(['Unknown'])[0],
}


In [106]:
for genre_col in genre_features:
    new_movie_values[genre_col] = 0


new_movie_values['Genre_Action'] = 1
new_movie_values['Genre_Drama'] = 1

In [107]:
single_movie_row = []
for col in X_train.columns:
    single_movie_row.append(new_movie_values.get(col, 0)) 


In [108]:
new_movie_data = pd.DataFrame([single_movie_row], columns=X_train.columns)
new_movie_data['Year'] = new_movie_data['Year'].astype(int)
new_movie_data['Duration'] = new_movie_data['Duration'].astype(int)
new_movie_data['Votes'] = new_movie_data['Votes'].astype(int)
for col in categorical_label_encode_features:
    new_movie_data[col] = new_movie_data[col].astype(int)

print(f"New Movie Data:\n{new_movie_data}")


New Movie Data:
   Year  Duration  Votes  Director  Actor 1  Actor 2  Actor 3  Genre_Action  \
0  2023       150   1000      1685     1624      871      474             1   

   Genre_Adventure  Genre_Animation  ...  Genre_Musical  Genre_Mystery  \
0                0                0  ...              0              0   

   Genre_News  Genre_Romance  Genre_Sci-Fi  Genre_Sport  Genre_Thriller  \
0           0              0             0            0               0   

   Genre_Unknown  Genre_War  Genre_Western  
0              0          0              0  

[1 rows x 30 columns]


In [110]:

predicted_rating = model.predict(new_movie_data)

print(f"Predicted Rating for the new movie: {predicted_rating[0]:.2f}")

Predicted Rating for the new movie: 5.99
