# 🎬 **IMDb Movies India Rating Prediction Project** 🎥

In [187]:
import numpy as np
import pandas as pd
import seaborn as sb
import plotly.express as px
import matplotlib.pyplot as mpl
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [188]:
# Load the dataset from the CSV file; latin1 is requested explicitly instead
# of relying on pandas' default UTF-8 decoding
DATA_PATH = '/content/IMDb Movies India.csv'
df = pd.read_csv(DATA_PATH, encoding='latin1')

In [189]:
# Summary statistics: describe() reports only numeric columns by default,
# so at this point only 'Rating' shows up (the other columns are still text)
df.describe()

Unnamed: 0,Rating
count,7919.0
mean,5.841621
std,1.381777
min,1.1
25%,4.9
50%,6.0
75%,6.8
max,10.0


In [190]:
# Data types: every column except 'Rating' is loaded as a generic object
# column, including the numeric-looking Year, Duration and Votes
df.dtypes

Name         object
Year         object
Duration     object
Genre        object
Rating      float64
Votes        object
Director     object
Actor 1      object
Actor 2      object
Actor 3      object
dtype: object

In [191]:
# Missing values: number of NaN entries in each column
df.isnull().sum()

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [192]:
# Shape of the dataset as (rows, columns), before any cleaning
df.shape

(15509, 10)

In [193]:
# Discard every row that has at least one missing field (df is modified in place)
df.dropna(axis=0, how='any', inplace=True)

# Sanity check: total number of NaN cells left in the frame, expected to be 0
df.isna().sum().sum()

0

In [194]:
# Shape as (rows, columns) after dropping rows with missing values
df.shape

(5659, 10)

In [195]:
# Convert 'Duration' to numeric: keep the first run of digits in each value.
# The pattern is a raw string so that "\d" is not treated as a (deprecated)
# string escape, and expand=False makes extract() return a Series for the
# single capture group instead of a one-column DataFrame.
df['Duration'] = df['Duration'].str.extract(r'(\d+)', expand=False)
# Values in which no digits were found are NaN at this point; coerce keeps
# them as NaN rather than raising
df['Duration'] = pd.to_numeric(df['Duration'], errors='coerce')

In [196]:
# Clean and convert 'Year' to numeric: keep the first run of digits in each value.
# The pattern is a raw string so that "\d" is not treated as a (deprecated)
# string escape, and expand=False makes extract() return a Series for the
# single capture group instead of a one-column DataFrame.
df['Year'] = df['Year'].str.extract(r'(\d+)', expand=False)
# Values in which no digits were found are NaN at this point; coerce keeps
# them as NaN rather than raising
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

In [197]:
# Convert 'Votes' to numeric: strip the thousands separators, then parse.
# errors='coerce' turns any value that still is not a number into NaN so that
# the row is dropped below, instead of aborting the whole cell the way a
# direct astype(int) does. astype(str) keeps the cell re-runnable once the
# column is already numeric.
votes_text = df['Votes'].astype(str).str.replace(',', '', regex=False)
df['Votes'] = pd.to_numeric(votes_text, errors='coerce')

# Drop rows with NaN values after conversion (Duration, Year or Votes)
df.dropna(inplace=True)

# With the NaNs gone the column can hold plain integers
df['Votes'] = df['Votes'].astype(int)

In [198]:
# Split each 'Genre' string on ", " into one column per listed genre
genres = df['Genre'].str.split(pat=', ', expand=True)

In [199]:
# Count occurrences of each genre
genre_counts = {}
for genre in genres.values.flatten():
    if genre is not None:
        if genre in genre_counts:
            genre_counts[genre] += 1
        else:
            genre_counts[genre] = 1

In [200]:
# Rebuild the tally with its keys in alphabetical order
genre_counts = dict(sorted(genre_counts.items()))

# Print one "genre: count" line per genre
for genre, count in genre_counts.items():
    print(f"{genre}: {count}")

Action: 1686
Adventure: 277
Animation: 40
Biography: 115
Comedy: 1344
Crime: 875
Documentary: 48
Drama: 3796
Family: 416
Fantasy: 146
History: 99
Horror: 202
Music: 53
Musical: 412
Mystery: 304
News: 1
Romance: 1380
Sci-Fi: 32
Sport: 40
Thriller: 679
War: 33
Western: 1


In [201]:
# Pie chart data: frequency of each full 'Genre' string. The strings are not
# split here, so a combination such as "Comedy, Romance" is its own category.
genres_pie = df['Genre'].value_counts()
genre_pie = pd.DataFrame({'Genre': genres_pie.index, 'Count': genres_pie.values})

In [202]:
# Count unique directors: number of distinct values in the 'Director' column
df["Director"].nunique()

2431

In [203]:
# Movies per director, most frequent first
directors = df['Director'].value_counts()

# Appearances per actor, pooling the three cast columns into one series
actor_columns = ['Actor 1', 'Actor 2', 'Actor 3']
actors = pd.concat([df[column] for column in actor_columns]).dropna().value_counts()

In [204]:
import warnings

# Disable all warnings: with no category or module given, this filter
# suppresses every warning raised from this point on.
# NOTE(review): this also hides pandas/sklearn warnings that may point at real
# problems — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [205]:
# Remove outliers in 'Duration' with the 1.5 * IQR rule
Q1, Q3 = df['Duration'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
# between() is inclusive at both ends, so values on the bounds are kept
df = df[df['Duration'].between(lower_bound, upper_bound)]


In [206]:
# Pie chart of genres: relabel every genre string seen fewer than 50 times as
# 'Other' so the rare ones share a single label
rare_genres = genre_pie['Count'] < 50
genre_pie.loc[rare_genres, 'Genre'] = 'Other'
ax = px.pie(
    data_frame=genre_pie,
    names='Genre',
    values='Count',
    title='More than one Genre of movies in Indian Cinema',
)
ax.show()

In [207]:
# Remove outliers in 'Rating' with the 1.5 * IQR rule
Q1 = df['Rating'].quantile(0.25)
Q3 = df['Rating'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# .copy() makes the filtered frame independent of the frame it was sliced
# from, so adding columns to df afterwards does not trigger pandas'
# SettingWithCopyWarning
df = df[(df['Rating'] >= lower_bound) & (df['Rating'] <= upper_bound)].copy()
df.head(16)

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1,#Gadhvi (He thought he was Gandhi),2019,109,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
3,#Yaaram,2019,110,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
5,...Aur Pyaar Ho Gaya,1997,147,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
6,...Yahaan,2005,142,"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma
8,?: A Question Mark,2012,82,"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia
9,@Andheri,2014,116,"Action, Crime, Thriller",4.0,11,Biju Bhaskar Nair,Augustine,Fathima Babu,Byon
10,1:1.6 An Ode to Lost Love,2004,96,Drama,6.2,17,Madhu Ambat,Rati Agnihotri,Gulshan Grover,Atul Kulkarni
11,1:13:7 Ek Tera Saath,2016,120,Horror,5.9,59,Arshad Siddiqui,Pankaj Berry,Anubhav Dhir,Hritu Dudani
12,100 Days,1991,161,"Horror, Romance, Thriller",6.5,983,Partho Ghosh,Jackie Shroff,Madhuri Dixit,Javed Jaffrey
13,100% Love,2012,166,"Comedy, Drama, Romance",5.7,512,Rabi Kinagi,Jeet,Koyel Mallick,Sujoy Ghosh


In [208]:
# Prepare data for modeling
# Join the three cast columns into a single "a, b, c" label per movie
df["Actor"] = df['Actor 1'].str.cat([df['Actor 2'], df['Actor 3']], sep=', ')
# Replace each text label with the integer code of its category
for code_column, text_column in (("Directors", 'Director'), ("Genres", 'Genre'), ("Actors", 'Actor')):
    df[code_column] = df[text_column].astype('category').cat.codes

In [209]:
# Remove outliers for 'Directors', 'Genres', and 'Actors' (1.5 * IQR rule).
# The columns are filtered one after another, so each column's quartiles are
# computed on the rows that survived the previous column's filter.
# NOTE(review): these columns hold category codes, i.e. label numbers rather
# than measurements — confirm that IQR filtering on them is intended.
for column in ['Directors', 'Genres', 'Actors']:
    Q1, Q3 = df[column].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    df = df[df[column].between(lower_bound, upper_bound)]

In [210]:
# Prepare input and output for model: the features are whatever columns remain
# after removing the target and the raw text columns
non_feature_columns = ['Name', 'Genre', 'Rating', 'Director', 'Actor 1', 'Actor 2', 'Actor 3', 'Actor']
Input = df.drop(columns=non_feature_columns)
Output = df['Rating']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    Input, Output, test_size=0.2, random_state=1
)

In [211]:
# Function to train and evaluate models
def train_and_evaluate_model(model, x_train, y_train, x_test, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        x_train: Training features.
        y_train: Training targets.
        x_test: Test features.
        y_test: Test targets.

    Returns:
        Tuple ``(mse, r2)``: the mean squared error and the R2 score of the
        model's predictions for ``x_test`` measured against ``y_test``.
    """
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    return mean_squared_error(y_test, predictions), r2_score(y_test, predictions)

In [212]:
# Initialize models
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest Regressor": RandomForestRegressor(),
    "Gradient Boosting Regressor": GradientBoostingRegressor(),
    "Decision Tree Regressor": DecisionTreeRegressor(),
    "XGB Regressor": XGBRegressor(),
    "LGBM Regressor": LGBMRegressor()
}

In [213]:
# Evaluate models
for name, model in models.items():
    mse, r2 = train_and_evaluate_model(model, x_train, y_train, x_test, y_test)
    print(f"{name}: MSE = {mse}, R2 = {r2}")

Linear Regression: MSE = 1.5921823814454168, R2 = 0.1055810895637932
Random Forest Regressor: MSE = 1.0952555067750678, R2 = 0.38473302528966913
Gradient Boosting Regressor: MSE = 1.063635429435944, R2 = 0.4024958114196683
Decision Tree Regressor: MSE = 2.1994399277326107, R2 = -0.23554982560892723
XGB Regressor: MSE = 1.1393197998784028, R2 = 0.3599796192189101
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000437 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1182
[LightGBM] [Info] Number of data points in the train set: 4425, number of used features: 6
[LightGBM] [Info] Start training from score 5.904429
LGBM Regressor: MSE = 1.0208772531904358, R2 = 0.4265154977667992
