In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [8]:
# Load the raw movie table from CSV, decoding the file as ISO-8859-1.
# NOTE(review): this is an absolute path (looks like a Colab upload location) —
# confirm, or make it configurable before running elsewhere.
DATA_PATH = '/content/IMDb Movies India.csv'
df_movie = pd.read_csv(DATA_PATH, encoding='ISO-8859-1')

In [None]:
# Preview the first rows of the table to check column names and raw value formats
# (e.g. 'Year' arrives as "(2019)" and 'Duration' as "109 min").
df_movie.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
5,...Aur Pyaar Ho Gaya,(1997),147 min,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
6,...Yahaan,(2005),142 min,"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma
8,?: A Question Mark,(2012),82 min,"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia


In [18]:
# Report the dimensions of 'df_movie' as a (rows, columns) tuple.
# NOTE(review): this cell's execution count (18) is later than the dropna cell's (15),
# so the recorded output presumably describes the already-filtered table rather than
# the raw file — re-run the notebook top to bottom to confirm.
df_movie.shape

(5659, 10)

In [None]:
# Summarise each column's dtype and non-null count, plus the frame's memory usage.
df_movie.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5659 entries, 1 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      5659 non-null   object 
 1   Year      5659 non-null   object 
 2   Duration  5659 non-null   object 
 3   Genre     5659 non-null   object 
 4   Rating    5659 non-null   float64
 5   Votes     5659 non-null   object 
 6   Director  5659 non-null   object 
 7   Actor 1   5659 non-null   object 
 8   Actor 2   5659 non-null   object 
 9   Actor 3   5659 non-null   object 
dtypes: float64(1), object(9)
memory usage: 486.3+ KB


In [17]:
# Per-column tally of missing (null/NaN) entries in 'df_movie'.
df_movie.isna().sum()

Unnamed: 0,0
Name,0
Year,0
Duration,0
Genre,0
Rating,0
Votes,0
Director,0
Actor 1,0
Actor 2,0
Actor 3,0


In [16]:
# Count rows of 'df_movie' that are exact duplicates of an earlier row
# (all columns compared; the first occurrence is not counted).
df_movie.duplicated().sum()

np.int64(0)

In [15]:
# Drop every row of 'df_movie' that has a missing (NaN) value in any column.
# The frame is modified in place, so the original row index labels are kept
# (they are no longer contiguous afterwards).
df_movie.dropna(inplace= True)

In [14]:
# Re-check the (rows, columns) dimensions of 'df_movie' to see how many rows
# remain once rows with missing values have been dropped.
df_movie.shape

(5659, 10)

In [19]:
# Sanity check: every column should now report zero missing values.
df_movie.isna().sum()

Unnamed: 0,0
Name,0
Year,0
Duration,0
Genre,0
Rating,0
Votes,0
Director,0
Actor 1,0
Actor 2,0
Actor 3,0


In [20]:
# List the column labels available for preprocessing and feature engineering.
df_movie.columns

Index(['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director',
       'Actor 1', 'Actor 2', 'Actor 3'],
      dtype='object')

**DATA PREPROCESSING**

In [21]:
# Strip the parentheses from 'Year' (e.g. "(2019)" -> 2019) and store it as an integer.
# The column is cast to str first so this cell can be re-run after 'Year' has already
# been converted: the .str accessor raises on a non-string (integer) column.
df_movie['Year'] = (
    df_movie['Year']
    .astype(str)
    .str.replace(r'[()]', '', regex=True)
    .astype(int)
)

In [22]:
# Remove the literal 'min' unit from 'Duration' (e.g. "109 min") and parse what is left
# as a number. The column is cast to str first so this cell can be re-run after
# 'Duration' is already numeric: the .str accessor raises on a non-string column.
df_movie['Duration'] = pd.to_numeric(
    df_movie['Duration'].astype(str).str.replace('min', '', regex=False)
)

In [None]:
# NOTE(review): exploding turns every multi-genre movie into one row per genre, all
# sharing the same index label. The recorded info() output further down still shows
# the pre-explode row count, so this cell was presumably not run in the recorded
# session — confirm whether it is meant to be part of the pipeline.

# Step 1: Split the comma-separated 'Genre' string into a list of genres
df_movie['Genre'] = df_movie['Genre'].str.split(', ')

# Step 2: Explode the lists so that each genre gets its own row
df_movie = df_movie.explode('Genre')

# Step 3: Fill any missing 'Genre' values with the most frequent one.
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the chained in-place form triggers a pandas FutureWarning and stops
# updating the DataFrame in pandas 3.0.
df_movie['Genre'] = df_movie['Genre'].fillna(df_movie['Genre'].mode()[0])


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_movie['Genre'].fillna(df_movie['Genre'].mode()[0], inplace=True)


In [23]:
# Remove thousands separators from 'Votes' (e.g. "1,086" -> 1086) and parse it as a number.
# The column is cast to str first so this cell can be re-run after 'Votes' is already
# numeric: the .str accessor raises on a non-string column.
df_movie['Votes'] = pd.to_numeric(
    df_movie['Votes'].astype(str).str.replace(',', '', regex=False)
)

In [24]:
# Re-inspect dtypes and non-null counts to confirm that 'Year', 'Duration' and 'Votes'
# are now numeric columns rather than strings.
df_movie.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5659 entries, 1 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      5659 non-null   object 
 1   Year      5659 non-null   int64  
 2   Duration  5659 non-null   int64  
 3   Genre     5659 non-null   object 
 4   Rating    5659 non-null   float64
 5   Votes     5659 non-null   int64  
 6   Director  5659 non-null   object 
 7   Actor 1   5659 non-null   object 
 8   Actor 2   5659 non-null   object 
 9   Actor 3   5659 non-null   object 
dtypes: float64(1), int64(3), object(6)
memory usage: 486.3+ KB


**DATA VISUALIZATION**

In [25]:
# Histogram of release years, normalised to a probability density, in 30 bins.
year = px.histogram(df_movie, x='Year', histnorm='probability density', nbins=30)

# Give the figure a title and axis labels so it can be read on its own,
# matching the other figures in this notebook.
year.update_layout(
    title='Distribution of Release Year',
    title_x=0.5,
    xaxis_title='Year',
    yaxis_title='Probability Density',
)
year.show()

In [None]:
# The ten genre values that occur most often; only these are plotted.
top_genres = df_movie['Genre'].value_counts().head(10).index

# Mean rating for every (Year, Genre) pair, returned as a flat table.
avg_rating_by_year = df_movie.groupby(['Year', 'Genre'], as_index=False)['Rating'].mean()

# Keep only the rows that belong to one of the top genres.
keep_mask = avg_rating_by_year['Genre'].isin(top_genres)
average_rating_by_year = avg_rating_by_year[keep_mask]

# One line per genre: average rating against release year.
fig = px.line(average_rating_by_year, x='Year', y='Rating', color='Genre')
fig.update_layout(
    title='Average Rating by Year for Top Genres',
    xaxis_title='Year',
    yaxis_title='Average Rating',
)
fig.show()


In [26]:
# Histogram of movie ratings, normalised to a probability density, in 40 bins.
rating_fig = px.histogram(
    df_movie,
    x='Rating',
    histnorm='probability density',
    nbins=40,
)

# Centre the title and label both axes.
rating_fig.update_layout(
    title='Distribution of Rating',
    title_x=0.5,
    title_pad={'t': 20},
    title_font={'size': 20},
    xaxis_title='Rating',
    yaxis_title=' Probability Density',
)
rating_fig.show()

**FEATURE ENGINEERING**

In [28]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_absolute_error, r2_score

In [29]:
# Drop the 'Name' column from 'df_movie' in place; it is not used as a model feature.
# errors='ignore' makes the cell safe to re-run: without it, a second run raises
# KeyError because 'Name' has already been removed.
df_movie.drop(columns=['Name'], errors='ignore', inplace=True)

In [30]:
# Mean-rating encoding: for each categorical column, add a numeric column holding the
# average 'Rating' of all rows that share the same category value.
# NOTE(review): the means are computed over all of 'df_movie', i.e. before the
# train/test split, so every row's own rating (and the test rows' ratings) feed into
# its features. For a category that occurs once the encoded value equals the target.
# Consider fitting the encoding on the training split only.
ENCODED_COLUMNS = {
    'Genre': 'Genre_mean_rating',
    'Director': 'Director_encoded',
    'Actor 1': 'Actor1_encoded',
    'Actor 2': 'Actor2_encoded',
    'Actor 3': 'Actor3_encoded',
}

# One pass per column replaces five copy-pasted stanzas; the dict order fixes the
# order in which the new columns are appended.
for source_col, encoded_col in ENCODED_COLUMNS.items():
    df_movie[encoded_col] = df_movie.groupby(source_col)['Rating'].transform('mean')


In [31]:
# Columns fed to the model, in the order the model will see them.
# NOTE(review): 'Director_encoded' is computed during feature engineering but is not
# listed here — confirm whether leaving it out is intentional.
FEATURE_COLUMNS = [
    'Year',
    'Votes',
    'Duration',
    'Genre_mean_rating',
    'Actor1_encoded',
    'Actor2_encoded',
    'Actor3_encoded',
]

# Feature matrix and regression target (the movie rating).
x = df_movie[FEATURE_COLUMNS]
y = df_movie['Rating']

In [32]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

**MODEL BUILDING**

In [33]:
# Fit an ordinary least-squares linear regression on the training split
# (fit() returns the estimator itself), then predict ratings for the test split.
Model = LinearRegression().fit(x_train, y_train)
Model_pred = Model.predict(x_test)


In [34]:
# Report test-set error metrics for the fitted model.
# The heading names Linear Regression, which is the estimator actually trained above
# (it previously said "Logistic Regression").
print('The performance evaluation of Linear Regression is below : ','\n')
print('Mean square error : ',mean_squared_error(y_test,Model_pred))
print('Mean absolute error : ',mean_absolute_error(y_test,Model_pred))
print('R2 score : ', r2_score(y_test,Model_pred))

The performance evaluation of Logistic Regression is below :  

Mean square error :  0.4975683021963814
Mean absolute error :  0.5259067325094096
R2 score :  0.7312974162293233


**MODEL TESTING**

In [35]:
# Peek at the first five rows of the feature matrix (five is head()'s default).
x.head()

Unnamed: 0,Year,Votes,Duration,Genre_mean_rating,Actor1_encoded,Actor2_encoded,Actor3_encoded
1,2019,8,109,6.415521,6.85,7.0,7.0
3,2019,35,110,5.716822,5.42,4.4,4.45
5,1997,827,147,6.242222,4.788889,5.786667,5.872727
6,2005,1086,142,6.82,5.435,6.933333,6.5
8,2012,326,82,5.477778,5.6,5.883333,5.6


In [36]:
# Peek at the first five target ratings (five is head()'s default).
y.head()

Unnamed: 0,Rating
1,7.0
3,4.4
5,4.7
6,7.4
8,5.6


In [None]:

# Hand-written feature values for one hypothetical movie (a single-row table).
# NOTE(review): 'Director_encoded' is supplied here but is dropped by the column
# filter below whenever the fitted model was not trained on it.
data = {
    'Year': [2019],
    'Votes': [35],
    'Duration': [111],
    'Genre_mean_rating': [5.8],
    'Director_encoded': [4.5],
    'Actor1_encoded': [5.2],
    'Actor2_encoded': [4.4],
    'Actor3_encoded': [4.4]
}
trail = pd.DataFrame(data)

# Keep only the columns the model was fitted on, arranged in the fitted order.
model_features = [name for name in Model.feature_names_in_ if name in trail.columns]
trail = trail[model_features]

# Predict the rating for the single input row and show it.
rating_predicted = Model.predict(trail)
print("Predicted Rating:", rating_predicted[0])

Predicted Rating: 4.22622986440074
