TASK 2: MOVIE RATING PREDICTION WITH PYTHON

In [1]:
#importing important libraries
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
import plotly.express as px

Data collection and loading

In [2]:
# Read the raw IMDb India movies CSV from the current working directory.
movies = pd.read_csv(filepath_or_buffer='IMDb Movies India.csv')
# Show the first five rows as the cell output.
movies.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),-2019.0,109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,-2021.0,90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,-2019.0,110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,-2010.0,105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


Data Visualization



In [3]:
# Distribution of release years.
# The raw 'Year' column holds negative numbers (e.g. -2019.0), so the histogram
# is drawn from absolute values; plotting the raw column would show negative,
# mirrored years on the x-axis. `movies` itself is not modified here.
year = px.histogram(
    movies.assign(Year=movies['Year'].abs()),
    x='Year',
    histnorm='probability density',
    nbins=30,
)
# Title and axis labels so the figure can be read on its own.
year.update_layout(
    title='Distribution of Release Year',
    xaxis_title='Year',
    yaxis_title='Probability density',
)
year.show()

In [4]:
# Average rating for every (year, genre) pair.
# 'Year' is negative in the raw data, so absolute values are used for grouping;
# this keeps the time axis of the line plot positive and chronological.
# At this point 'Genre' is still the raw comma-separated string
# (e.g. "Drama, Musical"), so each distinct combination is its own group.
avg_rating_by_year = (
    movies.assign(Year=movies['Year'].abs())
    .groupby(['Year', 'Genre'])['Rating']
    .mean()
    .reset_index()
)

# The 10 genres that occur in the largest number of (year, genre) groups
top_genres = avg_rating_by_year['Genre'].value_counts().head(10).index

# Keep only the rows that belong to those top 10 genres
top_genres_data = avg_rating_by_year[avg_rating_by_year['Genre'].isin(top_genres)]

# One line per genre: average rating over time
fig = px.line(top_genres_data, x='Year', y='Rating', color='Genre')

# Title and axis labels
fig.update_layout(title='Average Rating by Year for Top Genres', xaxis_title='Year', yaxis_title='Average Rating')

# Show plot
fig.show()


In [5]:
# Distribution of the target variable 'Rating'.
# NOTE: the figure is bound to the name `year`, which the release-year
# histogram cell also uses; the name is kept unchanged for compatibility.
year = px.histogram(movies, x='Rating', histnorm='probability density', nbins=30)
# The y-axis label matches the 'probability density' normalisation used above
# (the previous label was misspelled as 'Probablity').
year.update_layout(
    title='Distribution of Rating',
    xaxis_title='Rating',
    yaxis_title='Probability density',
)
year.show()

Data preprocessing

In [6]:
# Number of missing values in each column.
movies.isna().sum()

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [7]:
# Discard every row that has a missing value in any column.
movies = movies.dropna(axis=0, how='any')

In [8]:
# Re-count missing values per column to confirm none remain.
movies.isna().sum()

Name        0
Year        0
Duration    0
Genre       0
Rating      0
Votes       0
Director    0
Actor 1     0
Actor 2     0
Actor 3     0
dtype: int64

In [9]:
# Print the (rows, columns) shape and then the total number of cells, one per line.
print(movies.shape, movies.size, sep='\n')

(5659, 10)
56590


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
movies.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Year,Rating
count,5659.0,5659.0
mean,-1996.24757,5.898533
std,19.741839,1.381165
min,-2021.0,1.1
25%,-2013.0,5.0
50%,-2002.0,6.1
75%,-1983.0,6.9
max,-1931.0,10.0


In [11]:
# Years are stored as negative floats: cast to integers and keep the magnitude.
movies['Year'] = movies['Year'].astype(int).abs()

In [12]:
# Convert 'Duration' from text such as '109 min' to a plain number of minutes.
# Casting to str first keeps the cell safe to re-run: once the column is
# numeric, calling .str on it directly raises AttributeError.
# regex=False makes 'min' a literal match regardless of the pandas version's default.
movies['Duration'] = pd.to_numeric(
    movies['Duration'].astype(str).str.replace('min', '', regex=False).str.strip()
)

In [13]:
# One row per (movie, genre): split the comma-separated genre string into a
# list, then explode so each list element gets its own row. The exploded rows
# keep the original index label, so index values repeat.
movies['Genre'] = movies['Genre'].str.split(', ')
movies = movies.explode('Genre')
# Fill any remaining missing genre with the most frequent one. The result is
# assigned back to the column instead of calling fillna(inplace=True) on the
# column selection, which is a chained in-place update that pandas deprecates
# and that does not modify the frame under Copy-on-Write.
movies['Genre'] = movies['Genre'].fillna(movies['Genre'].mode()[0])

In [14]:
# Convert 'Votes' from text with thousands separators (commas) to numbers.
# Casting to str first keeps the cell safe to re-run: once the column is
# numeric, calling .str on it directly raises AttributeError.
# regex=False makes ',' a literal match regardless of the pandas version's default.
movies['Votes'] = pd.to_numeric(
    movies['Votes'].astype(str).str.replace(',', '', regex=False).str.strip()
)

In [15]:
# Remove the movie title column; it is not used as a feature for the model.
movies = movies.drop('Name', axis=1)

Splitting the dataset into training and testing data

In [16]:
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_absolute_error,mean_squared_error,r2_score

In [17]:
# Mean-encode the categorical columns: each row receives the average 'Rating'
# of all rows that share its value in the source column, stored under a new
# feature column.
# NOTE(review): these averages are taken over the whole frame, before the
# train/test split that happens later, so ratings of rows that end up in the
# test split (and each row's own rating) feed into the features. Consider
# computing the averages on the training split only.
mean_rating_features = {
    'Genre': 'Genre_mean_rating',
    'Director': 'Director_mean_rating',
    'Actor 1': 'Actor_1_mean_rating',
    'Actor 2': 'Actor_2_mean_rating',
    'Actor 3': 'Actor_3_mean_rating',
}

for source_col, feature_col in mean_rating_features.items():
    movies[feature_col] = movies.groupby(source_col)['Rating'].transform('mean')



In [18]:
# Columns used as model inputs: three numeric columns plus the five
# mean-rating features.
feature_columns = [
    'Year',
    'Votes',
    'Duration',
    'Genre_mean_rating',
    'Director_mean_rating',
    'Actor_1_mean_rating',
    'Actor_2_mean_rating',
    'Actor_3_mean_rating',
]
x = movies[feature_columns]  # predictors
y = movies['Rating']  # target variable

In [19]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

Linear regression and model training


In [20]:
# Fit a linear regression on the training split (fit returns the estimator
# itself), then predict ratings for the held-out rows.
model = LinearRegression().fit(x_train, y_train)
model_pred = model.predict(x_test)

In [21]:
# Regression metrics on the held-out split.
# The fitted model is a LinearRegression, so the banner says "linear
# regression" (it previously said "logistic regression", which is a
# classification method and not what is used here).
print('The performance evaluation of linear regression is below : \n')
print('Mean squared error: ', mean_squared_error(y_test, model_pred))
print('Mean absolute error: ', mean_absolute_error(y_test, model_pred))
print('R2 score: ', r2_score(y_test, model_pred))

The performance evaluation of logistic regression is below : 

Mean squared error:  0.4465441653985702
Mean absolute error:  0.4921902540765641
R2 score:  0.7641133663863862


In [22]:
# Preview the first five target values (index labels repeat because of the genre explode).
y.head()

1    7.0
3    4.4
3    4.4
5    4.7
5    4.7
Name: Rating, dtype: float64

In [23]:
# Preview the first five rows of the predictor frame.
x.head()

Unnamed: 0,Year,Votes,Duration,Genre_mean_rating,Director_mean_rating,Actor_1_mean_rating,Actor_2_mean_rating,Actor_3_mean_rating
1,2019,8,109,6.056744,7.0,6.85,7.0,7.0
3,2019,35,110,5.751042,4.4,5.25,4.4,4.46
3,2019,35,110,5.811087,4.4,5.25,4.4,4.46
5,1997,827,147,5.751042,5.335135,4.793617,5.73,5.93
5,1997,827,147,6.056744,5.335135,4.793617,5.73,5.93


In [26]:
# A single hand-built observation with the same eight feature columns, in the
# same order, as the predictor frame used to train the model.
data = {
    'Year': [2019],
    'Votes': [31],
    'Duration': [110],
    'Genre_mean_rating': [4.15],
    'Director_mean_rating': [4.40],
    'Actor_1_mean_rating': [5.25],
    'Actor_2_mean_rating': [4.40],
    'Actor_3_mean_rating': [4.46],
}
new_data = pd.DataFrame.from_dict(data)

In [27]:
# Score the hand-built observation with the fitted model and print the
# resulting array of predictions.
rating_prediction = model.predict(X=new_data)
print("Predicted Rating: ", rating_prediction)

Predicted Rating:  [4.05446545]
