# MOVIE RATING PREDICTION WITH PYTHON

CodSoft Data Science Internship – Task 2

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Reading Data

In [None]:
# Load the raw IMDb India movie table; the file is read with latin1 encoding.
csv_path = 'IMDb Movies India.csv'
data = pd.read_csv(csv_path, encoding='latin1')

In [None]:
# (rows, columns) of the raw table as loaded.
data.shape

In [None]:
# Preview the first five rows.
data.head(n=5)

In [None]:
# Preview the last five rows.
data.tail(n=5)

In [None]:
# Column dtypes and non-null counts before any cleaning.
data.info()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Rows missing the target or any of the categorical predictors are discarded
# in place; other columns may still contain missing values afterwards.
required_columns = ['Rating', 'Actor 1', 'Actor 2', 'Actor 3', 'Director', 'Genre']
data.dropna(subset=required_columns, inplace=True)

In [None]:
# Re-check the per-column missing counts after dropping incomplete rows.
data.isnull().sum()

In [None]:
# (rows, columns) remaining after the incomplete rows were dropped.
data.shape

In [None]:
# Preview the first five remaining rows.
data.head(n=5)

In [None]:
# Remove the surrounding parentheses from the year text, then store it as int.
year_text = data['Year'].str.strip('()')
data['Year'] = year_text.astype(int)

In [None]:
# Remove the thousands separators from the vote counts, then store them as int.
votes_text = data['Votes'].str.replace(',', '')
data['Votes'] = votes_text.astype(int)

In [None]:
# Remove the literal " min" unit from the duration text. `str.strip(' min')`
# would treat its argument as a set of characters (' ', 'm', 'i', 'n') to trim
# from both ends rather than as a suffix, so the unit is removed explicitly.
# Missing durations stay NaN and the column stays text at this point.
data['Duration'] = data['Duration'].str.replace(' min', '', regex=False).str.strip()

In [None]:
# Preview the first five rows after the Year/Votes/Duration cleanup.
data.head(n=5)

In [None]:
# Impute missing durations with random integers and keep the observed values
# aside for the distribution comparison.

# Untouched copy of the column (still text, NaN where the duration is unknown).
data['Duration_copy'] = data['Duration']
mask = data['Duration'].isnull()

# np.random.randint excludes the upper bound, so the draws lie in 90..189.
# NOTE: the generator is not seeded, so the imputed values differ between runs.
random_values = np.random.randint(90, 190, size=mask.sum())

# Assign through a single .loc call. The chained form data['Duration'][mask] = ...
# writes into a temporary object: it raises SettingWithCopyWarning and has no
# effect at all once pandas copy-on-write is enabled.
data.loc[mask, 'Duration'] = random_values

# Observed (non-imputed) durations as integers.
orignal_duration = data.loc[data['Duration_copy'].notna(), 'Duration_copy'].astype(int)
data['Duration'] = data['Duration'].astype(int)

In [None]:
# Compare three views of the movie-duration distribution:
#   1. observed durations only,
#   2. missing durations filled with the observed mean,
#   3. missing durations filled with the random draws stored in 'Duration'.
sns.set_style('darkgrid')
plt.figure(figsize=(15, 10))

plt.subplot(2, 2, 1)
sns.histplot(data=orignal_duration, bins=20, kde=True)
plt.xlabel('Duration in minutes')
plt.title('original distribution of duration of movies')

plt.subplot(2, 2, 2)
mean_filled = data['Duration_copy'].fillna(orignal_duration.mean()).astype(int)
sns.histplot(data=data, x=mean_filled, bins=20, kde=True)
plt.xlabel('Duration in minutes')
plt.title('missing values filled with mean')

plt.subplot(2, 2, 3)
sns.histplot(data=data, x='Duration', bins=20, kde=True)
plt.xlabel('Duration in minutes')
# The imputation draws from np.random.randint(90, 190), i.e. 90..189 inclusive.
plt.title('missing values filled with random values between 90 and 189')

plt.show()

In [None]:
# The helper column was only needed for the distribution comparison.
data.drop('Duration_copy', axis=1, inplace=True)

In [None]:
# Column dtypes and non-null counts after cleaning.
data.info()

## Exploratory Data Analysis

In [None]:
# Ten highest-rated movies: rank the ratings, then pull the matching rows.
ratings_ranked = data['Rating'].sort_values(ascending=False)
top_movie = data.loc[ratings_ranked.index[:10]]
top_movie

In [None]:
# Highest-rated movie of each release year.
data.groupby('Year').apply(lambda movies: movies.nlargest(n=1, columns=['Rating']))

In [None]:
# Line chart of the mean rating per release year.
sns.set_style('darkgrid')
yearly_rating = data.groupby('Year')[['Rating']].mean()
rating_axis = yearly_rating.plot(figsize=(15, 5))
rating_axis.set_xlabel('Year')
rating_axis.set_ylabel('Rating')
rating_axis.set_title('Average movie ratings by year')
rating_axis.set_xticks(np.arange(1917, 2023, 5))
rating_axis.set_xlim(1917, 2023)
plt.show()

In [None]:
# Average votes (left axis) and average rating (right axis) per year on one chart.
fig, ax1 = plt.subplots(figsize=(15, 6))
ax2 = ax1.twinx()

# Left axis: votes.
sns.lineplot(
    data=data, x='Year', y='Votes', errorbar=None,
    ax=ax1, label='Average Votes', color='#2ca02c',
)
ax1.set_xlabel('Year')
ax1.set_ylabel('Average Votes')
ax1.set_xlim(1917, 2023)
ax1.set_ylim(0, 10000)
ax1.set_xticks(np.arange(1917, 2023, 5))

# Right axis: rating.
sns.lineplot(
    data=data, x='Year', y='Rating', errorbar=None,
    ax=ax2, color='#17becf', label='Average Rating',
)
ax2.set_ylabel('Average Rating')
ax2.set_ylim(4, 8)

# One legend listing the lines of both axes.
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(lines + lines2, labels + labels2, loc='upper left')
plt.show()

In [None]:
# Line chart of how many movies were released in each year.
sns.set_style('darkgrid')
movies_per_year = data.groupby(['Year'])['Name'].count()
count_axis = movies_per_year.plot(figsize=(15, 5))
count_axis.set_xlabel('Year')
count_axis.set_ylabel('Number of movies')
count_axis.set_title('Number of movies released every year')
count_axis.set_ylim(0, 250)
count_axis.set_xlim(1917, 2023)
count_axis.set_xticks(np.arange(1917, 2023, 5))
plt.show()

In [None]:
# Movies released per year (left axis) against mean rating per year (right axis).
sns.set_style('darkgrid')
fig, ax1 = plt.subplots(figsize=(15, 6))

movies_per_year = data.groupby(['Year'])['Name'].count()
movies_per_year.plot(ax=ax1, label='Number of movies')
ax1.set_xlabel('Year')
ax1.set_ylabel('Number of movies')
ax1.set_ylim(0, 250)
ax1.set_xlim(1917, 2023)
ax1.set_xticks(np.arange(1917, 2023, 5))

ax2 = ax1.twinx()
yearly_rating = data.groupby('Year')[['Rating']].mean()
yearly_rating.plot(ax=ax2, color='#17becf', label='Average rating')
ax2.set_ylabel('Average Rating')

# One legend listing the lines of both axes.
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(lines + lines2, labels + labels2, loc='upper left')
plt.show()

In [None]:
# Mean vote count at each rating value.
sns.set_style('darkgrid')
plt.figure(figsize=(15, 5))
votes_axis = sns.lineplot(data=data, x='Rating', y='Votes', errorbar=None)
votes_axis.set_xlabel('Rating')
votes_axis.set_ylabel('Average Votes')
votes_axis.set_xticks(np.arange(0, 10.5, 0.5))
votes_axis.set_title('Average votes for each rating')
plt.show()

In [None]:
# Bar chart of movies rated above 8 that also received more than 10000 votes.
sns.set_style('darkgrid')
highly_rated = data['Rating'] > 8
widely_voted = data['Votes'] > 10000
d = data.loc[highly_rated & widely_voted, ['Rating', 'Votes', 'Name']]

plt.figure(figsize=(15, 6))
ax = sns.barplot(
    data=d, x='Name', y='Votes', hue='Rating',
    dodge=False, width=0.5, palette='muted',
)
# Movie names are long, so the tick labels are drawn vertically.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha='right')
ax.legend(loc='upper right')
ax.set_xlabel('Movie Name')
ax.set_ylabel('Votes')
ax.set_title('Movies with rating greater than 8 and votes greater than 10000')
plt.show()

In [None]:
# Mean movie duration per release year.
sns.set_style('darkgrid')
plt.figure(figsize=(15, 6))
duration_axis = sns.lineplot(data=data, x='Year', y='Duration', errorbar=None)
duration_axis.set_xlabel('Year')
duration_axis.set_ylabel('Duration in minutes')
duration_axis.set_title('Duration of movies by year')
duration_axis.set_xticks(np.arange(1917, 2023, 5))
plt.show()

In [None]:
# Average duration (left axis) and average rating (right axis) per year.
fig, ax1 = plt.subplots(figsize=(15, 6))
ax2 = ax1.twinx()

sns.lineplot(
    data=data, x='Year', y='Duration', errorbar=None,
    ax=ax1, label='Average Duration',
)
ax1.set_xlabel('Year')
ax1.set_ylabel('Average Duration')
ax1.set_xlim(1917, 2023)
ax1.set_xticks(np.arange(1917, 2023, 5))

sns.lineplot(
    data=data, x='Year', y='Rating', errorbar=None,
    ax=ax2, color='red', label='Average Rating',
)
ax2.set_ylabel('Average Rating')

# One legend listing the lines of both axes.
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(lines + lines2, labels + labels2, loc='upper right')
plt.show()

In [None]:
# Mean rating (rounded to one decimal) and number of movies for every genre string.
genre_data = data.groupby('Genre')['Rating'].agg(['mean', 'count']).reset_index()
genre_data.columns = ['Genre', 'Average Rating', 'Movie Count']
genre_data['Average Rating'] = genre_data['Average Rating'].round(1)
genre_data

In [None]:
# Lookup table: genre string -> rounded mean rating of that genre.
genre_dict = genre_data.set_index('Genre')['Average Rating'].to_dict()

In [None]:
# Mean rating (rounded to one decimal) and movie count per director,
# ordered from the most to the least prolific.
directors = data.groupby('Director')['Rating'].agg(['mean', 'count']).reset_index()
directors.columns = ['Director', 'Average Rating', 'Movie count']
directors['Average Rating'] = directors['Average Rating'].round(1)
directors = directors.sort_values(by='Movie count', ascending=False)
directors.head()

In [None]:
# Lookup table: director name -> rounded mean rating of their movies.
directors_dict = directors.set_index('Director')['Average Rating'].to_dict()

In [None]:
# Stack the three actor columns into one so that every (movie, actor) pair is
# a row, then compute each actor's mean rating and number of appearances.
actor_columns = ['Actor 1', 'Actor 2', 'Actor 3']
data_melted = data.melt(
    id_vars='Rating',
    value_vars=actor_columns,
    var_name='role',
    value_name='actor',
)
actor_scores = data_melted.groupby('actor')['Rating'].agg(['mean', 'count']).reset_index()
actor_scores.columns = ['Actor', 'Average Score', 'Number of movies']
actor_scores['Average Score'] = actor_scores['Average Score'].round(1)
actor_scores = actor_scores.sort_values('Number of movies', ascending=False)
actor_scores

In [None]:
# Lookup table: actor name -> rounded mean rating over the movies they appear in.
actor_score_dict = actor_scores.set_index('Actor')['Average Score'].to_dict()

## Data Preprocessing

In [None]:
# Histogram (with KDE overlay) of every numeric column, laid out on a square grid.
num_columns = list(data.select_dtypes(include=np.number).columns)

# Smallest square grid that holds every column. The previous len//2 sizing
# produced a 2x2 grid for five columns and silently skipped the fifth.
num = max(1, int(np.ceil(np.sqrt(len(num_columns)))))

# squeeze=False keeps `ax` two-dimensional even for a 1x1 grid.
fig, ax = plt.subplots(num, num, figsize=(12, 10), squeeze=False)
cells = ax.ravel()  # row-major order: fill left to right, top to bottom

# Pair columns with grid cells explicitly instead of relying on a bare
# `except:` around pop(0), which also swallowed genuine plotting errors.
for column, cell in zip(num_columns, cells):
    sns.histplot(data=data, x=column, kde=True, bins=20, ax=cell)

# Remove the grid cells that have no column to display.
for cell in cells[len(num_columns):]:
    fig.delaxes(cell)

fig.suptitle('Histograms of numerical columns', fontsize=16)
plt.show()

In [None]:
# Box plot of every numeric column, laid out on a square grid, to expose outliers.
num_columns = list(data.select_dtypes(include=np.number).columns)

# Smallest square grid that holds every column. The previous len//2 sizing
# produced a 2x2 grid for five columns and silently skipped the fifth.
num = max(1, int(np.ceil(np.sqrt(len(num_columns)))))

# squeeze=False keeps `ax` two-dimensional even for a 1x1 grid.
fig, ax = plt.subplots(num, num, figsize=(12, 10), squeeze=False)
cells = ax.ravel()  # row-major order: fill left to right, top to bottom

# Pair columns with grid cells explicitly instead of relying on a bare
# `except:` around pop(0), which also swallowed genuine plotting errors.
for column, cell in zip(num_columns, cells):
    sns.boxplot(data=data, x=column, ax=cell)

# Remove the grid cells that have no column to display.
for cell in cells[len(num_columns):]:
    fig.delaxes(cell)

fig.suptitle('Boxplots to show outliers', fontsize=16)
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler,FunctionTransformer,RobustScaler,PowerTransformer,QuantileTransformer
# Numeric columns only (the Rating target is numeric, so it is included too).
num_data = data.select_dtypes(include=[np.number])
num_data

In [None]:
# Power-transform every numeric column with scikit-learn's PowerTransformer
# (default settings).
pt = PowerTransformer()
transformed = pt.fit_transform(num_data)

# fit_transform returns a bare array. Re-attach the original row index as well
# as the column names: without index= the frame gets a fresh 0..n-1 index, and
# since rows were dropped from `data` earlier, it would no longer align with
# `data` on joins or assignments.
num_data_pt = pd.DataFrame(transformed, columns=num_data.columns, index=num_data.index)

## Model Building

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

In [None]:
# Build the modelling table: drop the free-text title and replace every
# categorical column with the mean-rating score stored in its lookup dict.
data_2 = data.drop(columns=['Name'])
score_lookups = {
    'Genre': genre_dict,
    'Director': directors_dict,
    'Actor 1': actor_score_dict,
    'Actor 2': actor_score_dict,
    'Actor 3': actor_score_dict,
}
for column, lookup in score_lookups.items():
    data_2[column] = data_2[column].map(lookup)
data_2

In [None]:
# Fit a linear regression on an 80/20 hold-out split and report test metrics.
#
# The categorical columns are encoded as mean ratings. Those means must be
# computed from the training rows only: an encoding built from the full table
# (as in data_2) already contains the test movies' own ratings, which leaks
# the target into the test features and inflates the reported scores.
actor_columns = ['Actor 1', 'Actor 2', 'Actor 3']
encoded_columns = ['Genre', 'Director'] + actor_columns

X = data.drop(columns=['Name', 'Rating'])
y = data['Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Mean rating per genre / director / actor, learned from the training rows only.
train_rows = X_train.assign(Rating=y_train)
genre_means = train_rows.groupby('Genre')['Rating'].mean()
director_means = train_rows.groupby('Director')['Rating'].mean()
actor_means = (
    train_rows
    .melt(id_vars='Rating', value_vars=actor_columns, value_name='actor')
    .groupby('actor')['Rating']
    .mean()
)
# Categories that never occur in the training rows fall back to the overall
# training mean instead of becoming NaN.
fallback_rating = y_train.mean()


def _encode_with_train_means(frame):
    """Return a copy of *frame* with categorical columns replaced by training-set mean ratings."""
    encoded = frame.copy()
    encoded['Genre'] = encoded['Genre'].map(genre_means)
    encoded['Director'] = encoded['Director'].map(director_means)
    for column in actor_columns:
        encoded[column] = encoded[column].map(actor_means)
    encoded[encoded_columns] = encoded[encoded_columns].fillna(fallback_rating)
    return encoded


X_train = _encode_with_train_means(X_train)
X_test = _encode_with_train_means(X_test)

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print('Mean squared error: ', mean_squared_error(y_test, y_pred))
print('Mean absolute error: ', mean_absolute_error(y_test, y_pred))
print('R2 score: ', r2_score(y_test, y_pred))

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn import svm
# 5-fold cross-validation of the linear regression on the encoded table.
# NOTE: the Genre/Director/Actor columns of data_2 come from genre_dict,
# directors_dict and actor_score_dict, which were computed from the ratings of
# all rows, so every validation fold has already influenced its own features
# and these scores are optimistic.
X = data_2.drop('Rating', axis=1)
y = data_2['Rating']
lr = LinearRegression()

# For a regressor the default score is R^2, not accuracy; request it explicitly
# and label the output accordingly.
scores = cross_val_score(lr, X, y, cv=5, scoring='r2')
print(f"{scores.mean():0.2f} R2 score with a standard deviation of {scores.std():0.2f}")