In [1]:
# basic libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import warnings
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud

In [2]:
# avoiding warnings display
warnings.filterwarnings('ignore')

In [3]:
# Read the CSV once and analyse a copy, so the freshly loaded `data`
# frame is never modified by the cleaning steps applied to `df`.
data = pd.read_csv('movies.csv')
df = data.copy()
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'movies.csv'

In [None]:
# Drop the 'index' column that was read from the CSV.
# `errors='ignore'` keeps this cell re-runnable: once the column is gone,
# executing the cell again is a no-op instead of raising KeyError.
df = df.drop(columns='index', errors='ignore')

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
df.info()

**This is a very small dataset containing details of 99 movies; it has 8 columns and 99 rows.**

In [None]:
# (number of rows, number of columns) of the current frame.
df.shape

In [None]:
# Missing values per column (`isna` is the same check as `isnull`).
df.isna().sum()

> There is only one missing value, and it is in the gross total column.

In [None]:
# Remove every row that has a missing value in any column.
df = df.dropna()

In [None]:
# Re-check missing values per column after dropping incomplete rows.
df.isna().sum()

In [None]:
# checking for duplicated rows: True if at least one row exactly repeats an earlier row
df.duplicated().any()

**There are no duplicate rows.**

### Visualisation of categories

In [None]:
# Number of movies per category, most frequent first.
category = df['category'].value_counts(ascending=False)
name = category.index
count = category.values

# Bar plot of the category counts.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts the data
# vectors as positional arguments.
plt.figure(figsize=(9, 6))
sns.barplot(x=name, y=count)
plt.title('Category wise Distribution', fontdict={'family': 'times new roman', 'size': 20, 'color': 'red'})
plt.xlabel('Category', fontdict={'family': 'times new roman', 'size': 20, 'color': 'blue'})
plt.ylabel('Count', fontdict={'family': 'times new roman', 'size': 20, 'color': 'blue'})
plt.show()

In [None]:
# countplot / histplot give the distribution of a single categorical column directly.
# The column is passed as `x=`: in recent seaborn the first positional
# argument is `data`, not the variable to count.
sns.countplot(x=df['category'])

In [None]:
# Histogram of the category column (one bar per distinct category).
sns.histplot(x=df['category'])

### Year 

In [None]:
# Ten release years with the most movies; value_counts sorts descending by default.
year = df['year_of_release'].value_counts().head(10)
name, count = year.index, year.values

In [None]:
# Years with the highest number of movies in the dataset
plt.figure(figsize=(12, 5))
# x/y by keyword: seaborn >= 0.12 rejects positional data vectors
sns.barplot(x=name, y=count, palette='coolwarm')
plt.title('Top 10 years having most no of hit movies ', fontdict={'family': 'times new roman', 'size': 20, 'color': 'blue'})
plt.xlabel('Year', fontdict={'family': 'times new roman', 'size': 20, 'color': 'magenta'})
plt.xticks(rotation=90)
plt.ylabel('No of movies released', fontdict={'family': 'times new roman', 'size': 20, 'color': 'magenta'})
plt.show()

**The years 1995, 1994 and 1999 had the highest number of hit movies.**

In [None]:
# Distribution of all records of year_of_release
plt.figure(figsize=(15, 6))
# pass the column as `x=`; the first positional argument of countplot is `data`
sns.countplot(x=df['year_of_release'])
plt.xticks(rotation=90)
plt.show()

### Runtime

In [None]:
# Ten most frequent runtime values and how many movies share each of them.
runtime = df['run_time'].value_counts().head(10)
name, count = runtime.index, runtime.values
runtime

In [None]:
# Donut chart of the ten most frequent runtimes.
plt.figure(figsize=(7, 7))
# `rotatelabels` is an on/off switch: any truthy value rotates each label to its slice angle
plt.pie(count, labels=name, rotatelabels=True)
# a white disc drawn over the centre turns the pie into a donut
hole = plt.Circle((0, 0), 0.5, color='white')
plt.gcf().gca().add_artist(hole)
plt.title(
    "Runtime Distribution",
    loc='right',
    fontdict={'family': 'times new roman', 'size': 20, 'color': 'blue'},
)
plt.show()

In [None]:
# Bar plot: number of movies for each of the ten most frequent runtimes.
plt.figure(figsize=(12, 5))
# x/y by keyword: seaborn >= 0.12 rejects positional data vectors
sns.barplot(x=name, y=count, palette='coolwarm')
plt.title('Run Time vs No of Release ', fontdict={'family': 'times new roman', 'size': 20, 'color': 'blue'})
plt.xlabel('Runtime', fontdict={'family': 'times new roman', 'size': 20, 'color': 'magenta'})
plt.xticks(rotation=90)
plt.ylabel('No of movies released', fontdict={'family': 'times new roman', 'size': 20, 'color': 'magenta'})
plt.show()

In [None]:
# countplot gives a better view of the distribution of all run_time records
plt.figure(figsize=(15, 5))
# pass the column as `x=`; the first positional argument of countplot is `data`
sns.countplot(x=df['run_time'])
plt.xticks(rotation=90)
plt.show()

In [None]:
### Movies with highest Run Time

# Remove the "min" suffix and cast the column to int.
# Going through `astype(str)` first keeps the cell re-runnable: on a second
# execution the column is already integer and would have no `.str` accessor.
df['run_time'] = (
    df['run_time']
    .astype(str)
    .str.replace("min", "", regex=False)
    .str.strip()
    .astype(int)
)
print(df.dtypes)

In [None]:
# Ten longest movies: keep title and runtime only, longest first.
higest_runtime_movie = (
    df[['movie_name', 'run_time']]
    .sort_values(by='run_time', ascending=False)
    .head(10)
)
higest_runtime_movie

In [None]:
# Titles and runtimes of the ten longest movies.
name = higest_runtime_movie['movie_name']
value = higest_runtime_movie['run_time']

plt.figure(figsize=(12, 5))
# x/y by keyword: seaborn >= 0.12 rejects positional data vectors
sns.barplot(x=name, y=value, palette='coolwarm')
plt.title('Top 10 movies with highest runtime ', fontdict={'family': 'times new roman', 'size': 20, 'color': 'blue'})
plt.xlabel('Movie Name', fontdict={'family': 'times new roman', 'size': 20, 'color': 'magenta'})
plt.xticks(rotation=80)
plt.ylabel('Runtime(min)', fontdict={'family': 'times new roman', 'size': 20, 'color': 'magenta'})
plt.show()

In [None]:
# Line plot of `value` against `name`.
# x/y are passed by keyword (required by seaborn >= 0.12). `palette` is
# dropped because seaborn ignores it when no `hue` variable is assigned.
sns.lineplot(x=name, y=value)
plt.xticks(rotation=90)
plt.show()

### Genre

In [None]:
# Ten most frequent genre strings and their counts.
genre = df['genre'].value_counts().head(10)
name, count = genre.index, genre.values
genre

In [None]:
# Donut chart of the ten most frequent genres, with each slice's share in percent.
plt.figure(figsize=(7, 7))
# `rotatelabels` is an on/off switch: any truthy value rotates each label to its slice angle
plt.pie(count, labels=name, rotatelabels=True, autopct='%1.2f')
# a white disc drawn over the centre turns the pie into a donut
hole = plt.Circle((0, 0), 0.5, color='white')
plt.gcf().gca().add_artist(hole)
plt.title(
    "Top 10 Genre",
    loc='right',
    fontdict={'family': 'times new roman', 'size': 20, 'color': 'blue'},
)
plt.show()

In [None]:
# Genres of the 15 highest rated movies (one row per movie, best rating first).
genre_rating = (
    df[['genre', 'imdb_rating']]
    .sort_values(by='imdb_rating', ascending=False)
    .head(15)
)
genre_rating

> From the data above, we can see that most of the highly rated movies fall into 'Drama' or 'Crime', followed by action or adventure.

In [None]:
# Genre and rating of each of the 15 highest rated movies.
name = genre_rating['genre']
value = genre_rating['imdb_rating']

plt.figure(figsize=(12, 5))
# x/y by keyword: seaborn >= 0.12 rejects positional data vectors
sns.barplot(x=name, y=value, palette='coolwarm')
plt.title('Top Genres by rating', fontdict={'family': 'times new roman', 'size': 20, 'color': 'blue'})
plt.xlabel('Genre', fontdict={'family': 'times new roman', 'size': 20, 'color': 'magenta'})
plt.xticks(rotation=80)
plt.ylabel('Ratings', fontdict={'family': 'times new roman', 'size': 20, 'color': 'magenta'})
plt.show()

In [None]:
# Scatter plot of `value` against `name`.
plt.figure(figsize=(12, 5))
# x/y by keyword (required by seaborn >= 0.12); `palette` is dropped because
# seaborn ignores it when no `hue` variable is assigned.
sns.scatterplot(x=name, y=value)
plt.xticks(rotation=80)
plt.show()

In [None]:
# Ten highest rated movies: title and IMDb rating, best first.
top_movie_ratings = (
    df.loc[:, ['movie_name', 'imdb_rating']]
    .sort_values(by='imdb_rating', ascending=False)
    .iloc[:10]
)
top_movie_ratings

In [None]:
# Titles and ratings of the ten highest rated movies.
name = top_movie_ratings['movie_name']
value = top_movie_ratings['imdb_rating']

plt.figure(figsize=(12, 5))
# x/y by keyword: seaborn >= 0.12 rejects positional data vectors
sns.barplot(x=name, y=value, palette='viridis')
plt.title('Top movies by ratings', fontdict={'family': 'times new roman', 'size': 20, 'color': 'blue'})
plt.xlabel('Movie Name', fontdict={'family': 'times new roman', 'size': 20, 'color': 'magenta'})
plt.xticks(rotation=80)
plt.ylabel('Ratings', fontdict={'family': 'times new roman', 'size': 20, 'color': 'magenta'})
plt.show()

**The highest rated movie is 'The Shawshank Redemption', followed by 'The Godfather', etc.**

In [None]:
# a scatter plot gives a better idea of the rating differences

plt.figure(figsize=(12, 5))
# x/y by keyword (required by seaborn >= 0.12); `palette` is dropped because
# seaborn ignores it when no `hue` variable is assigned.
sns.scatterplot(x=name, y=value)
plt.title('Top movies by ratings', fontdict={'family': 'times new roman', 'size': 20, 'color': 'blue'})
plt.xlabel('Movie Name', fontdict={'family': 'times new roman', 'size': 20, 'color': 'magenta'})
plt.xticks(rotation=80)
plt.ylabel('Ratings', fontdict={'family': 'times new roman', 'size': 20, 'color': 'magenta'})
plt.show()

In [None]:
# line plot of movies vs ratings

plt.figure(figsize=(12, 5))
# x/y by keyword (required by seaborn >= 0.12); `palette` is dropped because
# seaborn ignores it when no `hue` variable is assigned.
sns.lineplot(x=name, y=value)
plt.title('Top movies by ratings', fontdict={'family': 'times new roman', 'size': 20, 'color': 'blue'})
plt.xlabel('Movie Name', fontdict={'family': 'times new roman', 'size': 20, 'color': 'magenta'})
plt.xticks(rotation=80)
plt.ylabel('Ratings', fontdict={'family': 'times new roman', 'size': 20, 'color': 'magenta'})
plt.show()

### Votes

In [None]:
# Separate working frame holding only the three columns needed for the vote analysis.
# `.copy()` makes it independent of `df`, so assigning to its columns later
# operates on a real frame rather than on a slice (no SettingWithCopyWarning).
votes = df[['movie_name', 'category', 'votes']].copy()
votes.dtypes
# before sorting we need to convert column "votes" to a numeric type

> However, we cannot change the data type of the column yet, because the numbers contain commas; first we need to remove these commas.

In [None]:
# Remove the thousands separators so the strings can be parsed as integers,
# then show the first five cleaned values.
votes['votes'] = votes['votes'].str.replace(',', '')
votes['votes'].iloc[:5]

In [None]:
# Spot-check ten randomly chosen values; each run of this cell draws a different sample.
votes['votes'].sample(n=10)

In [None]:
# converting data type
try:
    votes = votes.astype({'votes': int})
except ValueError as err:
    # Only a failed string-to-int parse is expected here. Catch that specific
    # error and report what pandas actually rejected, instead of hiding every
    # possible exception behind a bare `except`.
    print(f'ValueError: could not convert ({err})')

> Converting the "votes" column from string to a numeric type fails while the values still contain commas, which is why the commas had to be removed from the data first.

In [None]:
# Show the column dtypes to confirm whether "votes" is now an integer column.
votes.dtypes

Since our column has been converted to integer type, we can now sort the values.

In [None]:
# Fifteen movies with the most votes, highest first.
top_voted = votes.sort_values('votes', ascending=False).iloc[:15]
top_voted

In [None]:
# Horizontal bar chart: vote count for each of the top-voted movies.
label_font = {'family': 'times new roman', 'size': 20, 'color': 'magenta'}

plt.figure(figsize=(10, 6))
sns.barplot(data=top_voted, x='votes', y='movie_name')
plt.title(
    'Movies vs Voting',
    loc='left',
    fontdict={'family': 'times new roman', 'size': 20, 'color': 'blue'},
)
plt.xlabel('No of Votes(10^6)', fontdict=label_font)
plt.ylabel('Movie Name', fontdict=label_font)
plt.show()

### Gross Total

In [None]:
# Movies with highest gross total

# `.copy()` gives an independent frame, so the column assignment below does
# not write into a slice of `df` (no SettingWithCopyWarning).
gross = df[['movie_name', 'gross_total']].copy()
# first we need to convert column 'gross_total' to a numeric type:
# strip any '$' and 'M' characters from both ends of each value
gross['gross_total'] = gross['gross_total'].str.strip('$M')

In [None]:
# Cast the cleaned strings to floating point, then display the frame.
gross['gross_total'] = gross['gross_total'].astype('float64')
gross

In [None]:
# Fifteen movies with the largest gross total, highest first.
top_movies_by_gross = gross.sort_values('gross_total', ascending=False).iloc[:15]
name = top_movies_by_gross['movie_name']
value = top_movies_by_gross['gross_total']

# Horizontal bar chart of those fifteen movies.
label_font = {'family': 'times new roman', 'size': 20, 'color': 'magenta'}

plt.figure(figsize=(12, 7))
sns.barplot(data=top_movies_by_gross, x='gross_total', y='movie_name')
plt.title('Highest Earning Movies', fontdict={'family': 'times new roman', 'size': 20, 'color': 'blue'})
plt.xlabel('Gross total in Million', fontdict=label_font)
plt.ylabel('Movie Name', fontdict=label_font)
plt.show()

# This is the end of the notebook. Thank you!