# IMDB EDA

In [46]:
# Core data handling and plotting libraries used by this notebook.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
# Switch plotly's offline mode on for notebook rendering.
py.offline.init_notebook_mode(connected=True)
from plotly.offline import iplot
# cufflinks is presumably what provides the `.iplot()` method called on pandas
# objects further down -- TODO confirm against the cufflinks documentation.
import cufflinks as cf
cf.go_offline(connected=True)
import warnings
# NOTE(review): this silences *every* warning, including pandas warnings about
# chained assignment / in-place operations, which can hide real problems.
warnings.filterwarnings("ignore")

## Loading the data

In [48]:
# Read the raw CSV export into a DataFrame and preview the first ten rows.
data = pd.read_csv(filepath_or_buffer="IMBD.csv")
data.head(n=10)

Unnamed: 0,movie,genre,runtime,certificate,rating,stars,description,votes,director
0,The Witcher,"Action, Adventure, Drama",60 min,A,8.1,"['Henry Cavill, ', 'Freya Allan, ', 'Anya Chal...","Geralt of Rivia, a solitary monster hunter, st...",539085.0,
1,Mission: Impossible - Dead Reckoning Part One,"Action, Adventure, Thriller",163 min,UA,8.0,"['Tom Cruise, ', 'Hayley Atwell, ', 'Ving Rham...",Ethan Hunt and his IMF team must track down a ...,106759.0,['Christopher McQuarrie']
2,Sound of Freedom,"Action, Biography, Drama",131 min,PG-13,7.9,"['Jim Caviezel, ', 'Mira Sorvino, ', 'Bill Cam...",The incredible true story of a former governme...,41808.0,['Alejandro Monteverde']
3,Secret Invasion,"Action, Adventure, Drama",47 min,UA 16+,6.2,"['Samuel L. Jackson, ', 'Emilia Clarke, ', 'Do...",Fury and Talos try to stop the Skrulls who hav...,40536.0,
4,Special Ops: Lioness,"Action, Drama, Thriller",,,7.5,"['Zoe Saldana, ', 'Laysla De Oliveira, ', 'Dav...",Joe attempts to balance her personal and profe...,3203.0,
5,They Cloned Tyrone,"Action, Comedy, Mystery",122 min,R,6.7,"['John Boyega, ', 'Jamie Foxx, ', 'Teyonah Par...",A series of eerie events thrusts an unlikely t...,14271.0,['Juel Taylor']
6,Star Trek: Strange New Worlds,"Action, Adventure, Sci-Fi",52 min,Not Rated,8.3,"['Anson Mount, ', 'Ethan Peck, ', 'Christina C...","A prequel to Star Trek: The Original Series, t...",45723.0,
7,One Piece,"Action, Adventure, Comedy",,,,"['Iñaki Godoy, ', 'Mackenyu, ', 'Emily Rudd, '...","In a seafaring world, a young pirate captain s...",,
8,Twisted Metal,"Action, Adventure, Comedy",30 min,,7.5,"['Anthony Mackie, ', 'Tahj Vaughans, ', 'Steph...",Follows a motor-mouthed outsider offered a cha...,4334.0,
9,The Flash,"Action, Adventure, Fantasy",144 min,UA,6.9,"['Ezra Miller, ', 'Michael Keaton, ', 'Sasha C...",Barry Allen uses his super speed to change the...,126445.0,['Andy Muschietti']


## Preprocessing

In [49]:
# Summarise column dtypes and non-null counts; memory_usage="deep" asks pandas
# for the deep memory figure rather than the default estimate.
data.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129891 entries, 0 to 129890
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   movie        129891 non-null  object 
 1   genre        129891 non-null  object 
 2   runtime      109005 non-null  object 
 3   certificate  23850 non-null   object 
 4   rating       114381 non-null  float64
 5   stars        124676 non-null  object 
 6   description  129891 non-null  object 
 7   votes        114393 non-null  object 
 8   director     88611 non-null   object 
dtypes: float64(1), object(8)
memory usage: 90.8 MB


In [50]:
# Number of missing values in each column.
data.isnull().sum()

movie               0
genre               0
runtime         20886
certificate    106041
rating          15510
stars            5215
description         0
votes           15498
director        41280
dtype: int64

In [51]:
# Missing runtimes become the placeholder "0 min" so the column holds only strings.
data["runtime"] = data["runtime"].fillna(value="0 min")

In [52]:
# Show the runtime entries that are not purely digits (e.g. "60 min").
data.loc[~data["runtime"].str.isnumeric(), ["runtime"]]

Unnamed: 0,runtime
0,60 min
1,163 min
2,131 min
3,47 min
4,0 min
...,...
129886,10 min
129887,73 min
129888,60 min
129889,0 min


The dataset appears to include TV series as well as movies, so `runtime` mixes per-episode lengths with feature lengths and is not very useful for comparison.

## EDA
### Movies with the Highest Votes and Ratings

In [53]:
# Replace missing vote counts with the string "0" (the column is still text at
# this point). The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on `data["votes"]`: an in-place call on a column
# selection may operate on a temporary object, and under pandas Copy-on-Write
# it never updates `data` at all.
data["votes"] = data["votes"].fillna("0")


In [54]:
# Drop thousands separators from the vote strings, then display the column.
data["votes"] = data["votes"].str.replace(pat=",", repl="")
data["votes"]

0         539085
1         106759
2          41808
3          40536
4           3203
           ...  
129886         0
129887         0
129888         0
129889         0
129890         0
Name: votes, Length: 129891, dtype: object

In [55]:
# Trim leading/trailing whitespace from every vote string.
data["votes"] = data["votes"].str.strip(to_strip=None)

In [56]:
# Overwrite every vote entry that is not purely digits with "0" so the column
# can be converted to a number afterwards.
#
# The write goes through a single `.loc[row_mask, column]` call. The previous
# form, `data.loc[:, "votes"][mask] = "0"`, was chained assignment: it first
# extracts a Series and then assigns into that Series, which is not guaranteed
# to reach `data` and is a no-op under pandas Copy-on-Write.
#
# `.eq(False)` mirrors the original `== False` comparison: entries for which
# `isnumeric()` yields a missing value are left untouched rather than selected.
non_numeric_votes = data["votes"].str.isnumeric().eq(False)
data.loc[non_numeric_votes, "votes"] = "0"


In [57]:
# Convert the cleaned vote strings to numbers: via float first, then to the
# nullable 64-bit integer dtype.
data["votes"] = data["votes"].astype(float).astype(pd.Int64Dtype())

In [58]:
# Order titles by vote count (highest first) and keep only the first row seen
# for each title, renumbering the index after each step.
temp = (
    data
    .sort_values(by="votes", ascending=False)
    .reset_index(drop=True)
    .drop_duplicates(subset="movie", ignore_index=True)
)
temp

Unnamed: 0,movie,genre,runtime,certificate,rating,stars,description,votes,director
0,Game of Thrones,"Action, Adventure, Drama",57 min,A,9.2,"['Emilia Clarke, ', 'Peter Dinklage, ', 'Kit H...",Nine noble families fight for control over the...,2186980,
1,Breaking Bad,"Crime, Drama, Thriller",49 min,18,9.5,"['Bryan Cranston, ', 'Aaron Paul, ', 'Anna Gun...",A chemistry teacher diagnosed with inoperable ...,2012179,
2,Stranger Things,"Drama, Fantasy, Horror",51 min,15,8.7,"['Millie Bobby Brown, ', 'Finn Wolfhard, ', 'W...","When a young boy disappears, his mother, a pol...",1261882,
3,The Walking Dead,"Drama, Horror, Thriller",44 min,18+,8.1,"['Andrew Lincoln, ', 'Norman Reedus, ', 'Melis...",Sheriff Deputy Rick Grimes wakes up from a com...,1038338,
4,Friends,"Comedy, Romance",22 min,13+,8.9,"['Jennifer Aniston, ', 'Courteney Cox, ', 'Lis...",Follows the personal and professional lives of...,1037385,
...,...,...,...,...,...,...,...,...,...
115983,Covers,"Drama, Musical",108 min,,,"['Tristan Mack Wilds, ', 'Malcolm M. Mays, ', ...",Dark Past. Bright Future. Sound Choices.,0,['Malcolm M. Mays']
115984,Ode to Harold,"Short, Musical",0 min,,,"['Mark Casimir Dyniewicz Jr.', '']",A series of reenacted musical numbers inspired...,0,
115985,Blondie of the Follies,"Comedy, Musical",91 min,,6.4,"['Marion Davies, ', 'Robert Montgomery, ', 'Bi...",Two young women find their friendship strained...,0,['Edmund Goulding']
115986,Sawan Mein Lag Gayi Aag,"Short, Musical",0 min,,,"['Gulshan Grover, ', 'Smriti Malhotra-Irani, '...",Add a Plot\n,0,


In [59]:
# Keep just the title, vote count and rating for the 15 most-voted titles.
temp = temp[["movie", "votes", "rating"]].head(15)
temp

Unnamed: 0,movie,votes,rating
0,Game of Thrones,2186980,9.2
1,Breaking Bad,2012179,9.5
2,Stranger Things,1261882,8.7
3,The Walking Dead,1038338,8.1
4,Friends,1037385,8.9
5,Sherlock,959882,9.1
6,The Big Bang Theory,835833,8.2
7,Chernobyl,810896,9.4
8,Dexter,743373,8.7
9,How I Met Your Mother,706458,8.3


In [60]:
# Bar chart of vote counts for the most-voted titles, one bar per title.
temp.set_index("movie")["votes"].iplot(
    kind="bar",
    title="Most Votes TV Series",
    xTitle="TV Series",
    yTitle="Votes",
)

ValueError: 
Image export using the "kaleido" engine requires the kaleido package,
which can be installed using pip:
    $ pip install -U kaleido


In [None]:
# Bar chart of ratings for the same most-voted titles, highest rating first.
ratings_by_title = temp.set_index("movie")["rating"].sort_values(ascending=False)
ratings_by_title.iplot(
    kind="bar",
    title="Ratings of Most Voted TV Series",
    xTitle="Tv Series",
    yTitle="Rating",
)

### Movies by certificate

In [None]:
# Bar chart of the 15 most frequent certificate values.
# NOTE(review): asImage=True presumably requests a static image, which would
# need an image-export backend to be installed -- confirm before relying on it.
data["certificate"].value_counts().head(15).iplot(
    kind="bar",
    title="Count of Certification",
    xTitle="Certificate",
    yTitle="Count",
    asImage=True,
)