In [19]:
#all necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly 
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import iplot
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

In [20]:
#read in data set
df = pd.read_csv("movies.csv", encoding='latin-1')

#print out first 10 rows
df.head()

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0


In [21]:
#shape of the data
df.shape

#Column Counts
df.columns.value_counts()



name        1
rating      1
genre       1
year        1
released    1
score       1
votes       1
director    1
writer      1
star        1
country     1
budget      1
gross       1
company     1
runtime     1
dtype: int64

In [22]:
print("The number of genres:")
df.genre.nunique() 

print("All the different genres:")
df.genre.unique()

The number of genres:
All the different genres:


array(['Drama', 'Adventure', 'Action', 'Comedy', 'Horror', 'Biography',
       'Crime', 'Fantasy', 'Family', 'Sci-Fi', 'Animation', 'Romance',
       'Music', 'Western', 'Thriller', 'History', 'Mystery', 'Sport',
       'Musical'], dtype=object)

In [23]:
print("Our top 3 genres are Comedy, Action, then Drama")
df.genre.value_counts(normalize=True)*100

Our top 3 genres are Comedy, Action, then Drama


Comedy       29.277517
Action       22.235263
Drama        19.796557
Crime         7.185707
Biography     5.777256
Adventure     5.568597
Animation     4.407929
Horror        4.199270
Fantasy       0.573813
Mystery       0.260824
Thriller      0.208659
Family        0.143453
Sci-Fi        0.130412
Romance       0.130412
Western       0.039124
Musical       0.026082
Music         0.013041
History       0.013041
Sport         0.013041
Name: genre, dtype: float64

In [24]:
fig = px.bar(data_frame=df.genre, x=df.genre.values, y=df.genre.index, labels={"x":"Genres","index":"Count"})
#fig.update_layout(xaxis={"categoryorder":"total descending"})

fig.show()

In [25]:
print("Our dataset has movies from 1980-2020")
df.year.unique()

df.year.value_counts()


df.year.nunique() 

Our dataset has movies from 1980-2020


41

In [26]:
print("The highest rated movie is a 9.3, while the lowest rated movie is a 1.")
df["score"].describe()

The highest rated movie is a 9.3, while the lowest rated movie is a 1.


count    7665.000000
mean        6.390411
std         0.968842
min         1.900000
25%         5.800000
50%         6.500000
75%         7.100000
max         9.300000
Name: score, dtype: float64

In [27]:
print("The distribution is negatively/left skewed")
fig = px.histogram(data_frame=df, x=df["score"], title="Movie Scores")
fig.show()

The distribution is negatively/left skewed


In [28]:
fig = px.box(data_frame=df, x=df["score"], 
hover_data=df[["name", "genre"]])
fig.update_traces(overwrite=False)


In [29]:
print("Highest rated movie")
df[df["score"] == df["score"].max()][["name", "genre","year"]]

Highest rated movie


Unnamed: 0,name,genre,year
2443,The Shawshank Redemption,Drama,1994


In [30]:
print("Lowest rated movie")
df[df["score"] == df["score"].min()][["name", "genre","year"]]

Lowest rated movie


Unnamed: 0,name,genre,year
4594,Superbabies: Baby Geniuses 2,Comedy,2004
5306,Disaster Movie,Comedy,2008
5354,The Hottie & the Nottie,Comedy,2008


In [31]:
print("Analysis based on runtime")
print("Runtime is  the time between the starting of the movie upto the end of the credits scene")
df.runtime.describe()

Analysis based on runtime
Runtime is  the time between the starting of the movie upto the end of the credits scene


count    7664.000000
mean      107.261613
std        18.581247
min        55.000000
25%        95.000000
50%       104.000000
75%       116.000000
max       366.000000
Name: runtime, dtype: float64

In [32]:
fig = px.histogram(data_frame=df, x="runtime", title="Runtime of Programs")
fig.show()

In [33]:
print("Movie with the highest runtime of 366.0")
df[df["runtime"] == df["runtime"].max()][["name","runtime", "genre","year"]]

Movie with the highest runtime of 366.0


Unnamed: 0,name,runtime,genre,year
4396,The Best of Youth,366.0,Drama,2003


In [34]:
print("Movie with the lowest runtime of 55.0")
df[df["runtime"] == df["runtime"].min()][["name","runtime", "genre","year"]]

Movie with the lowest runtime of 55.0


Unnamed: 0,name,runtime,genre,year
474,The Business of Show Business,55.0,History,1983
