# Movies Dataset Analysis

In [1]:
# Importing Libraries

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib
import matplotlib.pyplot as plt

plt.style.use('ggplot')

from matplotlib.pyplot import figure

%matplotlib inline
matplotlib.rcParams['figure.figsize'] = [12, 8] # sets the size of the figure - [width, height]

In [3]:
# Reading the data into dataframe

In [4]:
movies = pd.read_csv("./movies.csv");

In [5]:
movies.head()

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0


In [6]:
type(movies)

pandas.core.frame.DataFrame

In [7]:
movies.size

115020

In [8]:
movies.shape

(7668, 15)

In [9]:
movies.shape[0]

7668

In [10]:
len(movies)

7668

In [11]:
movies.loc[0, :]

name                          The Shining
rating                                  R
genre                               Drama
year                                 1980
released    June 13, 1980 (United States)
score                                 8.4
votes                            927000.0
director                  Stanley Kubrick
writer                       Stephen King
star                       Jack Nicholson
country                    United Kingdom
budget                         19000000.0
gross                          46998772.0
company                      Warner Bros.
runtime                             146.0
Name: 0, dtype: object

In [12]:
# Filling in missing values (NaN)

In [13]:
movies['budget']

0       19000000.0
1        4500000.0
2       18000000.0
3        3500000.0
4        6000000.0
           ...    
7663        7000.0
7664           NaN
7665       58750.0
7666       15000.0
7667           NaN
Name: budget, Length: 7668, dtype: float64

In [14]:
movies['budget'].isnull()[0:49]

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16     True
17    False
18    False
19     True
20    False
21    False
22    False
23    False
24     True
25     True
26     True
27    False
28    False
29    False
30     True
31     True
32     True
33    False
34     True
35    False
36    False
37    False
38     True
39     True
40     True
41     True
42    False
43     True
44     True
45     True
46    False
47    False
48    False
Name: budget, dtype: bool

In [15]:
movies.loc[16, 'budget']

nan

In [16]:
np.mean(movies['budget'].isnull())

0.2831246739697444

In [17]:
pd.isna(movies['name'])

0       False
1       False
2       False
3       False
4       False
        ...  
7663    False
7664    False
7665    False
7666    False
7667    False
Name: name, Length: 7668, dtype: bool

In [18]:
np.mean(movies['name'].isnull())

0.0

In [19]:
# Finding % of missing values in all columns

In [20]:
for col in movies.columns:
    missingPercent = np.mean(movies[col].isnull())
    print('{} - {}%'.format(col, missingPercent))

name - 0.0%
rating - 0.010041731872717789%
genre - 0.0%
year - 0.0%
released - 0.0002608242044861763%
score - 0.0003912363067292645%
votes - 0.0003912363067292645%
director - 0.0%
writer - 0.0003912363067292645%
star - 0.00013041210224308815%
country - 0.0003912363067292645%
budget - 0.2831246739697444%
gross - 0.02464788732394366%
company - 0.002217005738132499%
runtime - 0.0005216484089723526%


In [21]:
# Checking data types of movie attributes

In [22]:
movies.dtypes

name         object
rating       object
genre        object
year          int64
released     object
score       float64
votes       float64
director     object
writer       object
star         object
country      object
budget      float64
gross       float64
company      object
runtime     float64
dtype: object

In [23]:
# checking for empty(missing or null) values in 'budget' column

In [24]:
movies['budget'] == np.nan

0       False
1       False
2       False
3       False
4       False
        ...  
7663    False
7664    False
7665    False
7666    False
7667    False
Name: budget, Length: 7668, dtype: bool

In [25]:
# filling missing values in 'budget' column with 0

In [26]:
movies['budget'].fillna(0, inplace = True)

In [27]:
movies['rating']

0         R
1         R
2        PG
3        PG
4         R
       ... 
7663    NaN
7664    NaN
7665    NaN
7666    NaN
7667    NaN
Name: rating, Length: 7668, dtype: object

In [28]:
# unique values and their count in 'rating' column

In [29]:
movies['rating'].value_counts()

rating
R            3697
PG-13        2112
PG           1252
Not Rated     283
G             153
Unrated        52
NC-17          23
TV-MA           9
TV-PG           5
X               3
Approved        1
TV-14           1
Name: count, dtype: int64

In [30]:
# filling missing values in 'rating' column with 'Unrated'

In [31]:
movies['rating'].fillna('Unrated', inplace = True)

In [32]:
# unique values and their count in 'released' column

In [33]:
movies['released'].value_counts()

released
February 14, 1986 (United States)    9
May 17, 2019 (United States)         9
October 4, 1991 (United States)      9
August 26, 2016 (United States)      8
October 11, 2002 (United States)     8
                                    ..
March 9, 2000 (Australia)            1
July 20, 2001 (United Kingdom)       1
October 11, 2000 (France)            1
May 10, 2001 (Australia)             1
August 19, 2020 (United States)      1
Name: count, Length: 3414, dtype: int64

In [34]:
# filling missing values in 'released' column with 'February 14, 1986 (United States)'

In [35]:
movies['released'].fillna('February 14, 1986 (United States)', inplace = True)

In [36]:
# unique values and their count in 'score' column

In [37]:
movies['score'].value_counts()

score
6.6    359
6.4    351
6.2    346
6.7    332
6.5    331
      ... 
2.8      2
9.3      1
2.6      1
2.3      1
9.0      1
Name: count, Length: 72, dtype: int64

In [38]:
# filling missing values in 'score' column with 6.6

In [39]:
movies['score'].fillna(6.6, inplace = True)

In [40]:
# unique values and their count in 'votes' column

In [41]:
movies['votes'].value_counts()

votes
13000.0     117
11000.0     116
14000.0     105
19000.0     102
15000.0     102
           ... 
270000.0      1
810000.0      1
639.0         1
390.0         1
7.0           1
Name: count, Length: 936, dtype: int64

In [42]:
# filling missing values in 'votes' column with 13000.0

In [43]:
movies['votes'].fillna(13000.0, inplace = True)

In [44]:
# unique values and their count in 'writer' column

In [45]:
movies['writer'].value_counts()

writer
Woody Allen            37
Stephen King           31
Luc Besson             26
John Hughes            25
William Shakespeare    15
                       ..
Daniel G. Sullivan      1
Bonnie Turner           1
Dick King-Smith         1
Helen Prejean           1
Pereko Mosia            1
Name: count, Length: 4535, dtype: int64

In [46]:
# filling missing values in 'writer' column with 'Woody Allen'

In [47]:
movies['writer'].fillna('Woody Allen', inplace = True)

In [48]:
# Finding faster way to populate missing values in all fields

In [49]:
type(movies['writer'].value_counts())

pandas.core.series.Series

In [50]:
np.max(movies['writer'].value_counts())

40

In [51]:
movies['writer'].value_counts().index[0]

'Woody Allen'

In [52]:
movies['writer'].value_counts().values[0]

40

In [53]:
# iterating over all columns, if a column contains nan(s), they are filled with the most repeating value in that column

In [54]:
for col in movies.columns:
    if(np.mean(movies[col].isnull()) > 0):
        movies[col].fillna(movies[col].value_counts().index[0], inplace = True)

In [55]:
# rechecking for nan(s) in movies df

In [56]:
movies.isnull()

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7663,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
7664,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
7665,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
7666,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [57]:
# count of nan(s) per column 

In [58]:
movies.isna().sum()

name        0
rating      0
genre       0
year        0
released    0
score       0
votes       0
director    0
writer      0
star        0
country     0
budget      0
gross       0
company     0
runtime     0
dtype: int64

In [59]:
np.mean(movies.isnull())

0.0

In [60]:
movies.head(5)

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0


In [61]:
# splitting 'released' column into 'year' and 'country' columns

In [62]:
type(movies['released'])

pandas.core.series.Series

In [63]:
movies.loc[0, 'released']

'June 13, 1980 (United States)'

In [64]:
type(movies.loc[0, 'released'])

str

In [65]:
# Partially working approach

In [66]:
# movies.loc[0, 'released'].find('(')

In [67]:
# movies.loc[0, 'released'].find(')')

In [68]:
# movies.iloc[0, 4]

In [69]:
# movies.iloc[0, 4][(movies.loc[0, 'released'].find('(') + 1):(movies.loc[0, 'released'].find(')'))]

In [70]:
# movies['Country'] = None

In [71]:
# movies.head()

In [72]:
# movies['Country'] = movies['released'].str.split(pat = '(').str[1].str.split(pat = ')').str[0]

In [73]:
# movies['Country']

In [74]:
# len(movies.loc[0, 'Country'])

In [75]:
# movies.head()

In [76]:
# movies['date'] = None

In [77]:
# movies['date'] = movies['released'].str.split(pat = '(').str[0]

In [78]:
# movies.head()

In [79]:
# type(movies.loc[0, 'date'])

In [80]:
# movies['date'] = movies['date'].str[:-1]

In [81]:
# movies.loc[0, 'released']

In [82]:
# importing re module for regex functions

In [83]:
import re

In [84]:
# finding 4 consecutive digits in 'released' column's every cell to be used as a 'Year' column's cells

In [85]:
movies['Year'] = movies['released'].apply(lambda x: re.findall('\d{4}', x)).str[0]

In [86]:
# typecasting 'Year' column's values from 'str' to 'int'

In [87]:
movies['Year'] = movies['Year'].astype('int64')

In [88]:
np.sort(movies.Year)

array([1980, 1980, 1980, ..., 2020, 2020, 2020], dtype=int64)

In [89]:
movies['Year'].value_counts()

Year
2019    228
2015    213
1986    213
2011    212
2007    212
2003    207
1994    206
1998    205
2018    205
2001    205
2008    204
1991    204
1995    203
2017    203
2000    200
2013    200
2005    200
1993    200
1989    200
1988    199
1996    199
1997    198
2016    198
1990    197
2006    196
1999    196
1987    196
2014    196
2010    193
2002    193
2009    192
2012    192
2004    191
1992    184
1985    183
1984    157
1983    145
1982    128
1981    103
1980     80
2020     32
Name: count, dtype: int64

In [90]:
# movies[movies['Year'].isin(['(United', 'States)', '(Australia)'])]

In [91]:
movies.head()

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime,Year
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0,1980
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0,1980
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0,1980
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0,1980
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0,1980


In [92]:
movies[movies.year != movies.Year]

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime,Year
8,Superman II,PG,Action,1980,"June 19, 1981 (United States)",6.8,101000.0,Richard Lester,Jerry Siegel,Gene Hackman,United States,54000000.0,108185706.0,Dovemead Films,127.0,1981
11,The Gods Must Be Crazy,PG,Adventure,1980,"October 26, 1984 (United States)",7.3,54000.0,Jamie Uys,Jamie Uys,N!xau,South Africa,5000000.0,30031783.0,C.A.T. Films,109.0,1984
21,Heaven's Gate,R,Adventure,1980,"April 24, 1981 (United States)",6.8,14000.0,Michael Cimino,Michael Cimino,Kris Kristofferson,United States,44000000.0,3484523.0,Partisan Productions,219.0,1981
33,Cattle Annie and Little Britches,PG,Drama,1980,"April 24, 1981 (United States)",6.1,604.0,Lamont Johnson,David Eyre,Scott Glenn,United States,5100000.0,534816.0,Cattle Annie Productions,97.0,1981
40,The Watcher in the Woods,PG,Family,1980,"October 9, 1981 (United States)",6.3,5700.0,John Hough,Brian Clemens,Bette Davis,United States,0.0,5000000.0,Walt Disney Productions,84.0,1981
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7525,Weathering with You,PG-13,Animation,2019,"January 17, 2020 (United States)",7.5,28000.0,Makoto Shinkai,Makoto Shinkai,Kotaro Daigo,Japan,11100000.0,193457467.0,"""Weathering With You"" Film Partners",112.0,2020
7580,Run with the Hunted,Not Rated,Crime,2019,"June 26, 2020 (United States)",5.2,735.0,John Swab,John Swab,Ron Perlman,United States,0.0,682.0,Roxwell Films,93.0,2020
7584,"Faith, Hope & Love",PG,Comedy,2019,"February 4, 2020 (Australia)",6.2,719.0,J.J. Englert,Robert Krantz,Peta Murgatroyd,United States,0.0,210091.0,Ellinas Multimedia,106.0,2020
7604,Mine 9,Not Rated,Drama,2019,"May 19, 2020 (Poland)",6.4,4400.0,Eddie Mensore,Eddie Mensore,Terry Serpico,United States,350000.0,226421.0,Emphatic Films,83.0,2020


In [93]:
movies = movies.drop('year', axis = 1)

In [94]:
movies.rename(columns = {'Year': 'year'}, inplace = True)

In [95]:
# fetching 'Country' from 'released'

In [96]:
movies['Country'] = movies['released'].apply(lambda x: re.findall('[(].*[)]', x)).str[0].str[1:-1]

In [97]:
movies['Country'][0]

'United States'

In [98]:
movies[movies.country != movies.Country]

Unnamed: 0,name,rating,genre,released,score,votes,director,writer,star,country,budget,gross,company,runtime,year,Country
0,The Shining,R,Drama,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0,1980,United States
11,The Gods Must Be Crazy,PG,Adventure,"October 26, 1984 (United States)",7.3,54000.0,Jamie Uys,Jamie Uys,N!xau,South Africa,5000000.0,30031783.0,C.A.T. Films,109.0,1984,United States
20,Cruising,R,Crime,"February 15, 1980 (United States)",6.5,20000.0,William Friedkin,William Friedkin,Al Pacino,West Germany,11000000.0,19814523.0,Lorimar Film Entertainment,102.0,1980,United States
27,Raise the Titanic,PG,Action,"August 1, 1980 (United States)",5.0,4100.0,Jerry Jameson,Adam Kennedy,Jason Robards,United Kingdom,36000000.0,14000000.0,ITC Films,115.0,1980,United States
31,Prom Night,R,Horror,"July 18, 1980 (United States)",5.4,16000.0,Paul Lynch,William Gray,Leslie Nielsen,Canada,0.0,14796236.0,Guardian Trust Company,92.0,1980,United States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7653,The Quarry,R,Crime,"April 17, 2020 (Mexico)",5.4,2400.0,Scott Teems,Scott Teems,Shea Whigham,United States,0.0,3661.0,Prowess Pictures,98.0,2020,Mexico
7655,Legend of Deification,TV-PG,Animation,"October 1, 2020 (United States)",6.6,1300.0,Teng Cheng,Woody Allen,Guangtao Jiang,China,0.0,240663149.0,Beijing Enlight Pictures,110.0,2020,United States
7657,Leap,Unrated,Drama,"September 25, 2020 (United States)",6.7,903.0,Peter Ho-Sun Chan,Ji Zhang,Gong Li,China,0.0,25818882.0,Universal Pictures,135.0,2020,United States
7665,Saving Mbango,Unrated,Drama,"April 27, 2020 (Cameroon)",5.7,29.0,Nkanya Nkwai,Lynno Lovert,Onyama Laura,United States,58750.0,14000000.0,Embi Productions,97.0,2020,Cameroon


In [99]:
movies = movies.drop('country', axis = 1)

In [100]:
movies['Country'].value_counts()

Country
United States     6737
United Kingdom     197
France             148
Australia           48
Germany             46
                  ... 
Austria              1
Soviet Union         1
Uruguay              1
Yugoslavia           1
Cameroon             1
Name: count, Length: 61, dtype: int64

In [101]:
movies.rename(columns = {'Country': 'country'}, inplace = True)

In [102]:
movies

Unnamed: 0,name,rating,genre,released,score,votes,director,writer,star,budget,gross,company,runtime,year,country
0,The Shining,R,Drama,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,19000000.0,46998772.0,Warner Bros.,146.0,1980,United States
1,The Blue Lagoon,R,Adventure,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,4500000.0,58853106.0,Columbia Pictures,104.0,1980,United States
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,18000000.0,538375067.0,Lucasfilm,124.0,1980,United States
3,Airplane!,PG,Comedy,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,3500000.0,83453539.0,Paramount Pictures,88.0,1980,United States
4,Caddyshack,R,Comedy,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,6000000.0,39846344.0,Orion Pictures,98.0,1980,United States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7663,More to Life,Unrated,Drama,"October 23, 2020 (United States)",3.1,18.0,Joseph Ebanks,Joseph Ebanks,Shannon Bond,7000.0,14000000.0,Universal Pictures,90.0,2020,United States
7664,Dream Round,Unrated,Comedy,"February 7, 2020 (United States)",4.7,36.0,Dusty Dukatz,Lisa Huston,Michael Saquella,0.0,14000000.0,Cactus Blue Entertainment,90.0,2020,United States
7665,Saving Mbango,Unrated,Drama,"April 27, 2020 (Cameroon)",5.7,29.0,Nkanya Nkwai,Lynno Lovert,Onyama Laura,58750.0,14000000.0,Embi Productions,97.0,2020,Cameroon
7666,It's Just Us,Unrated,Drama,"October 1, 2020 (United States)",6.6,13000.0,James Randall,James Randall,Christina Roz,15000.0,14000000.0,Universal Pictures,120.0,2020,United States


In [103]:
# movies = movies.drop('released', axis = 1)

In [104]:
movies[(movies['genre'] == 'Comedy') & (movies['score'] > 6) & (movies['year'] >= 2018)]

Unnamed: 0,name,rating,genre,released,score,votes,director,writer,star,budget,gross,company,runtime,year,country
2219,Hyenas,Not Rated,Comedy,"April 26, 2019 (United States)",7.4,1000.0,Djibril Diop Mambéty,Friedrich Dürrenmatt,Mansour Diouf,0.0,24672.0,Thelma Film AG,110.0,2019,United States
7040,This Beautiful Fantastic,PG,Comedy,"February 19, 2018 (United Kingdom)",6.9,10000.0,Simon Aboud,Simon Aboud,Jessica Brown Findlay,0.0,353808.0,Ipso Facto Productions,92.0,2018,United Kingdom
7100,The Death of Stalin,R,Comedy,"March 9, 2018 (United States)",7.2,92000.0,Armando Iannucci,Armando Iannucci,Steve Buscemi,13000000.0,24646055.0,Quad Productions,107.0,2018,United States
7103,The Upside,PG-13,Comedy,"January 11, 2019 (United States)",7.0,49000.0,Neil Burger,Jon Hartmere,Kevin Hart,37500000.0,125856180.0,Escape Artists,126.0,2019,United States
7123,Thoroughbreds,R,Comedy,"March 9, 2018 (United States)",6.7,38000.0,Cory Finley,Cory Finley,Olivia Cooke,6000000.0,3187255.0,B Story,92.0,2018,United States
7249,A Simple Favor,R,Comedy,"September 14, 2018 (United States)",6.8,136000.0,Paul Feig,Darcey Bell,Anna Kendrick,20000000.0,97644617.0,BRON Studios,117.0,2018,United States
7256,Crazy Rich Asians,PG-13,Comedy,"August 15, 2018 (United States)",6.9,152000.0,Jon M. Chu,Peter Chiarelli,Constance Wu,30000000.0,238539198.0,Warner Bros.,120.0,2018,United States
7268,Mamma Mia! Here We Go Again,PG-13,Comedy,"July 20, 2018 (United States)",6.6,85000.0,Ol Parker,Ol Parker,Lily James,75000000.0,395618157.0,Universal Pictures,114.0,2018,United States
7270,Instant Family,PG-13,Comedy,"November 16, 2018 (United States)",7.3,97000.0,Sean Anders,Sean Anders,Mark Wahlberg,48000000.0,120556201.0,Closest to the Hole Productions,118.0,2018,United States
7272,Eighth Grade,R,Comedy,"August 3, 2018 (United States)",7.4,68000.0,Bo Burnham,Bo Burnham,Elsie Fisher,2000000.0,14347433.0,A24,93.0,2018,United States


In [105]:
movies

Unnamed: 0,name,rating,genre,released,score,votes,director,writer,star,budget,gross,company,runtime,year,country
0,The Shining,R,Drama,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,19000000.0,46998772.0,Warner Bros.,146.0,1980,United States
1,The Blue Lagoon,R,Adventure,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,4500000.0,58853106.0,Columbia Pictures,104.0,1980,United States
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,18000000.0,538375067.0,Lucasfilm,124.0,1980,United States
3,Airplane!,PG,Comedy,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,3500000.0,83453539.0,Paramount Pictures,88.0,1980,United States
4,Caddyshack,R,Comedy,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,6000000.0,39846344.0,Orion Pictures,98.0,1980,United States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7663,More to Life,Unrated,Drama,"October 23, 2020 (United States)",3.1,18.0,Joseph Ebanks,Joseph Ebanks,Shannon Bond,7000.0,14000000.0,Universal Pictures,90.0,2020,United States
7664,Dream Round,Unrated,Comedy,"February 7, 2020 (United States)",4.7,36.0,Dusty Dukatz,Lisa Huston,Michael Saquella,0.0,14000000.0,Cactus Blue Entertainment,90.0,2020,United States
7665,Saving Mbango,Unrated,Drama,"April 27, 2020 (Cameroon)",5.7,29.0,Nkanya Nkwai,Lynno Lovert,Onyama Laura,58750.0,14000000.0,Embi Productions,97.0,2020,Cameroon
7666,It's Just Us,Unrated,Drama,"October 1, 2020 (United States)",6.6,13000.0,James Randall,James Randall,Christina Roz,15000.0,14000000.0,Universal Pictures,120.0,2020,United States


In [106]:
movies.dtypes

name         object
rating       object
genre        object
released     object
score       float64
votes       float64
director     object
writer       object
star         object
budget      float64
gross       float64
company      object
runtime     float64
year          int64
country      object
dtype: object

In [107]:
# Typecasting

In [108]:
movies['budget'].astype('str').str[-1].value_counts()

budget
0    7668
Name: count, dtype: int64

In [109]:
movies['budget'] = movies['budget'].astype('int64')

In [110]:
movies['votes'].astype('str').str[-1].value_counts()

votes
0    7668
Name: count, dtype: int64

In [111]:
movies.votes = movies.votes.astype('int64')

In [112]:
movies['gross'].astype('str').str[-1].value_counts()

gross
0    7668
Name: count, dtype: int64

In [113]:
movies['gross'] = movies['gross'].astype('int64')

In [114]:
movies['runtime'].astype('str').str[-1].value_counts()

runtime
0    7668
Name: count, dtype: int64

In [115]:
movies['runtime'] = movies['runtime'].astype('int64')

In [116]:
# sorting 'gross' column in decreasing order to get an idea of movie revenue

In [117]:
movies.sort_values(by = ['gross'], inplace = False, ascending = False)

Unnamed: 0,name,rating,genre,released,score,votes,director,writer,star,budget,gross,company,runtime,year,country
5445,Avatar,PG-13,Action,"December 18, 2009 (United States)",7.8,1100000,James Cameron,James Cameron,Sam Worthington,237000000,2847246203,Twentieth Century Fox,162,2009,United States
7445,Avengers: Endgame,PG-13,Action,"April 26, 2019 (United States)",8.4,903000,Anthony Russo,Christopher Markus,Robert Downey Jr.,356000000,2797501328,Marvel Studios,181,2019,United States
3045,Titanic,PG-13,Drama,"December 19, 1997 (United States)",7.8,1100000,James Cameron,James Cameron,Leonardo DiCaprio,200000000,2201647264,Twentieth Century Fox,194,1997,United States
6663,Star Wars: Episode VII - The Force Awakens,PG-13,Action,"December 18, 2015 (United States)",7.8,876000,J.J. Abrams,Lawrence Kasdan,Daisy Ridley,245000000,2069521700,Lucasfilm,138,2015,United States
7244,Avengers: Infinity War,PG-13,Action,"April 27, 2018 (United States)",8.4,897000,Anthony Russo,Christopher Markus,Robert Downey Jr.,321000000,2048359754,Marvel Studios,149,2018,United States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3818,"Love, Honor and Obey",R,Comedy,"February 9, 2001 (United States)",6.5,5200,Dominic Anciano,Dominic Anciano,Sadie Frost,0,1400,British Broadcasting Corporation (BBC),103,2001,United States
7625,The Untold Story,Unrated,Comedy,"January 11, 2019 (United States)",5.7,320,Shane Stanley,Lee Stanley,Miko Hughes,0,790,Visual Arts Entertainment,104,2019,United States
7580,Run with the Hunted,Not Rated,Crime,"June 26, 2020 (United States)",5.2,735,John Swab,John Swab,Ron Perlman,0,682,Roxwell Films,93,2020,United States
2417,Madadayo,Unrated,Drama,"April 17, 1993 (Japan)",7.3,5100,Akira Kurosawa,Ishirô Honda,Tatsuo Matsumura,11900000,596,DENTSU Music And Entertainment,134,1993,Japan


In [118]:
# To get window of all records

pd.set_option('display.max_rows', None)

In [119]:
movies.head()

Unnamed: 0,name,rating,genre,released,score,votes,director,writer,star,budget,gross,company,runtime,year,country
0,The Shining,R,Drama,"June 13, 1980 (United States)",8.4,927000,Stanley Kubrick,Stephen King,Jack Nicholson,19000000,46998772,Warner Bros.,146,1980,United States
1,The Blue Lagoon,R,Adventure,"July 2, 1980 (United States)",5.8,65000,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,4500000,58853106,Columbia Pictures,104,1980,United States
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,"June 20, 1980 (United States)",8.7,1200000,Irvin Kershner,Leigh Brackett,Mark Hamill,18000000,538375067,Lucasfilm,124,1980,United States
3,Airplane!,PG,Comedy,"July 2, 1980 (United States)",7.7,221000,Jim Abrahams,Jim Abrahams,Robert Hays,3500000,83453539,Paramount Pictures,88,1980,United States
4,Caddyshack,R,Comedy,"July 25, 1980 (United States)",7.3,108000,Harold Ramis,Brian Doyle-Murray,Chevy Chase,6000000,39846344,Orion Pictures,98,1980,United States


In [120]:
movies['company'].sort_values(ascending = False).head()

7129                     thefyzz
5664                 micro_scope
6412    iDeal Partners Film Fund
4007                    i5 Films
6793                  i am OTHER
Name: company, dtype: object

In [127]:
movies['company'].drop_duplicates().sort_values(ascending = False).head()

7129                     thefyzz
5664                 micro_scope
6412    iDeal Partners Film Fund
4007                    i5 Films
6793                  i am OTHER
Name: company, dtype: object

In [128]:
movies['company'].drop_duplicates().head()

0          Warner Bros.
1     Columbia Pictures
2             Lucasfilm
3    Paramount Pictures
4        Orion Pictures
Name: company, dtype: object

In [123]:
movies.drop_duplicates().head()

Unnamed: 0,name,rating,genre,released,score,votes,director,writer,star,budget,gross,company,runtime,year,country
0,The Shining,R,Drama,"June 13, 1980 (United States)",8.4,927000,Stanley Kubrick,Stephen King,Jack Nicholson,19000000,46998772,Warner Bros.,146,1980,United States
1,The Blue Lagoon,R,Adventure,"July 2, 1980 (United States)",5.8,65000,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,4500000,58853106,Columbia Pictures,104,1980,United States
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,"June 20, 1980 (United States)",8.7,1200000,Irvin Kershner,Leigh Brackett,Mark Hamill,18000000,538375067,Lucasfilm,124,1980,United States
3,Airplane!,PG,Comedy,"July 2, 1980 (United States)",7.7,221000,Jim Abrahams,Jim Abrahams,Robert Hays,3500000,83453539,Paramount Pictures,88,1980,United States
4,Caddyshack,R,Comedy,"July 25, 1980 (United States)",7.3,108000,Harold Ramis,Brian Doyle-Murray,Chevy Chase,6000000,39846344,Orion Pictures,98,1980,United States


In [124]:
movies.head()

Unnamed: 0,name,rating,genre,released,score,votes,director,writer,star,budget,gross,company,runtime,year,country
0,The Shining,R,Drama,"June 13, 1980 (United States)",8.4,927000,Stanley Kubrick,Stephen King,Jack Nicholson,19000000,46998772,Warner Bros.,146,1980,United States
1,The Blue Lagoon,R,Adventure,"July 2, 1980 (United States)",5.8,65000,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,4500000,58853106,Columbia Pictures,104,1980,United States
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,"June 20, 1980 (United States)",8.7,1200000,Irvin Kershner,Leigh Brackett,Mark Hamill,18000000,538375067,Lucasfilm,124,1980,United States
3,Airplane!,PG,Comedy,"July 2, 1980 (United States)",7.7,221000,Jim Abrahams,Jim Abrahams,Robert Hays,3500000,83453539,Paramount Pictures,88,1980,United States
4,Caddyshack,R,Comedy,"July 25, 1980 (United States)",7.3,108000,Harold Ramis,Brian Doyle-Murray,Chevy Chase,6000000,39846344,Orion Pictures,98,1980,United States


In [125]:
len(movies)

7668

In [126]:
movies.to_csv('./cleanedMovies.csv')