In [2]:
import numpy as np
import pandas as pd

df = pd.read_csv('movies_2.csv')

# Check if data was imported.
if df.empty == False: print("Data was successfully imported. Check out what it is.")

Data was successfully imported. Check out what it is.


<h3>Getting to know the data</h3>

In [3]:
# Check the amount of data (variables = columns).
df.shape 

(616, 11)

In [4]:
# What are the names of the columns in the dataset.
df.columns

Index(['MovieID', 'Title', 'MPAA Rating', 'Budget', 'Gross', 'Release Date',
       'Genre', 'Runtime', 'Rating', 'Rating Count', 'Summary'],
      dtype='object')

In [5]:
# Take a look at the top and bottom rows
df.head()

Unnamed: 0,MovieID,Title,MPAA Rating,Budget,Gross,Release Date,Genre,Runtime,Rating,Rating Count,Summary
0,1,Look Who's Talking,PG-13,7500000.0,296000000.0,1989-10-12,Romance,93.0,5.9,73638.0,"After a single, career-minded woman is left on..."
1,2,Driving Miss Daisy,PG,7500000.0,145793296.0,1989-12-13,Comedy,99.0,7.4,91075.0,An old Jewish woman and her African-American c...
2,3,Turner & Hooch,PG,13000000.0,71079915.0,1989-07-28,Crime,100.0,7.2,91415.0,"Det. Scott Turner (Tom Hanks) is an uptight, b..."
3,4,Born on the Fourth of July,R,14000000.0,161001698.0,1989-12-20,War,145.0,7.2,91415.0,The biography of Ron Kovic. Paralyzed in the V...
4,5,Field of Dreams,PG,15000000.0,84431625.0,1989-04-21,Drama,107.0,7.5,101702.0,"An Iowa corn farmer, hearing voices, interpret..."


In [6]:
df.tail()

Unnamed: 0,MovieID,Title,MPAA Rating,Budget,Gross,Release Date,Genre,Runtime,Rating,Rating Count,Summary
611,612,Toy Story 4,G,200000000.0,1062000000.0,2019-06-11,Animation,100.0,,,
612,613,Fast & Furious Presents: Hobbs & Shaw,PG-13,200000000.0,759400000.0,2019-07-13,Thriller,136.0,,,
613,614,The Lion King,PG,250000000.0,1632000000.0,2019-07-09,Drama,118.0,,,
614,615,Avengers: Endgame,PG-13,356000000.0,2796000000.0,2019-04-22,Action,181.0,,,
615,0,,,,,,,,,,


In [7]:
# check how many non-null entries (None or NaN) are in the columns,
# also check the data types for mistyped data. Alternatively use
# df.dtypes if null-entries are not of interest.
# the MovieID with the value '0' does not count as null value -> null = NaN for int and float
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 616 entries, 0 to 615
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MovieID       616 non-null    int64  
 1   Title         615 non-null    object 
 2   MPAA Rating   615 non-null    object 
 3   Budget        615 non-null    float64
 4   Gross         615 non-null    float64
 5   Release Date  615 non-null    object 
 6   Genre         615 non-null    object 
 7   Runtime       615 non-null    float64
 8   Rating        508 non-null    float64
 9   Rating Count  508 non-null    float64
 10  Summary       496 non-null    object 
dtypes: float64(5), int64(1), object(5)
memory usage: 53.1+ KB


In [8]:
# Summary of numeric columns
df.describe()

Unnamed: 0,MovieID,Budget,Gross,Runtime,Rating,Rating Count
count,616.0,615.0,615.0,615.0,508.0,508.0
mean,307.5,94917100.0,445322100.0,118.642276,6.917323,339252.1
std,177.968162,67481140.0,339407500.0,22.252376,0.888928,321338.8
min,0.0,60000.0,53000000.0,79.0,4.1,14918.0
25%,153.75,40000000.0,215893900.0,102.0,6.4,127592.2
50%,307.5,80000000.0,351040400.0,117.0,6.9,240347.5
75%,461.25,140000000.0,585176600.0,132.0,7.6,425700.0
max,615.0,400000000.0,2796000000.0,201.0,9.0,2127228.0


In [9]:
# Get information on columns of type 'object' (non-numeric data).
df.describe(include=object)

# Results: there are possible duplicates in Title (only 609 are unique)
# also the mode (top) is present twice.
# Genre and MPAA Rating would benefit from conversion to 'category' to 
# minimise memory usage as they are groups of clearly limited individuals,
# also downsize from float64 and int64.
# only 4 unique values for MPAA Rating and 16 Genre of which Action is the most frequent

Unnamed: 0,Title,MPAA Rating,Release Date,Genre,Summary
count,615,615,615,615,496
unique,609,4,593,16,496
top,The Lion King,PG-13,1996-06-21,Action,"After a single, career-minded woman is left on..."
freq,2,285,2,110,1


In [10]:
# Rename columns to remove hidden white spaces.
df = df.rename(columns={"MPAA Rating": "MPAARating", "Release Date": "ReleaseDate", "Rating Count": "RatingCount"})

df.columns

Index(['MovieID', 'Title', 'MPAARating', 'Budget', 'Gross', 'ReleaseDate',
       'Genre', 'Runtime', 'Rating', 'RatingCount', 'Summary'],
      dtype='object')

In [11]:
# Check all distinct values for MPAA Rating.
print(df.MPAARating.unique())

# And all values for Genre.
print(df.Genre.unique())

# This also checks for misspellings that could be merged.

['PG-13' 'PG' 'R' 'G' nan]
['Romance' 'Comedy' 'Crime' 'War' 'Drama' 'Family' 'Action' 'Animation'
 'Science Fiction' 'Adventure' 'Thriller' 'Western' 'Horror' 'Mystery'
 'History' 'Fantasy' nan]


In [12]:
# Drop the summary column.
df.drop(['Summary'], axis=1, inplace=True)

# check drop
df.columns

Index(['MovieID', 'Title', 'MPAARating', 'Budget', 'Gross', 'ReleaseDate',
       'Genre', 'Runtime', 'Rating', 'RatingCount'],
      dtype='object')

In [13]:
# Show and compare duplicates in the dataset
pd.concat(g for _, g in df.groupby("Title") if len(g) > 1)

# Alternative: 
#titles = df["Title"]
#df[titles.isin(titles[titles.duplicated()])].sort_values("Title")

# g for (placeholder, g) in df.groupby('bla') if 'bla' : the underscore is a typical symbol for
# a placeholder of an inevitable argument where we don't want to use it for anthing in a lambda-
# like expression [stackoverflow] 

# result: all are remakes (different year) except Jurassic Park III

Unnamed: 0,MovieID,Title,MPAARating,Budget,Gross,ReleaseDate,Genre,Runtime,Rating,RatingCount
65,66,Aladdin,G,28000000.0,504050200.0,1992-11-25,Animation,90.0,8.0,336384.0
609,610,Aladdin,PG,183000000.0,1049000000.0,2019-05-08,Fantasy,128.0,,
46,47,Beauty and the Beast,PG,25000000.0,377350600.0,1991-11-13,Animation,84.0,8.0,397467.0
565,566,Beauty and the Beast,PG,160000000.0,1262886000.0,2017-03-16,Drama,129.0,,
195,196,Godzilla,PG-13,130000000.0,379014300.0,1998-05-20,Thriller,139.0,5.4,174600.0
504,505,Godzilla,PG-13,160000000.0,529076100.0,2014-05-14,Thriller,123.0,6.4,359438.0
98,99,Jurassic Park III,PG-13,93000000.0,368800000.0,2001-07-16,Thriller,92.0,8.9,1690474.0
249,250,Jurassic Park III,PG-13,93000000.0,368780800.0,2001-07-18,Adventure,92.0,5.9,280110.0
21,22,Teenage Mutant Ninja Turtles,PG,13500000.0,202000000.0,1990-03-30,Action,93.0,6.8,79806.0
501,502,Teenage Mutant Ninja Turtles,PG-13,125000000.0,477200000.0,2014-08-07,Action,101.0,5.8,194073.0


In [14]:
# Searching for duplicates could happen by comparing Title and Year.
# Create a new column for just the release year and search for duplicates
# with title + year and drop the ones where they match.

# ReleaseDate format is always the same, get the year in indices 0:4.
df['Year'] = df.ReleaseDate.str[:4]

# Check for duplicate titles where the release year is the same.
df[df.duplicated(['Title', 'Year'], keep=False)]


Unnamed: 0,MovieID,Title,MPAARating,Budget,Gross,ReleaseDate,Genre,Runtime,Rating,RatingCount,Year
98,99,Jurassic Park III,PG-13,93000000.0,368800000.0,2001-07-16,Thriller,92.0,8.9,1690474.0,2001
249,250,Jurassic Park III,PG-13,93000000.0,368780809.0,2001-07-18,Adventure,92.0,5.9,280110.0,2001


In [15]:
# manually dropping Jurassic Park III with the MovieID 250.
# Discrepacy with ReleaseDate: 07-16 USA opening, 07-18 general release
# Also Gross, Genre, and Rating.
# Keeping the movie with MovieID '99' because it has a higher RatingCount
df = df.drop(df.index[[249]])
#df.drop([249], axis=0, inplace=True)



In [16]:
# check if the duplicate was dropped
df[df.duplicated(['Title', 'Year'], keep=False)]

Unnamed: 0,MovieID,Title,MPAARating,Budget,Gross,ReleaseDate,Genre,Runtime,Rating,RatingCount,Year


In [17]:
# Also drop the Year column, as it is no longer needed.
df.drop(['Year'], axis=1, inplace=True)

Add code (round()) to make the float numbers with only two decimal places for monetary values.

In [18]:
# Change ReleaseDate to type datetime
df.loc[:, 'ReleaseDate'] = pd.to_datetime(df.ReleaseDate, yearfirst=True)

# Change Budget and Gross to type float
# unable to change to 'float' only 'float32' works
df.Budget = df.Budget.astype('float32')
    # Alternative tries
    #df['Budget'] = df['Budget'].astype(float)
    #df["Budget"] = pd.to_numeric(df["Budget"], downcast="float")

df.Gross = df.Gross.astype('float32')


In [19]:
# Check dtype has been changed
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 615 entries, 0 to 615
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   MovieID      615 non-null    int64         
 1   Title        614 non-null    object        
 2   MPAARating   614 non-null    object        
 3   Budget       614 non-null    float32       
 4   Gross        614 non-null    float32       
 5   ReleaseDate  614 non-null    datetime64[ns]
 6   Genre        614 non-null    object        
 7   Runtime      614 non-null    float64       
 8   Rating       507 non-null    float64       
 9   RatingCount  507 non-null    float64       
dtypes: datetime64[ns](1), float32(2), float64(3), int64(1), object(3)
memory usage: 48.0+ KB


In [20]:
# The last row contains no values and is dropped.
df = df.drop(df.index[[614]]) # index 614 b/c of dropped duplicate row Jurassic Park III
df.tail()

Unnamed: 0,MovieID,Title,MPAARating,Budget,Gross,ReleaseDate,Genre,Runtime,Rating,RatingCount
610,611,Godzilla: King of the Monsters,PG-13,200000000.0,385900000.0,2019-05-13,Adventure,132.0,,
611,612,Toy Story 4,G,200000000.0,1062000000.0,2019-06-11,Animation,100.0,,
612,613,Fast & Furious Presents: Hobbs & Shaw,PG-13,200000000.0,759400000.0,2019-07-13,Thriller,136.0,,
613,614,The Lion King,PG,250000000.0,1632000000.0,2019-07-09,Drama,118.0,,
614,615,Avengers: Endgame,PG-13,356000000.0,2796000000.0,2019-04-22,Action,181.0,,


In [21]:
# Use median to replace missing values in Rating and RatingCount
# (specified in exercise given).

df["Rating"].fillna(value=df["Rating"].median(), inplace=True)
df["RatingCount"].fillna(value=df["RatingCount"].median(), inplace=True)
df.tail()

Unnamed: 0,MovieID,Title,MPAARating,Budget,Gross,ReleaseDate,Genre,Runtime,Rating,RatingCount
610,611,Godzilla: King of the Monsters,PG-13,200000000.0,385900000.0,2019-05-13,Adventure,132.0,6.9,240160.0
611,612,Toy Story 4,G,200000000.0,1062000000.0,2019-06-11,Animation,100.0,6.9,240160.0
612,613,Fast & Furious Presents: Hobbs & Shaw,PG-13,200000000.0,759400000.0,2019-07-13,Thriller,136.0,6.9,240160.0
613,614,The Lion King,PG,250000000.0,1632000000.0,2019-07-09,Drama,118.0,6.9,240160.0
614,615,Avengers: Endgame,PG-13,356000000.0,2796000000.0,2019-04-22,Action,181.0,6.9,240160.0


Exercise text states there are no outliers.
But still, add a check for outliers here.

<h3>Questions asked by the exercise</h3>
<ul>
<li>show the movies with more than 7 in <b>Rating</b> & greater than 50 million <b>Gross</b></li>
<li>show the movies with more than 7 in <b>Rating</b> & greater than 50 million <b>Gross</b> & with Parental guidance as <b>MPAA Rating</b></li>
<li><b>count</b> of <b>Animation</b> movies with more than 7 in <b>Rating</b> (use the count() function)</li>
<li>show the list of <b>top 5 movies</b> based on Budget</li>
<li>show the <b>top 5 Comedy movies</b> approved by the audience (use Rating)</li>
<li>top 5 movie names by Rating</li>
<li>top 3 high Gross Romance movies produced after 2000 (for the date you can use a string)</li>
<li>how many Genres are present in the dataframe? (use the function value_counts() which applies to Series, not Dataframe)</li>
<li>top 5 expensive movies produced after 2000 (measured by Budget)</li>
<li>most & least frequent MPAA Rating in the dataset in terms of occurances</li>
<li>most and least expensive Genre (take an average of all Budget measures grouped by Genre - use groupBy() method)</li>
<li>which Genre is favored the most by the people?</li>
</ul>

In [22]:
# show the movies with more than 7 in Rating & greater than 50 million Gross
df.loc[(df['Rating'] > 7) & (df['Gross'] > 50000000)]

Unnamed: 0,MovieID,Title,MPAARating,Budget,Gross,ReleaseDate,Genre,Runtime,Rating,RatingCount
1,2,Driving Miss Daisy,PG,7500000.0,145793296.0,1989-12-13,Comedy,99.0,7.4,91075.0
2,3,Turner & Hooch,PG,13000000.0,71079912.0,1989-07-28,Crime,100.0,7.2,91415.0
3,4,Born on the Fourth of July,R,14000000.0,161001696.0,1989-12-20,War,145.0,7.2,91415.0
4,5,Field of Dreams,PG,15000000.0,84431624.0,1989-04-21,Drama,107.0,7.5,101702.0
6,7,When Harry Met Sally...,R,16000000.0,92800000.0,1989-07-21,Romance,96.0,7.6,180871.0
...,...,...,...,...,...,...,...,...,...,...
505,506,Big Hero 6,PG,165000000.0,652105472.0,2014-10-24,Animation,102.0,7.8,380953.0
506,507,Interstellar,PG-13,165000000.0,675120000.0,2014-11-05,Science Fiction,169.0,8.6,1343549.0
507,508,Captain America: The Winter Soldier,PG-13,170000000.0,714766592.0,2014-03-20,Action,136.0,7.7,685903.0
508,509,Dawn of the Planet of the Apes,PG-13,170000000.0,710644544.0,2014-06-26,Science Fiction,130.0,7.6,395425.0


In [23]:
# show the movies with more than 7 in Rating & greater than 50 million Gross & with Parental Guidance as MPAA Rating
df.loc[(df['Rating'] > 7) & (df['Gross'] > 50000000) & (df['MPAARating'] == "PG")]

Unnamed: 0,MovieID,Title,MPAARating,Budget,Gross,ReleaseDate,Genre,Runtime,Rating,RatingCount
1,2,Driving Miss Daisy,PG,7500000.0,145793300.0,1989-12-13,Comedy,99.0,7.4,91075.0
2,3,Turner & Hooch,PG,13000000.0,71079910.0,1989-07-28,Crime,100.0,7.2,91415.0
4,5,Field of Dreams,PG,15000000.0,84431620.0,1989-04-21,Drama,107.0,7.5,101702.0
7,8,Dead Poets Society,PG,16400000.0,235860100.0,1989-06-02,Drama,129.0,8.1,382002.0
13,14,Batman,PG,35000000.0,411348900.0,1989-06-23,Action,126.0,7.5,319517.0
15,16,The Little Mermaid,PG,40000000.0,222300000.0,1989-11-17,Animation,83.0,7.6,219221.0
16,17,Back to the Future Part II,PG,40000000.0,332000000.0,1989-11-20,Science Fiction,108.0,7.8,438940.0
19,20,Steel Magnolias,PG,15000000.0,95904090.0,1989-11-15,Drama,119.0,7.3,43037.0
24,25,Home Alone,PG,18000000.0,476684700.0,1990-11-09,Family,103.0,7.6,414472.0
32,33,The Hunt for Red October,PG,30000000.0,199200000.0,1990-03-02,Thriller,134.0,7.6,167212.0


In [24]:
# count of Animation movies with more than 7 in Rating (use the count() function)
print((df.loc[(df['Genre'] == ('Animation')) & (df['Rating'] > 7)]).count())

# TODO: return only one count

MovieID        39
Title          39
MPAARating     39
Budget         39
Gross          39
ReleaseDate    39
Genre          39
Runtime        39
Rating         39
RatingCount    39
dtype: int64


In [27]:
# show the list of top 5 movies based on Budget
top5_budget = df.sort_values(by='Budget', ascending=False)
top5_budget.head()

Unnamed: 0,MovieID,Title,MPAARating,Budget,Gross,ReleaseDate,Genre,Runtime,Rating,RatingCount
594,595,Avengers: Infinity War,PG-13,400000000.0,2048000000.0,2018-04-23,Action,149.0,6.9,240160.0
454,455,Pirates of the Caribbean: On Stranger Tides,PG-13,380000000.0,1045714000.0,2011-05-14,Action,136.0,6.6,455211.0
614,615,Avengers: Endgame,PG-13,356000000.0,2796000000.0,2019-04-22,Action,181.0,6.9,240160.0
574,575,Star Wars: The Last Jedi,PG-13,317000000.0,1333000000.0,2019-12-09,Science Fiction,152.0,6.9,240160.0
573,574,Justice League,PG-13,300000000.0,657900000.0,2017-11-15,Action,120.0,6.9,240160.0


In [28]:
# show the top 5 Comedy movies approved by the audience (use Rating)
df.loc[(df['Genre'] == 'Comedy')].nlargest(5,'Rating')

Unnamed: 0,MovieID,Title,MPAARating,Budget,Gross,ReleaseDate,Genre,Runtime,Rating,RatingCount
111,112,Forrest Gump,PG-13,55000000.0,677945408.0,1994-07-06,Comedy,142.0,8.8,1657851.0
185,186,The Truman Show,PG,60000000.0,264118208.0,1998-06-04,Comedy,103.0,8.1,859224.0
80,81,Groundhog Day,PG,14600000.0,70906976.0,1993-02-11,Comedy,101.0,8.0,549538.0
254,255,"Monsters, Inc.",G,115000000.0,562816256.0,2001-11-01,Comedy,92.0,8.0,758349.0
40,41,Fried Green Tomatoes,PG-13,11000000.0,119418504.0,1991-12-27,Comedy,130.0,7.7,62493.0


In [29]:
# top 5 movie names by Rating
top5_movies = df.sort_values(by='Rating', ascending=False)
top5_movies.head()

Unnamed: 0,MovieID,Title,MPAARating,Budget,Gross,ReleaseDate,Genre,Runtime,Rating,RatingCount
393,394,The Dark Knight,PG-13,185000000.0,1004558000.0,2008-07-16,Action,152.0,9.0,2127228.0
287,288,The Lord of the Rings: The Return of the King,PG-13,94000000.0,1118889000.0,2003-12-01,Fantasy,201.0,8.9,1529953.0
99,100,Pulp Fiction,R,8000000.0,213928800.0,1994-09-10,Thriller,154.0,8.9,1690474.0
83,84,Schindler's List,R,22000000.0,321365600.0,1993-11-29,History,195.0,8.9,1117322.0
98,99,Jurassic Park III,PG-13,93000000.0,368800000.0,2001-07-16,Thriller,92.0,8.9,1690474.0


In [34]:
# top 3 high Gross Romance movies produced after 2000 (for the date you can use a string)
# Previous part of the assignment converted ReleaseDate to datetime, we can't use a string!
df.loc[(df['Genre'] == 'Romance') & (df['ReleaseDate'] >= '2000-01-01')].nlargest(3,'Gross')

Unnamed: 0,MovieID,Title,MPAARating,Budget,Gross,ReleaseDate,Genre,Runtime,Rating,RatingCount
464,465,The Twilight Saga: Breaking Dawn - Part 2,PG-13,120000000.0,829000000.0,2012-11-13,Romance,115.0,5.5,218357.0
442,443,The Twilight Saga: Breaking Dawn - Part 1,PG-13,110000000.0,712171840.0,2011-03-15,Romance,117.0,4.9,211592.0
401,402,The Twilight Saga: New Moon,PG-13,50000000.0,709827456.0,2009-03-15,Romance,130.0,4.7,252223.0


In [47]:
# how many Genres are present in the dataframe? (use the function value_counts() which applies to Series, not Dataframe)

# With value_counts() as requested by the assignment
print(f'There are {len(list(df.Genre.value_counts()))} Genres present.')

# Shorter solution
print(f'There are {df.Genre.nunique()} Genres present.')

# List of how many films belong to each genre
pd.value_counts(pd.Series(df['Genre']))

There are 16 Genres present.
There are 16 Genres present.


Action             110
Comedy              99
Animation           87
Drama               66
Thriller            41
Science Fiction     37
Family              29
Adventure           29
Romance             28
Fantasy             27
Crime               17
Horror              14
Mystery             11
War                  9
Western              6
History              4
Name: Genre, dtype: int64

In [48]:
# top 5 expensive movies produced after 2000 (measured by Budget)
df.loc[(df['ReleaseDate'] >= '2000-01-01')].nlargest(5,'Budget')

Unnamed: 0,MovieID,Title,MPAARating,Budget,Gross,ReleaseDate,Genre,Runtime,Rating,RatingCount
594,595,Avengers: Infinity War,PG-13,400000000.0,2048000000.0,2018-04-23,Action,149.0,6.9,240160.0
454,455,Pirates of the Caribbean: On Stranger Tides,PG-13,380000000.0,1045714000.0,2011-05-14,Action,136.0,6.6,455211.0
614,615,Avengers: Endgame,PG-13,356000000.0,2796000000.0,2019-04-22,Action,181.0,6.9,240160.0
574,575,Star Wars: The Last Jedi,PG-13,317000000.0,1333000000.0,2019-12-09,Science Fiction,152.0,6.9,240160.0
375,376,Pirates of the Caribbean: At World's End,PG-13,300000000.0,961000000.0,2007-05-19,Adventure,169.0,7.1,565402.0


In [50]:
# most & least frequent MPAA Rating in the dataset in terms of occurances
df.MPAARating.value_counts(normalize=True) # proportion

PG-13    0.462541
PG       0.262215
R        0.231270
G        0.043974
Name: MPAARating, dtype: float64

In [51]:
# most and least expensive Genre (take an average of all Budget measures grouped by Genre - use groupBy() method)
df.groupby(['Genre'],as_index=False).Budget.mean().sort_values('Budget',ascending=False)

Unnamed: 0,Genre,Budget
7,Fantasy,161211104.0
0,Action,144168176.0
1,Adventure,135241376.0
12,Science Fiction,126783784.0
2,Animation,115879312.0
10,Mystery,103545456.0
14,War,90755552.0
13,Thriller,86585368.0
15,Western,79833336.0
6,Family,62689656.0


In [52]:
# which Genre is favored the most by the people?

#df.groupby(['Genre'],as_index=False).Rating.mean().sort_values('Rating',ascending=False)

Unnamed: 0,Genre,Rating
8,History,7.625
14,War,7.355556
7,Fantasy,7.337037
15,Western,7.333333
5,Drama,7.193939
12,Science Fiction,7.151351
2,Animation,7.110345
13,Thriller,6.995122
0,Action,6.985455
4,Crime,6.876471
