In [2]:
import numpy as np
import pandas as pd

## Introduction

Each cell will have the solution to the problems

In [4]:
# 1.1. Download the data set movie_metadata.csv, which contains data about films from IMDb
# (Internet Movie Database).
df = pd.read_csv("../files/movie_metadata.csv")

Exploring the dataframe

In [5]:
df.head(2)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0$,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0$,2007.0,5000.0,7.1,2.35,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   color                      5024 non-null   object 
 1   director_name              4939 non-null   object 
 2   num_critic_for_reviews     4993 non-null   float64
 3   duration                   5028 non-null   float64
 4   director_facebook_likes    4939 non-null   float64
 5   actor_3_facebook_likes     5020 non-null   float64
 6   actor_2_name               5030 non-null   object 
 7   actor_1_facebook_likes     5036 non-null   float64
 8   gross                      4159 non-null   float64
 9   genres                     5043 non-null   object 
 10  actor_1_name               5036 non-null   object 
 11  movie_title                5043 non-null   object 
 12  num_voted_users            5043 non-null   int64  
 13  cast_total_facebook_likes  5043 non-null   int64

In [7]:
# 1.2. The duration column contains data on the film length. How many missing values are there  in this column?
null_values = df['duration'].isnull().sum()
print("1.2 Number NaNs in duration column:", null_values)

1.2 Number NaNs in duration column: 15


In [8]:
# 1.3. Replace the missing values in the duration column with the median value for this column.
df.loc[df['duration'].isna(),'duration'] = df['duration'].median()
new_null_values = df['duration'].isnull().sum()
print("1.3 Check there is no more NaNs in duration column", new_null_values)

1.3 Check there is no more NaNs in duration column 0


In [9]:
# 1.4. What is the average film length? Give the answer as a floating-point figure rounded to two decimal places.
avg_duration = df['duration'].mean().round(2)
print("1.4 Average duration of film", avg_duration)

1.4 Average duration of film 107.19


In [12]:
# 1.5. Create a movie_duration_category column, which will contain three categories
# depending on the film length:
#   • Category "1. <90" if the film is less than 90 minutes long
#   • Category "2. 90–120" if the film is between 90 minutes and two hours long (inclusively)
#   • Category "3. >120" if the film is more than two hours long
bins = [0, 90, 120, np.inf]
categories_names = ['<90', '90–120', '>120']
df['movie_duration_category'] = pd.cut(df['duration'], bins, labels=categories_names)
print("1.5 check")
df.sample(5)[['duration','movie_duration_category']]

1.5 check


Unnamed: 0,duration,movie_duration_category
4889,105.0,90–120
543,45.0,<90
3905,93.0,90–120
1353,98.0,90–120
4081,99.0,90–120


In [16]:
# 1.6. Build a summary table for films released after 2000 (inclusively), to list the numbers of
# films:
#   • Table rows: year
#   • Table columns: movie duration category ("<90", "90–120", ">120")
#   • The year of release should be displayed in the YYYY format.
df_films = df[df['title_year'] >= 2000].copy()
df_films['title_year'] = pd.to_datetime(df_films ['title_year'], format='%Y', errors='coerce').dt.year
df_films = df_films[['title_year', 'movie_duration_category']].pivot_table(index='title_year', columns=['movie_duration_category'], aggfunc=lambda x: len(x))
df_films.head(10)

movie_duration_category,<90,90–120,>120
title_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,33,104,34
2001,36,113,39
2002,40,142,27
2003,37,102,30
2004,38,134,42
2005,40,133,48
2006,50,136,53
2007,37,124,43
2008,42,147,36
2009,55,165,40


In [18]:
# 1.7. How many films between 90 minutes and two hours long were released in 2008?
cond = (df['duration'].between(90,120, inclusive='both'))
cond &= df['title_year']==2008

print("1.7 films from 2008 that were between 90 and 120 minutes :", len(df[cond]))

1.7 films from 2008 that were between 90 and 120 minutes : 160


In [22]:
# 1.8. The plot_keywords column holds keywords characterizing the film's plot. Using the data
# in this column, create a column called movie_plot_category, to contain four categories
#   depending on the key words in the column:
#    • Category "love_and_death" if the keywords include both "love" and "death"
#    • Category "love" if the keywords include the word "love"
#    • Category "death" if the keywords include the word "death"
#    • Category "other" if the keywords do not meet the conditions above

# Using regex to determine love and death categories
categories = ['love','death']
df['plot_keywords'] = df['plot_keywords'].str.lower()
pat =  '('+'|'.join(categories)+')'
pat_ld = "love.*death|death.*love"
df['movie_plot_category'] = (df['plot_keywords'].str.extract(pat)).fillna('other')

#Love and death category
cond_ld = df['plot_keywords'].str.contains(pat_ld, regex=True, na=False)
df.loc[cond_ld, 'movie_plot_category'] = 'love_and_death'

# Check general movie_plot_category
df['movie_plot_category'].value_counts()

other             4621
love               234
death              172
love_and_death      16
Name: movie_plot_category, dtype: int64

In [27]:
# Check general movie_plot_category
print("Love category sample: ", df[df['movie_plot_category']=='love']['plot_keywords'].sample(1).iloc[0])
print("Love & death category sample: ", df[df['movie_plot_category']=='love_and_death']['plot_keywords'].sample(1).iloc[0])
print("other category sample: ", df[df['movie_plot_category']=='other']['plot_keywords'].sample(1).iloc[0])


Love category sample:  ballet|dance|dancer|dancing|love
Love & death category sample:  cancer|death|doctor|drinking|love
other category sample:  nan


In [29]:
# 1.9. The imdb_score column shows a viewer rating for the film. Build a table to reflect the
# average rating of films depending on which movie_plot_category category they belong to.

avg_score_df = df[['imdb_score','movie_plot_category']].groupby(['movie_plot_category']).mean().reset_index()\
    .rename(columns={'imdb_score':'mean_imdb_score'})
avg_score_df

Unnamed: 0,movie_plot_category,mean_imdb_score
0,death,6.535465
1,love,6.580769
2,love_and_death,6.50625
3,other,6.431422


In [32]:
# 1.10. What is the average rating of films in the "love" category? Give the answer as a floatingpoint
# figure rounded to two decimal places.

avg_score = avg_score_df[avg_score_df['movie_plot_category']=='love']['mean_imdb_score'].iloc[0]
print(f"The average rating films of love is {avg_score:.{2}f}")

The average rating films of love is 6.58


In [37]:
# 1.11. The budget column contains the film's budget. What is the median budget for all the films
# listed? Give the answer as an integer.

df['budget'] = df['budget'].map(lambda x: float(x.replace('$', '')))
median_budget = int(df['clean_budget'].median())
print(f"The median budget for all films is {median_budget} $")

The median budget for all films is 15000000 $
