In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Load the raw TMDB movie dataset from a CSV file (path is relative to the working directory).
raw_data = pd.read_csv('popular_10000_movies_tmdb.csv')

In [6]:
# Find the size of the dataset as (rows, columns)
raw_data.shape

(10000, 14)

In [7]:
# Find the data type of each column
raw_data.dtypes

id                        int64
title                    object
release_date             object
genres                   object
original_language        object
vote_average            float64
vote_count                int64
popularity              float64
overview                 object
budget                    int64
production_companies     object
revenue                   int64
runtime                   int64
tagline                  object
dtype: object

In [8]:
# Show the first rows of the dataset
raw_data.head()

Unnamed: 0,id,title,release_date,genres,original_language,vote_average,vote_count,popularity,overview,budget,production_companies,revenue,runtime,tagline
0,758323,The Pope's Exorcist,2023-04-05,"['Horror', 'Mystery', 'Thriller']",English,7.4,619,5089.969,"Father Gabriele Amorth, Chief Exorcist of the ...",18000000,"['Screen Gems', '2.0 Entertainment', 'Jesus & ...",65675816,103,Inspired by the actual files of Father Gabriel...
1,640146,Ant-Man and the Wasp: Quantumania,2023-02-15,"['Action', 'Adventure', 'Science Fiction']",English,6.6,2294,4665.438,Super-Hero partners Scott Lang and Hope van Dy...,200000000,"['Marvel Studios', 'Kevin Feige Productions']",464566092,125,Witness the beginning of a new dynasty.
2,502356,The Super Mario Bros. Movie,2023-04-05,"['Animation', 'Adventure', 'Family', 'Fantasy'...",English,7.5,1861,3935.55,"While working underground to fix a water main,...",100000000,"['Universal Pictures', 'Illumination', 'Ninten...",1121048165,92,
3,868759,Ghosted,2023-04-18,"['Action', 'Comedy', 'Romance']",English,7.2,652,2791.532,Salt-of-the-earth Cole falls head over heels f...,0,"['Skydance Media', 'Apple Studios']",0,120,Finding that special someone can be a real adv...
4,594767,Shazam! Fury of the Gods,2023-03-15,"['Action', 'Comedy', 'Fantasy', 'Adventure']",English,6.8,1510,2702.593,"Billy Batson and his foster siblings, who tran...",125000000,"['New Line Cinema', 'The Safran Company', 'DC ...",133437105,130,Oh. My. Gods.


In [9]:
# Check whether any of the columns have missing data (count of missing values per column)
raw_data.isna().sum()

id                         0
title                      0
release_date              21
genres                     0
original_language          0
vote_average               0
vote_count                 0
popularity                 0
overview                  77
budget                     0
production_companies       0
revenue                    0
runtime                    0
tagline                 2759
dtype: int64

In [10]:
# Taglines are missing for quite a lot of movies.
# An overview is missing for 77 movies.
# A release date is missing for 21 movies.

# Start by looking at the movies that have no release date:
# select the rows with a boolean mask on the release_date column.
raw_data.loc[raw_data["release_date"].isna()]

Unnamed: 0,id,title,release_date,genres,original_language,vote_average,vote_count,popularity,overview,budget,production_companies,revenue,runtime,tagline
54,730629,John Wick: Chapter 5,,"['Action', 'Crime', 'Thriller']",English,0.0,0,297.57,The fifth installment in the John Wick franchi...,0,['Lionsgate'],0,0,
978,421892,Shrek 5,,"['Animation', 'Family', 'Comedy', 'Fantasy']",English,0.0,0,42.441,"Plot unknown. A soft-reboot of the animated ""S...",0,['DreamWorks Animation'],0,0,
1764,1108427,Moana,,"['Adventure', 'Comedy', 'Family', 'Fantasy', '...",English,0.0,0,26.751,"Like its animated predecessor, the film will c...",0,"['Walt Disney Pictures', 'Seven Bucks Producti...",0,0,
2328,1034541,Terrifier 3,,"['Horror', 'Fantasy', 'Thriller']",English,0.0,0,22.88,"In the aftermath of Terrifier 2, the third ins...",0,"['Dark Age Cinema', 'Fuzz on the Lens Producti...",0,0,
2582,553301,Your Name,,"['Drama', 'Fantasy', 'Romance']",English,0.0,0,19.772,Two teenagers discover they are magically and ...,0,['Bad Robot'],0,0,
2605,631969,Kingsman: The Blue Blood,,"['Action', 'Comedy', 'Adventure']",English,0.0,0,20.247,The upcoming fourth installment in the Kingsma...,0,"['Marv Films', '20th Century Studios']",0,0,
2663,504075,Wolf Warrior 3,,"['Action', 'War']",Chinese,0.0,0,23.7,The third movie about a Chinese special force ...,0,[],0,90,
2756,1084244,Toy Story 5,,"['Animation', 'Adventure', 'Family']",English,0.0,0,20.355,The next entry in the Toy Story franchise. Plo...,0,"['Pixar', 'Walt Disney Pictures']",0,0,
2853,921145,Road to Wadesboro- Evil Dead 2 Locations,,['Horror'],German,6.1,4,21.386,,0,[],0,0,
3888,755679,Fast X: Part 2,,"['Action', 'Crime']",English,0.0,0,21.747,The eleventh and final installment in The Fast...,0,"['Universal Pictures', 'One Race', 'Original F...",0,0,


In [11]:
# A few quick spot checks suggest that the movies with no release date have not
# been released yet, and that this is why the date is missing.
# NOTE(review): the listing above includes a row (index 2853) that already has
# votes, so "not yet released" may not hold for every dropped row - confirm.
# To illustrate the functionality, we remove those movies from the dataset.
#
# dropna(subset=...) filters on the column directly. This avoids evaluating a
# query string and dropping by index label, which would also remove unrelated
# rows if the index ever contained duplicate labels.
data = raw_data.dropna(subset=["release_date"])

# Invariant for the rest of the notebook: every remaining row has a release date.
assert data["release_date"].notna().all(), "rows without release_date remain"

In [12]:
# Check that the rows have been removed.
data.shape

(9979, 14)

In [13]:
# Re-count missing values per column in the cleaned dataset.
data.isna().sum()

id                         0
title                      0
release_date               0
genres                     0
original_language          0
vote_average               0
vote_count                 0
popularity                 0
overview                  74
budget                     0
production_companies       0
revenue                    0
runtime                    0
tagline                 2739
dtype: int64

In [14]:
# The 21 movies without a release date are no longer in the dataset.
# The cleaned dataset is named 'data', and it is the one we continue working with.