<a href="https://colab.research.google.com/github/leonnmarcoo/CCADMACL_COM232_PROJECT/blob/main/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Data Preprocessing**

## **Import**

In [1531]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast

from sklearn.impute import KNNImputer

import re

from pandas.plotting import scatter_matrix

## **Load the dataset**

In [1532]:
df = pd.read_csv("movie_dataset.csv")

## **Understanding the Data**

In [1533]:
df.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [1534]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 4803 non-null   int64  
 1   budget                4803 non-null   int64  
 2   genres                4775 non-null   object 
 3   homepage              1712 non-null   object 
 4   id                    4803 non-null   int64  
 5   keywords              4391 non-null   object 
 6   original_language     4803 non-null   object 
 7   original_title        4803 non-null   object 
 8   overview              4800 non-null   object 
 9   popularity            4803 non-null   float64
 10  production_companies  4803 non-null   object 
 11  production_countries  4803 non-null   object 
 12  release_date          4802 non-null   object 
 13  revenue               4803 non-null   int64  
 14  runtime               4801 non-null   float64
 15  spoken_languages     

In [1535]:
df.nunique()

Unnamed: 0,0
index,4803
budget,436
genres,1168
homepage,1691
id,4803
keywords,4219
original_language,37
original_title,4801
overview,4800
popularity,4802


In [1536]:
df['status'].value_counts()

Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
Released,4795
Rumored,5
Post Production,3


In [1537]:
columns_to_drop = ['index', 'id', 'keywords', 'original_title', 'overview', 'spoken_languages', 'status', 'tagline', 'title', 'crew']
df.drop(columns=columns_to_drop, inplace=True)

In [1538]:
df.isnull().sum()

Unnamed: 0,0
budget,0
genres,28
homepage,3091
original_language,0
popularity,0
production_companies,0
production_countries,0
release_date,1
revenue,0
runtime,2


## **Data Cleaning**

In [1539]:
df['homepage'] = df['homepage'].notna().astype(int)
df['genres'] = df['genres'].str.split(' ')
df['production_companies'] = df['production_companies'].apply(
    lambda x: ', '.join([d['name'] for d in ast.literal_eval(x)])
)
df['production_countries'] = df['production_countries'].apply(
        lambda x: ', '.join([d['name'] for d in ast.literal_eval(x)])
)
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce').dt.year

def combine_first_last_names(name_string):
    if pd.isna(name_string) or not isinstance(name_string, str):
        return []
    words = name_string.split(' ')
    full_names = []
    i = 0
    while i < len(words):
        # Try to combine two words for a name
        if i + 1 < len(words):
            full_names.append(f"{words[i]} {words[i+1]}")
            i += 2
        else:
            # If only one word remains, treat it as a single-word name
            full_names.append(words[i])
            i += 1
    return full_names

df['cast'] = df['cast'].apply(combine_first_last_names)
df['director'] = df['director'].apply(combine_first_last_names)

In [1540]:
df.head()

Unnamed: 0,budget,genres,homepage,original_language,popularity,production_companies,production_countries,release_date,revenue,runtime,vote_average,vote_count,cast,director
0,237000000,"[Action, Adventure, Fantasy, Science, Fiction]",1,en,150.437577,"Ingenious Film Partners, Twentieth Century Fox...","United States of America, United Kingdom",2009.0,2787965087,162.0,7.2,11800,"[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,300000000,"[Adventure, Fantasy, Action]",1,en,139.082615,"Walt Disney Pictures, Jerry Bruckheimer Films,...",United States of America,2007.0,961000000,169.0,6.9,4500,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,245000000,"[Action, Adventure, Crime]",1,en,107.376788,"Columbia Pictures, Danjaq, B24","United Kingdom, United States of America",2015.0,880674609,148.0,6.3,4466,"[Daniel Craig, Christoph Waltz, L\u00e9a Seydo...",[Sam Mendes]
3,250000000,"[Action, Crime, Drama, Thriller]",1,en,112.31295,"Legendary Pictures, Warner Bros., DC Entertain...",United States of America,2012.0,1084939099,165.0,7.6,9106,"[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan]
4,260000000,"[Action, Adventure, Science, Fiction]",1,en,43.926995,Walt Disney Pictures,United States of America,2012.0,284139100,132.0,6.1,2124,"[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton]


### **Number of Genres**

In [1541]:
unique_count = df['genres'].explode().nunique()
print("Number of unique genres:", unique_count)

Number of unique genres: 22


In [1542]:
unique_genres = df['genres'].explode().unique()
print(unique_genres)

['Action' 'Adventure' 'Fantasy' 'Science' 'Fiction' 'Crime' 'Drama'
 'Thriller' 'Animation' 'Family' 'Western' 'Comedy' 'Romance' 'Horror'
 'Mystery' 'History' 'War' 'Music' 'Documentary' 'Foreign' 'TV' 'Movie'
 nan]


In [1543]:
genre_counts = df['genres'].explode().value_counts()
print(genre_counts)

genres
Drama          2297
Comedy         1722
Thriller       1259
Action         1153
Romance         890
Adventure       790
Crime           696
Science         530
Fiction         530
Horror          519
Family          510
Fantasy         418
Mystery         347
Animation       234
History         197
Music           183
War             142
Documentary     110
Western          80
Foreign          34
TV                8
Movie             8
Name: count, dtype: int64


In [1544]:
low_count_genres = genre_counts[genre_counts < 100].index.tolist()

# Filter the genres list for each row, removing low-count genres
df['genres'] = df['genres'].apply(
    lambda x: [genre for genre in x if genre not in low_count_genres] if isinstance(x, list) else []
)

print(f"Genres with counts less than 100 removed from 'genres' column: {low_count_genres}")

Genres with counts less than 100 removed from 'genres' column: ['Western', 'Foreign', 'TV', 'Movie']


In [1545]:
df['genres'] = df['genres'].apply(lambda x: [g.strip() for g in x] if isinstance(x, list) else [])

df = df.join(
    df['genres']
    .explode()
    .str.get_dummies()
    .groupby(level=0)
    .max()
)

df = df.drop(columns=['genres'])

In [1546]:
df.head()

Unnamed: 0,budget,homepage,original_language,popularity,production_companies,production_countries,release_date,revenue,runtime,vote_average,...,Fantasy,Fiction,History,Horror,Music,Mystery,Romance,Science,Thriller,War
0,237000000,1,en,150.437577,"Ingenious Film Partners, Twentieth Century Fox...","United States of America, United Kingdom",2009.0,2787965087,162.0,7.2,...,1,1,0,0,0,0,0,1,0,0
1,300000000,1,en,139.082615,"Walt Disney Pictures, Jerry Bruckheimer Films,...",United States of America,2007.0,961000000,169.0,6.9,...,1,0,0,0,0,0,0,0,0,0
2,245000000,1,en,107.376788,"Columbia Pictures, Danjaq, B24","United Kingdom, United States of America",2015.0,880674609,148.0,6.3,...,0,0,0,0,0,0,0,0,0,0
3,250000000,1,en,112.31295,"Legendary Pictures, Warner Bros., DC Entertain...",United States of America,2012.0,1084939099,165.0,7.6,...,0,0,0,0,0,0,0,0,1,0
4,260000000,1,en,43.926995,Walt Disney Pictures,United States of America,2012.0,284139100,132.0,6.1,...,0,1,0,0,0,0,0,1,0,0


### **Number of Language**

In [1547]:
unique_count = df['original_language'].nunique()
print("Number of unique values:", unique_count)

Number of unique values: 37


In [1548]:
unique_values = df['original_language'].unique()
print(unique_values)


['en' 'ja' 'fr' 'zh' 'es' 'de' 'hi' 'ru' 'ko' 'te' 'cn' 'it' 'nl' 'ta'
 'sv' 'th' 'da' 'xx' 'hu' 'cs' 'pt' 'is' 'tr' 'nb' 'af' 'pl' 'he' 'ar'
 'vi' 'ky' 'id' 'ro' 'fa' 'no' 'sl' 'ps' 'el']


In [1549]:
value_counts = df['original_language'].value_counts()
print(value_counts)

original_language
en    4505
fr      70
es      32
zh      27
de      27
hi      19
ja      16
it      14
cn      12
ko      11
ru      11
pt       9
da       7
sv       5
nl       4
fa       4
th       3
he       3
id       2
cs       2
ta       2
ro       2
ar       2
te       1
hu       1
xx       1
af       1
is       1
tr       1
vi       1
pl       1
nb       1
ky       1
no       1
sl       1
ps       1
el       1
Name: count, dtype: int64


In [1550]:
lang_counts = df['original_language'].value_counts()

low_count_langs = lang_counts[lang_counts < 20].index.tolist()

df['original_language'] = df['original_language'].apply(
    lambda x: x if x not in low_count_langs else None
)

print(f"Languages with counts less than 20 removed from 'original_language' column: {low_count_langs}")

Languages with counts less than 20 removed from 'original_language' column: ['hi', 'ja', 'it', 'cn', 'ko', 'ru', 'pt', 'da', 'sv', 'nl', 'fa', 'th', 'he', 'id', 'cs', 'ta', 'ro', 'ar', 'te', 'hu', 'xx', 'af', 'is', 'tr', 'vi', 'pl', 'nb', 'ky', 'no', 'sl', 'ps', 'el']


In [1551]:
df = pd.get_dummies(df, columns=['original_language'], prefix='lang', dtype=int)

In [1552]:
df.head()

Unnamed: 0,budget,homepage,popularity,production_companies,production_countries,release_date,revenue,runtime,vote_average,vote_count,...,Mystery,Romance,Science,Thriller,War,lang_de,lang_en,lang_es,lang_fr,lang_zh
0,237000000,1,150.437577,"Ingenious Film Partners, Twentieth Century Fox...","United States of America, United Kingdom",2009.0,2787965087,162.0,7.2,11800,...,0,0,1,0,0,0,1,0,0,0
1,300000000,1,139.082615,"Walt Disney Pictures, Jerry Bruckheimer Films,...",United States of America,2007.0,961000000,169.0,6.9,4500,...,0,0,0,0,0,0,1,0,0,0
2,245000000,1,107.376788,"Columbia Pictures, Danjaq, B24","United Kingdom, United States of America",2015.0,880674609,148.0,6.3,4466,...,0,0,0,0,0,0,1,0,0,0
3,250000000,1,112.31295,"Legendary Pictures, Warner Bros., DC Entertain...",United States of America,2012.0,1084939099,165.0,7.6,9106,...,0,0,0,1,0,0,1,0,0,0
4,260000000,1,43.926995,Walt Disney Pictures,United States of America,2012.0,284139100,132.0,6.1,2124,...,0,0,1,0,0,0,1,0,0,0


### **Number of Production Company**

In [1553]:
unique_count = df['production_companies'].str.split(', ').explode().nunique()
print("Number of unique countries:", unique_count)

Number of unique countries: 5026


In [1554]:
unique_values = df['production_companies'].str.split(', ').explode().unique()
print(unique_values)

['Ingenious Film Partners' 'Twentieth Century Fox Film Corporation'
 'Dune Entertainment' ... 'Front Street Pictures'
 'rusty bear entertainment' 'lucky crow films']


In [1555]:
value_counts = df['production_companies'].str.split(', ').explode().value_counts()
print(value_counts)

production_companies
                                          351
Warner Bros.                              319
Universal Pictures                        311
Paramount Pictures                        285
Twentieth Century Fox Film Corporation    222
                                         ... 
Pitchblack Pictures Inc.                    1
Event Film Distribution                     1
Amok Productions                            1
lucky crow films                            1
Grindfest                                   1
Name: count, Length: 5026, dtype: int64


In [1556]:
company_counts = df['production_companies'].explode().value_counts()

low_count_companies = company_counts[company_counts < 100].index.tolist()

df['production_companies'] = df['production_companies'].apply(
    lambda x: [company for company in x if company not in low_count_companies] if isinstance(x, list) else []
)

print(f"Production companies with counts less than 100 removed from 'production_companies' column: {low_count_companies}")


Production companies with counts less than 20 removed from 'production_companies' column: ['Paramount Pictures', 'Universal Pictures', 'New Line Cinema', 'Columbia Pictures', 'Metro-Goldwyn-Mayer (MGM)', 'Twentieth Century Fox Film Corporation', 'Warner Bros.', 'Walt Disney Pictures', 'Touchstone Pictures', 'Dimension Films', 'Miramax Films', 'Columbia Pictures Corporation', 'DreamWorks Animation', 'United Artists', 'Walt Disney Pictures, Pixar Animation Studios', 'Fox 2000 Pictures', 'Walt Disney Pictures, Walt Disney Feature Animation', 'Fox Searchlight Pictures', 'Imagine Entertainment, Universal Pictures', 'Lions Gate Films', 'Blue Sky Studios, Twentieth Century Fox Animation', 'Marvel Studios', 'Hollywood Pictures, Cinergi Pictures Entertainment', 'Eon Productions', 'United Artists, Eon Productions, Danjaq', 'DreamWorks SKG', 'TriStar Pictures', 'Orion Pictures', 'Walt Disney Productions', 'DreamWorks SKG, Pacific Data Images (PDI), DreamWorks Animation', 'Universal Pictures, Ambl

In [1557]:
df.head()

Unnamed: 0,budget,homepage,popularity,production_companies,production_countries,release_date,revenue,runtime,vote_average,vote_count,...,Mystery,Romance,Science,Thriller,War,lang_de,lang_en,lang_es,lang_fr,lang_zh
0,237000000,1,150.437577,[],"United States of America, United Kingdom",2009.0,2787965087,162.0,7.2,11800,...,0,0,1,0,0,0,1,0,0,0
1,300000000,1,139.082615,[],United States of America,2007.0,961000000,169.0,6.9,4500,...,0,0,0,0,0,0,1,0,0,0
2,245000000,1,107.376788,[],"United Kingdom, United States of America",2015.0,880674609,148.0,6.3,4466,...,0,0,0,0,0,0,1,0,0,0
3,250000000,1,112.31295,[],United States of America,2012.0,1084939099,165.0,7.6,9106,...,0,0,0,1,0,0,1,0,0,0
4,260000000,1,43.926995,[],United States of America,2012.0,284139100,132.0,6.1,2124,...,0,0,1,0,0,0,1,0,0,0


### **Number of Production Country**

In [1558]:
unique_count = df['production_countries'].str.split(', ').explode().nunique()
print("Number of unique countries:", unique_count)

Number of unique countries: 89


In [1559]:
unique_values = df['production_countries'].str.split(', ').explode().unique()
print(unique_values)

['United States of America' 'United Kingdom' 'Jamaica' 'Bahamas'
 'Dominica' 'Czech Republic' 'Poland' 'Slovenia' 'New Zealand' 'Germany'
 'China' 'Canada' 'Italy' 'Japan' 'Malta' 'Australia' 'France' 'Belgium'
 'India' 'Netherlands' 'Spain' 'United Arab Emirates' 'Hong Kong' 'Taiwan'
 'Ireland' 'Morocco' '' 'Hungary' 'Singapore' 'Norway' 'Sweden'
 'South Africa' 'Russia' 'Romania' 'Mexico' 'Monaco' 'Switzerland'
 'Pakistan' 'Malaysia' 'Finland' 'Iceland' 'Denmark' 'Tunisia'
 'Philippines' 'Bulgaria' 'South Korea' 'Brazil' 'Peru' 'Luxembourg'
 'Bosnia and Herzegovina' 'Kazakhstan' 'Portugal' 'Aruba'
 'Libyan Arab Jamahiriya' 'Serbia' 'Ukraine' 'Chile' 'Argentina' 'Panama'
 'Austria' 'Greece' 'Lithuania' 'Cambodia' 'Thailand' 'Slovakia' 'Israel'
 'Fiji' 'Serbia and Montenegro' 'Turkey' 'Nigeria' 'Cyprus' 'Jordan'
 'Bolivia' 'Ecuador' 'Colombia' 'Egypt' 'Bhutan' 'Lebanon'
 'Kyrgyz Republic' 'Algeria' 'Indonesia' 'Guyana' 'Iran' 'Guadaloupe'
 'Afghanistan' 'Angola' 'Dominican Republic' 'C

In [1560]:
value_counts = df['production_countries'].str.split(', ').explode().value_counts()
print(value_counts)

production_countries
United States of America    3956
United Kingdom               636
Germany                      324
France                       306
Canada                       261
                            ... 
Afghanistan                    1
Angola                         1
Dominican Republic             1
Cameroon                       1
Kenya                          1
Name: count, Length: 89, dtype: int64


### **Number of Cast**

In [1561]:
unique_count = df['cast'].explode().nunique()
print("Number of unique cast:", unique_count)

Number of unique cast: 12221


In [1562]:
unique_cast = df['cast'].explode().unique()
print(unique_cast)

['Sam Worthington' 'Zoe Saldana' 'Sigourney Weaver' ... 'Alan Ruck'
 'Zhu Shimao' 'Brian Herzlinger']


In [1563]:
genre_counts = df['cast'].explode().value_counts()
print(genre_counts)

cast
Robert De             51
Samuel L.             41
Bruce Willis          38
Matt Damon            35
Morgan Freeman        34
                      ..
Mendes                 1
Duncan Eva             1
Barry Newman           1
Emmanuelle Vaugier     1
D'Arcy                 1
Name: count, Length: 12221, dtype: int64


### **Number of Director**

In [1564]:
unique_count = df['director'].explode().nunique()
print("Number of unique director:", unique_count)

Number of unique director: 2523


In [1565]:
unique_director = df['director'].explode().unique()
print(unique_director)

['James Cameron' 'Gore Verbinski' 'Sam Mendes' ... 'Scott Smith'
 'Daniel Hsia' 'Brian Herzlinger']


In [1566]:
genre_counts = df['director'].explode().value_counts()
print(genre_counts)

director
Steven Spielberg    27
Woody Allen         21
Martin Scorsese     20
Clint Eastwood      20
Anderson            18
                    ..
Jim Chuchu           1
Marcus Nispel        1
Joseph Mazzella      1
Eric Eason           1
James Bidgood        1
Name: count, Length: 2523, dtype: int64


### **Transform all the data to numerical value**

In [1567]:
columns_to_drop = ['production_companies', 'production_countries', 'cast', 'director']
df.drop(columns=columns_to_drop, inplace=True)

In [1568]:
df.head()

Unnamed: 0,budget,homepage,popularity,release_date,revenue,runtime,vote_average,vote_count,Action,Adventure,...,Mystery,Romance,Science,Thriller,War,lang_de,lang_en,lang_es,lang_fr,lang_zh
0,237000000,1,150.437577,2009.0,2787965087,162.0,7.2,11800,1,1,...,0,0,1,0,0,0,1,0,0,0
1,300000000,1,139.082615,2007.0,961000000,169.0,6.9,4500,1,1,...,0,0,0,0,0,0,1,0,0,0
2,245000000,1,107.376788,2015.0,880674609,148.0,6.3,4466,1,1,...,0,0,0,0,0,0,1,0,0,0
3,250000000,1,112.31295,2012.0,1084939099,165.0,7.6,9106,1,0,...,0,0,0,1,0,0,1,0,0,0
4,260000000,1,43.926995,2012.0,284139100,132.0,6.1,2124,1,1,...,0,0,1,0,0,0,1,0,0,0


In [1569]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 31 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   budget        4803 non-null   int64  
 1   homepage      4803 non-null   int64  
 2   popularity    4803 non-null   float64
 3   release_date  4802 non-null   float64
 4   revenue       4803 non-null   int64  
 5   runtime       4801 non-null   float64
 6   vote_average  4803 non-null   float64
 7   vote_count    4803 non-null   int64  
 8   Action        4803 non-null   int64  
 9   Adventure     4803 non-null   int64  
 10  Animation     4803 non-null   int64  
 11  Comedy        4803 non-null   int64  
 12  Crime         4803 non-null   int64  
 13  Documentary   4803 non-null   int64  
 14  Drama         4803 non-null   int64  
 15  Family        4803 non-null   int64  
 16  Fantasy       4803 non-null   int64  
 17  Fiction       4803 non-null   int64  
 18  History       4803 non-null 

In [1570]:
df.isnull().sum()

Unnamed: 0,0
budget,0
homepage,0
popularity,0
release_date,1
revenue,0
runtime,2
vote_average,0
vote_count,0
Action,0
Adventure,0


In [1571]:
df.dropna(inplace=True)

In [1572]:
df.isnull().sum()

Unnamed: 0,0
budget,0
homepage,0
popularity,0
release_date,0
revenue,0
runtime,0
vote_average,0
vote_count,0
Action,0
Adventure,0


In [1573]:
df.duplicated()
print(f"Number of duplicated rows: {df.duplicated().sum()}")

Number of duplicated rows: 0


In [1574]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4800 entries, 0 to 4802
Data columns (total 31 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   budget        4800 non-null   int64  
 1   homepage      4800 non-null   int64  
 2   popularity    4800 non-null   float64
 3   release_date  4800 non-null   float64
 4   revenue       4800 non-null   int64  
 5   runtime       4800 non-null   float64
 6   vote_average  4800 non-null   float64
 7   vote_count    4800 non-null   int64  
 8   Action        4800 non-null   int64  
 9   Adventure     4800 non-null   int64  
 10  Animation     4800 non-null   int64  
 11  Comedy        4800 non-null   int64  
 12  Crime         4800 non-null   int64  
 13  Documentary   4800 non-null   int64  
 14  Drama         4800 non-null   int64  
 15  Family        4800 non-null   int64  
 16  Fantasy       4800 non-null   int64  
 17  Fiction       4800 non-null   int64  
 18  History       4800 non-null   int

In [1575]:
df.head()

Unnamed: 0,budget,homepage,popularity,release_date,revenue,runtime,vote_average,vote_count,Action,Adventure,...,Mystery,Romance,Science,Thriller,War,lang_de,lang_en,lang_es,lang_fr,lang_zh
0,237000000,1,150.437577,2009.0,2787965087,162.0,7.2,11800,1,1,...,0,0,1,0,0,0,1,0,0,0
1,300000000,1,139.082615,2007.0,961000000,169.0,6.9,4500,1,1,...,0,0,0,0,0,0,1,0,0,0
2,245000000,1,107.376788,2015.0,880674609,148.0,6.3,4466,1,1,...,0,0,0,0,0,0,1,0,0,0
3,250000000,1,112.31295,2012.0,1084939099,165.0,7.6,9106,1,0,...,0,0,0,1,0,0,1,0,0,0
4,260000000,1,43.926995,2012.0,284139100,132.0,6.1,2124,1,1,...,0,0,1,0,0,0,1,0,0,0
