In [24]:
!pip install dask nltk graphviz



## Import Python Libraries for Big Data

In [1]:
# Standard library
import re

# Dask components
import dask.dataframe as dd
import dask.array as da
from dask import delayed, compute

import pandas as pd
import numpy as np

# Import natural language library
import nltk
from nltk.corpus import wordnet
# Download the WordNet corpus used by the synonym lookup later in the notebook
nltk.download('wordnet')

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

[nltk_data] Downloading package wordnet to /home/ryan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Import Our Data

In [2]:
# Lazily read the CSV as a Dask DataFrame, split into ~1MB partitions.
# `sample` is the number of bytes Dask inspects to infer the column dtypes.
movies_df = dd.read_csv('movies.csv', blocksize='1MB', sample=5000)

# Materialise the whole dataset as an in-memory pandas DataFrame for comparison
movies_pd = movies_df.compute()

# Show the lazy frame's structure (column names, dtypes, partition count)
movies_df

Unnamed: 0_level_0,Unnamed: 0,id,title,genres,original_language,overview,popularity,production_companies,release_date,budget,revenue,runtime,status,tagline,vote_average,vote_count,credits,keywords,poster_path,backdrop_path,recommendations,release_date_new
npartitions=5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
,int64,int64,string,string,string,string,float64,string,string,float64,float64,float64,string,string,float64,float64,string,string,string,string,string,string
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...




In [10]:
# Sanity check: movies_df is the lazy Dask frame, movies_pd is the computed pandas frame
print(type(movies_df), type(movies_pd))

<class 'dask.dataframe.core.DataFrame'> <class 'pandas.core.frame.DataFrame'>


## Missing Values Calculation

In [3]:
# Define the per-column count of missing values as a lazy Dask task (nothing runs yet)
task_missing = movies_df.isnull().sum()

# Compute the task graph
task_missing.compute()

Unnamed: 0                 0
id                         0
title                      0
genres                     0
original_language          0
overview                 489
popularity                 0
production_companies    2197
release_date             266
budget                     0
revenue                    0
runtime                   81
status                     0
tagline                 5129
vote_average               0
vote_count                 0
credits                  324
keywords                4010
poster_path              785
backdrop_path           3385
recommendations         6623
release_date_new         266
dtype: int64

In [32]:
# Render the task graph behind the missing-value count as an interactive Cytoscape widget
task_missing.visualize(engine = "cytoscape")

CytoscapeWidget(cytoscape_layout={'name': 'dagre', 'rankDir': 'BT', 'nodeSep': 10, 'edgeSep': 10, 'spacingFact…

## Creating Datetime Features from Strings

In [4]:
# Parse the release_date strings into datetimes.
# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
movies_df['release_date_dt'] = dd.to_datetime(
    movies_df['release_date'],
    format='%Y-%m-%d',
    exact=False,
    errors='coerce',
)

## Engineer a Year, Month, and Day of the Week String features

In [11]:
# Derive string-typed calendar features from the parsed release date
movies_df = movies_df.assign(
    year=movies_df['release_date_dt'].dt.strftime('%Y'),         # four-digit year
    month=movies_df['release_date_dt'].dt.strftime('%b'),        # abbreviated month name
    day_of_week=movies_df['release_date_dt'].dt.strftime('%a'),  # abbreviated day of the week
)

In [13]:
# Peek at the first two rows to check the new date-derived columns
movies_df.head(2)

Unnamed: 0.1,Unnamed: 0,id,title,genres,original_language,overview,popularity,production_companies,release_date,budget,...,credits,keywords,poster_path,backdrop_path,recommendations,release_date_new,release_date_dt,year,month,day_of_week
0,12,594767,Shazam! Fury of the Gods,action-comedy-fantasy-adventure,en,Billy Batson and his foster siblings who trans...,2010.98,New Line Cinema-The Safran Company-DC Films,2023-03-15,125000000.0,...,Zachary Levi-Asher Angel-Jack Dylan Grazer-Rac...,superhero-end of the world-super power-aftercr...,/A3ZbZsmsvNGdprRi2lKgGEeVLEH.jpg,/nDxJJyA5giRhXx96q1sWbOUjMBI.jpg,868759-994751-700391-948713-502356-938992-7660...,2023-03-15,2023-03-15,2023,Mar,Wed
1,18,615656,Meg 2: The Trench,action-science fiction-horror-comedy,en,An exploratory dive into the deepest depths of...,1321.17,Apelles Entertainment-Warner Bros. Pictures-di...,2023-08-02,129000000.0,...,Jason Statham-Wu Jing-Shuya Sophia Cai-Sergio ...,based on novel or book-sequel-kaiju,/4m1Au3YkjqsxF8iwQy0fPYSxE0h.jpg,/Aukfa8dk6B5OxuelbaPBOJYXaBI.jpg,447277-872585-346698-1083862-496450-457332-114...,2023-08-02,2023-08-02,2023,Aug,Wed


## Filter by Datetime Intervals & Sort Our Data

In [22]:
%%time

# Filter by time index between 2016 and 2024 (time interval)
filtered_df = movies_df.loc[
    (movies_df['release_date_dt'] > '2015-12-31') & (movies_df['release_date_dt'] < '2024'), :
].compute().sort_values(['budget', 'release_date_dt'], ascending=False)  # persist to memory and THEN sort the values by budget & date


print(f'Number of matching observations: {len(filtered_df)}')
filtered_df.head(5)

Number of matching observations: 2539
CPU times: user 333 ms, sys: 63.5 ms, total: 397 ms
Wall time: 349 ms


Unnamed: 0.1,Unnamed: 0,id,title,genres,original_language,overview,popularity,production_companies,release_date,budget,...,credits,keywords,poster_path,backdrop_path,recommendations,release_date_new,release_date_dt,year,month,day_of_week
6,77,616037,Thor: Love and Thunder,fantasy-action-comedy,en,After his retirement is interrupted by Gorr th...,394.087,Marvel Studios-Kevin Feige Productions,2022-07-06,250000000.0,...,Chris Hemsworth-Natalie Portman-Christian Bale...,ex-girlfriend-hero-greek mythology-sequel-supe...,/pIkRyD18kl4FhoCNQuWxWu5cBLM.jpg,/jsoz1HlxczSuTx0mDl2h0lxy36l.jpg,539681-610150-985939-629176-2-45920-438148-782...,2022-07-06,2022-07-06,2022,Jul,Wed
27,335,508943,Luca,animation-comedy-family-fantasy-action-adventure,en,Luca and his best friend Alberto experience an...,132.456,Pixar-Walt Disney Pictures,2021-06-17,200000000.0,...,Jacob Tremblay-Jack Dylan Grazer-Emma Berman-S...,italy-monster-friendship-friends-coming of age...,/8tABCBpzu3mZbzMB3sRzMEHEvJi.jpg,/620hnMVLu6RSZW6a5rwO8gqpt0t.jpg,527774-77742-79233-337404-800409-497698-400216...,2021-06-17,2021-06-17,2021,Jun,Thu
83,1491,508439,Onward,family-animation-adventure-comedy-fantasy-action,en,In a suburban fantasy world two teenage elf br...,47.176,Walt Disney Pictures-Pixar,2020-02-29,200000000.0,...,Tom Holland-Chris Pratt-Julia Louis-Dreyfus-Oc...,elves-magic-dead father-dead parent-fantasy wo...,/f4aul3FyD3jv3v4bul1IrkWZvzq.jpg,/xFxk4vnirOtUxpOEWgA1MCRfy6J.jpg,726166-662018-872325-611059-1038789-579955-454...,2020-02-29,2020-02-29,2020,Feb,Sat
28,341,384018,Fast & Furious Presents: Hobbs & Shaw,action-adventure-comedy,en,Ever since US Diplomatic Security Service Agen...,130.097,Universal Pictures-Chris Morgan Productions-Se...,2019-08-01,200000000.0,...,Dwayne Johnson-Jason Statham-Idris Elba-Vaness...,london england-biological weapon-secret organi...,/qRyy2UmjC5ur9bDi3kpNNRCc5nc.jpg,/hpgda6P9GutvdkDX5MUJ92QG9aj.jpg,337339-429617-458156-423204-420818-168259-9615...,2019-08-01,2019-08-01,2019,Aug,Thu
25,296,436969,The Suicide Squad,action-comedy-adventure,en,Supervillains Harley Quinn Bloodsport Peacemak...,144.638,DC Films-Atlas Entertainment-The Safran Compan...,2021-07-28,185000000.0,...,Margot Robbie-Idris Elba-John Cena-Joel Kinnam...,monster-anti hero-secret mission-superhero-bas...,/kb4s0ML0iVZlG6wAKbbs9NAm6X.jpg,/jlGmlFOcfo8n5tURmhC7YVd4Iyy.jpg,451048-497698-385128-550988-193414-566525-7913...,2021-07-28,2021-07-28,2021,Jul,Wed


In [24]:
# Display the lazy Dask frame's structure, which now includes the engineered date columns
movies_df

Unnamed: 0_level_0,Unnamed: 0,id,title,genres,original_language,overview,popularity,production_companies,release_date,budget,revenue,runtime,status,tagline,vote_average,vote_count,credits,keywords,poster_path,backdrop_path,recommendations,release_date_new,release_date_dt,year,month,day_of_week
npartitions=5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
,int64,int64,string,string,string,string,float64,string,string,float64,float64,float64,string,string,float64,float64,string,string,string,string,string,string,datetime64[ns],string,string,string
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


## Aggregate Total Movie Budget by Year

In [31]:
%%time

# Aggregation by year to find the total sum of movie budgets, computed it, and then we sorted the Pandas Series
budget_summation = movies_df.groupby('year').agg(
    {
        'budget': 'sum'
    }
).compute().sort_values(by = ['budget'] , ascending = False)

budget_summation
# budget_summation.visualize(engine='cytoscape')

CPU times: user 241 ms, sys: 93.1 ms, total: 334 ms
Wall time: 305 ms


Unnamed: 0_level_0,budget
year,Unnamed: 1_level_1
2013,1.141693e+09
2016,1.126214e+09
2022,1.108168e+09
2017,1.098265e+09
2010,1.097269e+09
...,...
1939,0.000000e+00
1932,0.000000e+00
1945,0.000000e+00
1927,0.000000e+00


### Text Mining for Movie Descriptions

In [42]:
def find_synonyms(word_list):
    """Collect the unique WordNet synonyms for every word in ``word_list``.

    Each word is looked up with ``wordnet.synsets``; the name of every lemma
    of every returned synset is gathered, with underscores replaced by spaces.

    Parameters
    ----------
    word_list : iterable of str
        Words to look up. Any iterable is accepted (list, tuple, generator, ...).

    Returns
    -------
    list of str
        The de-duplicated synonyms across all input words, sorted
        alphabetically so the result is reproducible between runs.
        Empty when ``word_list`` is empty or no word has a synset.
    """
    unique_synonyms = {
        # WordNet joins multi-word lemma names with underscores
        str(lemma.name()).replace('_', ' ')
        for word in word_list
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }
    # A set has no stable order; sort so downstream cells are deterministic
    return sorted(unique_synonyms)

### List of Synonyms

In [49]:
# Seed words describing "feel-good" themes
word_list = ['happy', 'joy', 'good', 'nice', 'comedy', 'kid']

# Expand the seed words into their WordNet synonyms
synonym_list = find_synonyms(word_list)

## Search for Matching Movie Genres

In [53]:
%%time

# Converting strings to all lower-case
movies_df['genres'] = movies_df['genres'].str.lower()

# Concatenate our list of synonyms into one string....seperate by '|'
synonym_string ='|'.join(synonym_list)
synonym_string

# String-based matching search
feel_good_df = movies_df.loc[movies_df['genres'].str.contains(synonym_string), :]

filtered_movies_df = feel_good_df.compute().sort_values(by=['popularity'], ascending=False)
filtered_movies_df

CPU times: user 288 ms, sys: 114 ms, total: 402 ms
Wall time: 359 ms


Unnamed: 0.1,Unnamed: 0,id,title,genres,original_language,overview,popularity,production_companies,release_date,budget,...,credits,keywords,poster_path,backdrop_path,recommendations,release_date_new,release_date_dt,year,month,day_of_week
0,12,594767,Shazam! Fury of the Gods,action-comedy-fantasy-adventure,en,Billy Batson and his foster siblings who trans...,2010.980,New Line Cinema-The Safran Company-DC Films,2023-03-15,125000000.0,...,Zachary Levi-Asher Angel-Jack Dylan Grazer-Rac...,superhero-end of the world-super power-aftercr...,/A3ZbZsmsvNGdprRi2lKgGEeVLEH.jpg,/nDxJJyA5giRhXx96q1sWbOUjMBI.jpg,868759-994751-700391-948713-502356-938992-7660...,2023-03-15,2023-03-15,2023,Mar,Wed
1,18,615656,Meg 2: The Trench,action-science fiction-horror-comedy,en,An exploratory dive into the deepest depths of...,1321.170,Apelles Entertainment-Warner Bros. Pictures-di...,2023-08-02,129000000.0,...,Jason Statham-Wu Jing-Shuya Sophia Cai-Sergio ...,based on novel or book-sequel-kaiju,/4m1Au3YkjqsxF8iwQy0fPYSxE0h.jpg,/Aukfa8dk6B5OxuelbaPBOJYXaBI.jpg,447277-872585-346698-1083862-496450-457332-114...,2023-08-02,2023-08-02,2023,Aug,Wed
2,19,868759,Ghosted,romance-action-comedy,en,Salt-of-the-earth Cole falls head over heels f...,1214.867,Skydance Media-Apple Studios,2023-04-18,0.0,...,Chris Evans-Ana de Armas-Adrien Brody-Mike Moh...,secret agent,/liLN69YgoovHVgmlHJ876PKi5Yi.jpg,/b9UCfDzwiWw7mIFsIQR9ZJUeh7q.jpg,640146-726759,2023-04-18,2023-04-18,2023,Apr,Tue
3,22,758009,Shotgun Wedding,action-romance-comedy,en,Darcy and Tom gather their families for the ul...,1043.225,Lionsgate-Mandeville Films-Nuyorican Productio...,2022-12-28,0.0,...,Jennifer Lopez-Josh Duhamel-Jennifer Coolidge-...,wedding-hostage situation,/t79ozwWnwekO0ADIzsFP1E5SkvR.jpg,/zGoZB4CboMzY1z4G3nU6BWnMDB2.jpg,702432-1064489-1013870-953734-805307-753965-84...,2022-12-28,2022-12-28,2022,Dec,Wed
4,48,587092,Unicorn Wars,action-animation-comedy-fantasy-horror-war,es,An army of bear cubs train and indoctrinate yo...,535.524,UniKo-Schmuby Productions-Autour de Minuit-Pan...,2022-10-21,0.0,...,Jon Goiri-Jaione Insausti-Ramón Barea-Txema Re...,gore-bear-unicorn-war-animation,/8KBj11zBaRdhoeq1q9jcAwKmDSk.jpg,/rbUPJoJJquPbX1AiV6GzOqcmJME.jpg,852046-601796,2022-10-21,2022-10-21,2022,Oct,Fri
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
978,90620,789367,Spy Catcher,comedy-action-adventure,en,Natasha finds out that her handler has gone ba...,0.600,,,0.0,...,Elan Andreassen-Andii Zhebrovskyi-Abbey St. Br...,,,,,,NaT,,,
977,90590,789096,Mancoro,action-fantasy-animation-comedy,id,Mas Bejo who was too much in arrears was taken...,0.600,Javora Film,2020-12-05,0.0,...,Gugun Arief-Wah Rahayu,,/hPyeLIcKPDQya0R84z0HelQJ1p7.jpg,,,2020-12-05,2020-12-05,2020,Dec,Sat
976,90153,789339,Mad Martha: Church Warrior,action-comedy-horror,en,A B-movie horror-comedy set in a post-apocalyp...,0.600,,2017-07-08,0.0,...,Ellen Williams-Andrew Hunsicker-Kerwin Gonzale...,mad martha-church warrior,/lz1VeCYWSugjDxzk978o7LM7NAr.jpg,,,2017-07-08,2017-07-08,2017,Jul,Sat
975,89662,757541,Uchuu Sentai Kyuranger: Final Stage,fantasy-science fiction-action-comedy,ja,Annual Super Sentai stage show featuring the c...,0.600,Toei Company,2017-12-06,0.0,...,Takumi Kizu-Yousuke Kishi-Kazuya Nakai-Yuki On...,ranger-stage show-superhero-space-squadron-sup...,/vPELafsbRkCNMRkwulggM0HIc38.jpg,,,2017-12-06,2017-12-06,2017,Dec,Wed
