# Movies Investment Prediction using a LRM

Source: Box Office Mojo 
https://www.boxofficemojo.com/

##### Web Scrapping

Let's import our libraries

In [2]:
#scrapping and cleaning process

from bs4 import BeautifulSoup as BS
import requests
from requests import get
from IPython.core.display import display, HTML
import re
import pandas as pd
import numpy as np
from time import sleep
from random import randint
import dateutil.parser

#visualization

import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_formats = ['svg']  # or svg
%matplotlib inline


## Create functions to grab values

In [791]:
def get_movie_value(soup, field):
    obj = soup.find(text=re.compile(field))
    if not obj:
        return None
    next_element = obj.findNext()
    if next_element:
        return next_element.text
    else:
        return None

In [792]:
# Datetime function to parse "datatime" strings into appropriate "Date" data type

def to_date(datestring):
    date = dateutil.parser.parse(datestring)
    return date

### Titles

In [25]:
#Let's start writing functions to get the values we want

def get_movie_title(soup):
    title_element = soup.find("title")
    if not title_element:
        return None

    title_text = title_element.text
    if "-" in title_text:
        title_parts = title_text.split("-")
        if title_parts.count == 2:
            return title_parts[0].strip()
        else:
            return "-".join(title_parts[0:-1]).strip()
    else:
        return title_text.strip()
 

### Runtime

In [26]:
def get_movie_runtime(soup):
    runtimestring = get_movie_value(soup, "Running")
    try:
        runtime = runtimestring.split()
        minutes = int(runtime[0]) * 60 + int(runtime[2])
        return minutes
    except:
        return None

### Widest Release (Theaters)

In [27]:
def get_movie_widest_release(soup):
    widest_release_value = get_movie_value(soup, "Widest Release")
    if not widest_release_value:
        return None
    widest_release_value = widest_release_value.replace(",", "").split(" ")[0]
    widest_release_value = widest_release_value.strip()
    return int(widest_release_value)

### Domestic Gross

In [28]:
def get_movie_domestic_gross(soup):
    moneystring = (soup.find(class_="mojo-performance-summary-table")
        .find_all("span", class_="money")[0].text)
    moneystring = moneystring.replace("$", "").replace(",", "")
    return int(moneystring)

### Budget

In [29]:
#def get_movie_budget(soup)

def get_movie_budget(soup):
    raw_budget = get_movie_value(soup,"Budget")
    try:
        budget = raw_budget.replace("$", "").replace(",", "")
        return int(budget)
    except:
        return None

### Worldwide Gross

In [30]:
def get_movie_worldwide_gross(soup):
    moneystring = (soup.find(class_="mojo-performance-summary-table")
        .find_all("span", class_="money")[-1].text)
    moneystring = moneystring.replace("$", "").replace(",", "")
    return int(moneystring)

### Rating

In [31]:
def get_movie_rating(soup):
    rating_value = get_movie_value(soup, "MPAA")
    if not rating_value:
        return None
    return rating_value.strip()

### Genres

In [32]:
def get_movie_genres(soup):
    genres_value = get_movie_value(soup, "Genres")
    if not genres_value:
        return None
    genres_value = genres_value.replace('\n    \n       ','')
    return genres_value.strip()

### Distributors

In [33]:
def get_movie_distributor(soup):
    distributor1 = get_movie_value(soup, "Distributor")
    distributor = distributor1.split('See')[0]
    if not distributor:
        return None
    return distributor.strip()

### Opening Gross

In [34]:
def get_movie_opening_gross(soup):
    element1 = soup.find(
        class_="a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile"
    )
    if not element1:
        return None
    element2 = element1.find("span", class_="money")
    if not element2:
        return None
    opening = element2.text.replace("$", "").replace(",", "")
    return int(opening)

### Release Date

In [35]:
def get_release_date(soup):
    date = get_movie_value(soup, "Release Date").split('-')[0].split('(')[0].strip()
    date = to_date(date)
    if not date:
        return None
    return date

### List of Features of all movies

In [36]:
"""def get_movie_data(url):
    response = requests.get(url)
    page = response.text
    soup = BS(page, "lxml")
    
    title = get_movie_title(soup)
    budget = get_movie_budget(soup)
    runtime = get_movie_runtime(soup)
    rating = get_movie_rating(soup)
    genres = get_movie_genres(soup)
    distributor1 = get_movie_distributor(soup)
    domestic = get_movie_domestic_gross(soup)
    worldwide = get_movie_worldwide_gross(soup)
    opening = get_movie_opening_gross(soup)
    widest_release = get_movie_widest_release(soup)
    release_date = get_release_date(soup)


    headers = ["title", "budget", "runtime", "rating", "genres", "distributor", "domestic_gross", "worldwide_gross", "opening_gross", "widest_release", "release_date"]
    movie_dict = dict(zip(headers, [title, budget, runtime, rating, genres, distributor1, domestic, worldwide, opening, widest_release, release_date]))
    
    return movie_dict"""


## Urls for all movies

In [37]:
"""def get_movie_urls():
    url = "https://www.boxofficemojo.com/year/2019/?ref_=bo_yl_table_3"
    url2 = "https://www.boxofficemojo.com/year/2018/?ref_=bo_yl_table_4"
    url3 = "https://www.boxofficemojo.com/year/2017/?ref_=bo_yl_table_5"
    url4 = "https://www.boxofficemojo.com/year/2016/?ref_=bo_yl_table_6"
    url5 = "https://www.boxofficemojo.com/year/2015/?ref_=bo_yl_table_7"

    url_list = [url, url2, url3, url4, url5]
    
    all_titles = []
    
    for url in url_list:
        page = requests.get(url)
        soup = BS(page.text, "lxml")
        titles = soup.find_all("td", class_="a-text-left mojo-field-type-release mojo-cell-wide")
        for title in titles:
            all_titles.append("https://www.boxofficemojo.com/" + title.select("a")[0].get("href"))
    return all_titles"""

In [38]:
"urls = get_movie_urls()"

In [39]:
"urls"

['https://www.boxofficemojo.com//release/rl3059975681/?ref_=bo_yld_table_1',
 'https://www.boxofficemojo.com//release/rl3321923073/?ref_=bo_yld_table_2',
 'https://www.boxofficemojo.com//release/rl3798500865/?ref_=bo_yld_table_3',
 'https://www.boxofficemojo.com//release/rl2424210945/?ref_=bo_yld_table_4',
 'https://www.boxofficemojo.com//release/rl3009644033/?ref_=bo_yld_table_5',
 'https://www.boxofficemojo.com//release/rl3305145857/?ref_=bo_yld_table_6',
 'https://www.boxofficemojo.com//release/rl3791750657/?ref_=bo_yld_table_7',
 'https://www.boxofficemojo.com//release/rl3246360065/?ref_=bo_yld_table_8',
 'https://www.boxofficemojo.com//release/rl252151297/?ref_=bo_yld_table_9',
 'https://www.boxofficemojo.com//release/rl1107461633/?ref_=bo_yld_table_10',
 'https://www.boxofficemojo.com//release/rl755467777/?ref_=bo_yld_table_11',
 'https://www.boxofficemojo.com//release/rl1711506945/?ref_=bo_yld_table_12',
 'https://www.boxofficemojo.com//release/rl2919400961/?ref_=bo_yld_table_13

### Counter I

In [612]:
"""#Let's run the data to see if we get any error

count = 1
for url in urls:
    print(f"{count}: {url}")
    movie_data = get_movie_data(url)
    print(movie_data)
    count = count + 1
#   sleep(randint(1, 2))"""

'#Let\'s run the data to see if we get any error\n\ncount = 1\nfor url in urls:\n    print(f"{count}: {url}")\n    movie_data = get_movie_data(url)\n    print(movie_data)\n    count = count + 1\n#   sleep(randint(1, 2))'

## Additional Scraping

### Cast and Crew

### Link for Top Movies

In [481]:
def get_movie_links():
    link1 = "https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=G"
    link2 = "https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=G&offset=200"
    link3 = "https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=G%2FPG"
    link4 = "https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=G%2FPG&offset=200"
    link5 = "https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=G%2FPG&offset=400"
    link6 = "https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=G%2FPG&offset=600"
    link7 = "https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=G%2FPG&offset=800"
    link8 = "https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=PG"
    link9 = "https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=PG&offset=200"
    link10 = "https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=PG&offset=400"
    link11 = "https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=PG&offset=600"
    link12 = "https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=PG&offset=800"
    link13 = "https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=PG-13"
    link14 = "https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=PG-13&offset=200"
    link15 = "https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=PG-13&offset=400"
    link16 = "https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=PG-13&offset=600"
    link17 = "https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=PG-13&offset=800"
    link18 = "https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=R"
    link19 = "https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=R&offset=200"
    link20 = "https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=R&offset=400"
    link21 = "https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=R&offset=600"
    link22 = "https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=R&offset=800"

    link_list = [link1, link2, link3, link4, link5, link6, link7, link8, link9, link10, link11, link12, link13, link14, link15, link16, link17, link18, link19, link20, link21, link22]
    
    all_titles = []
    
    for link in link_list:
        page = requests.get(link)
        soup = BS(page.text, "lxml")
        titles = soup.find_all("td", class_="a-text-left mojo-header-column mojo-truncate mojo-field-type-title")
        for title in titles:
            all_titles.append("https://www.boxofficemojo.com/" + title.select("a")[0].get("href"))
    return all_titles



In [482]:
links = get_movie_links()

In [793]:
print(len(links))

4364


In [794]:
links

['https://www.boxofficemojo.com//title/tt1979376/?ref_=bo_cso_table_1',
 'https://www.boxofficemojo.com//title/tt0110357/?ref_=bo_cso_table_2',
 'https://www.boxofficemojo.com//title/tt0435761/?ref_=bo_cso_table_3',
 'https://www.boxofficemojo.com//title/tt0266543/?ref_=bo_cso_table_4',
 'https://www.boxofficemojo.com//title/tt0198781/?ref_=bo_cso_table_5',
 'https://www.boxofficemojo.com//title/tt1453405/?ref_=bo_cso_table_6',
 'https://www.boxofficemojo.com//title/tt0120363/?ref_=bo_cso_table_7',
 'https://www.boxofficemojo.com//title/tt0317219/?ref_=bo_cso_table_8',
 'https://www.boxofficemojo.com//title/tt0910970/?ref_=bo_cso_table_9',
 'https://www.boxofficemojo.com//title/tt0114709/?ref_=bo_cso_table_10',
 'https://www.boxofficemojo.com//title/tt0101414/?ref_=bo_cso_table_11',
 'https://www.boxofficemojo.com//title/tt0103639/?ref_=bo_cso_table_12',
 'https://www.boxofficemojo.com//title/tt0382932/?ref_=bo_cso_table_13',
 'https://www.boxofficemojo.com//title/tt0031381/?ref_=bo_cs

### List of Features of top movies (Cast and Crew)

### Extracting Titles for Cast and Crew values

In [795]:
def get_movie_titles2(url):
    response = requests.get(url)
    page = response.text
    soup = BS(page, "lxml")
    
    titles = []

    title_text = (soup.find("h1", class_="a-size-extra-large").text[:-7])

    for title in title_text.split("\n"):
        titles.append(title)

    return(titles)

### Years

In [796]:
def get_movie_year2(url):
    response = requests.get(url)
    page = response.text
    soup = BS(page, "lxml")
    
    years = []

    year_text = (soup.find("h1", class_="a-size-extra-large").text[-5:-1])

    for year in year_text.split("\n"):
        years.append(year)

    return(years)

### Extracting Directors from principalCrew table

In [797]:
def get_movie_director2(url):
    response = requests.get(url)
    page = response.text
    soup = BS(page, "lxml")
    
    directors = []

    crew_string = (soup.find(id="principalCrew").find_all("tr"))

    for crew_member in crew_string:
        if 'Director' in crew_member.text:
            directors.append(crew_member.text.replace('Director','').replace('\n', ''))

    return(directors)


### Extracting Actors from principalCast table

In [798]:
def get_movie_actors2(url):
    response = requests.get(url)
    page = response.text
    soup = BS(page, "lxml")
    
    actors = []

    actors_string = (soup.find(id="principalCast").find_all("tr"))

    for actor in actors_string[1:]:
        stop_point = actor.text.find('\n')
        actors.append(actor.text[:stop_point])

    return(actors)

### New links for actors and directors table

In [799]:
to_append = 'credits/?ref_=bo_tt_tab#tabs'
new_links = [link[:link.find('?ref')] + to_append for link in links]

In [800]:
links[5]

'https://www.boxofficemojo.com//title/tt1453405/?ref_=bo_cso_table_6'

In [801]:
new_links[5]

'https://www.boxofficemojo.com//title/tt1453405/credits/?ref_=bo_tt_tab#tabs'

### new pipeline for directors and actors data

In [802]:
def get_movie_data2(url):
    
    title = get_movie_titles2(url)
    director = get_movie_director2(url)
    actors = get_movie_actors2(url)
    years = get_movie_year2(url)

    headers = ["title", "director", "actors", "years"]
    movie_dict = dict(zip(headers, [title, director, actors, years]))
    
    return movie_dict

### test new pipeline with new urls

In [803]:
new_links[10]

'https://www.boxofficemojo.com//title/tt0101414/credits/?ref_=bo_tt_tab#tabs'

In [804]:
get_movie_data2(new_links[10])

{'title': ['Beauty and the Beast'],
 'director': ['Gary Trousdale', 'Kirk Wise'],
 'actors': ["Paige O'Hara", 'Robby Benson', 'Jesse Corti', 'Rex Everhart'],
 'years': ['1991']}

### test against all new urls

In [805]:
for link in new_links[:5]:
    print(get_movie_data2(link))

{'title': ['Toy Story 4'], 'director': ['Josh Cooley'], 'actors': ['Tom Hanks', 'Tim Allen', 'Annie Potts', 'Tony Hale'], 'years': ['2019']}
{'title': ['The Lion King'], 'director': ['Roger Allers', 'Rob Minkoff'], 'actors': ['Matthew Broderick', 'Jeremy Irons', 'James Earl Jones', 'Whoopi Goldberg'], 'years': ['1994']}
{'title': ['Toy Story 3'], 'director': ['Lee Unkrich'], 'actors': ['Tom Hanks', 'Tim Allen', 'Joan Cusack', 'Ned Beatty'], 'years': ['2010']}
{'title': ['Finding Nemo'], 'director': ['Andrew Stanton', 'Lee Unkrich'], 'actors': ['Albert Brooks', 'Ellen DeGeneres', 'Alexander Gould', 'Willem Dafoe'], 'years': ['2003']}
{'title': ['Monsters, Inc.'], 'director': ['Pete Docter', 'David Silverman', 'Lee Unkrich'], 'actors': ['Billy Crystal', 'John Goodman', 'Mary Gibbs', 'Steve Buscemi'], 'years': ['2001']}


### Counter II (Cast and Crew)

In [806]:
"""#Let's run the data to see if we get any error

count = 1
for link in new_links:
    print(f"{count}: {link}")
    movie_data2 = get_movie_data2(link)
    print(movie_data2)
    count = count + 1
#    sleep(randint(1, 2))"""

'#Let\'s run the data to see if we get any error\n\ncount = 1\nfor link in new_links:\n    print(f"{count}: {link}")\n    movie_data2 = get_movie_data2(link)\n    print(movie_data2)\n    count = count + 1\n#    sleep(randint(1, 2))'

In [807]:
movies_crewcast_list = []

for link in new_links:
    movies_crewcast_list.append(get_movie_data2(link)) # 3:24 - 4:04

In [808]:
movies_crewcast = pd.DataFrame(movies_crewcast_list)  #convert list of dict to df

In [809]:
movies_crewcast.shape

(4364, 4)

In [810]:
movies_crewcast.to_csv('/Users/hernantrujillo/Documents/Metis/NBM_Regression_Student-main/Movies_Project_Hernan/data_project/crew_cast_raw.csv') 

In [812]:
movies_crewcast = pd.read_csv("/Users/hernantrujillo/Documents/Metis/NBM_Regression_Student-main/Movies_Project_Hernan/data_project/crew_cast_raw.csv")

In [813]:
movies_crewcast.head()

Unnamed: 0.1,Unnamed: 0,title,director,actors,years
0,0,['Toy Story 4'],['Josh Cooley'],"['Tom Hanks', 'Tim Allen', 'Annie Potts', 'Ton...",['2019']
1,1,['The Lion King'],"['Roger Allers', 'Rob Minkoff']","['Matthew Broderick', 'Jeremy Irons', 'James E...",['1994']
2,2,['Toy Story 3'],['Lee Unkrich'],"['Tom Hanks', 'Tim Allen', 'Joan Cusack', 'Ned...",['2010']
3,3,['Finding Nemo'],"['Andrew Stanton', 'Lee Unkrich']","['Albert Brooks', 'Ellen DeGeneres', 'Alexande...",['2003']
4,4,"['Monsters, Inc.']","['Pete Docter', 'David Silverman', 'Lee Unkrich']","['Billy Crystal', 'John Goodman', 'Mary Gibbs'...",['2001']


In [814]:
# remove outside brackets (years)

movies_crewcast['years'] = movies_crewcast.years.astype(str)
movies_crewcast['years'] = movies_crewcast['years'].str.replace(']','').str.replace('[','').str.replace('\'', '')
movies_crewcast.head()

Unnamed: 0.1,Unnamed: 0,title,director,actors,years
0,0,['Toy Story 4'],['Josh Cooley'],"['Tom Hanks', 'Tim Allen', 'Annie Potts', 'Ton...",2019
1,1,['The Lion King'],"['Roger Allers', 'Rob Minkoff']","['Matthew Broderick', 'Jeremy Irons', 'James E...",1994
2,2,['Toy Story 3'],['Lee Unkrich'],"['Tom Hanks', 'Tim Allen', 'Joan Cusack', 'Ned...",2010
3,3,['Finding Nemo'],"['Andrew Stanton', 'Lee Unkrich']","['Albert Brooks', 'Ellen DeGeneres', 'Alexande...",2003
4,4,"['Monsters, Inc.']","['Pete Docter', 'David Silverman', 'Lee Unkrich']","['Billy Crystal', 'John Goodman', 'Mary Gibbs'...",2001


In [815]:
# remove outside brackets and quotes (title)

movies_crewcast['title'] = movies_crewcast.title.astype(str)
movies_crewcast['title'] = movies_crewcast['title'].str.replace(']','').str.replace('[','').str.replace('\'', '').str.replace('"', '')
movies_crewcast.head()

Unnamed: 0.1,Unnamed: 0,title,director,actors,years
0,0,Toy Story 4,['Josh Cooley'],"['Tom Hanks', 'Tim Allen', 'Annie Potts', 'Ton...",2019
1,1,The Lion King,"['Roger Allers', 'Rob Minkoff']","['Matthew Broderick', 'Jeremy Irons', 'James E...",1994
2,2,Toy Story 3,['Lee Unkrich'],"['Tom Hanks', 'Tim Allen', 'Joan Cusack', 'Ned...",2010
3,3,Finding Nemo,"['Andrew Stanton', 'Lee Unkrich']","['Albert Brooks', 'Ellen DeGeneres', 'Alexande...",2003
4,4,"Monsters, Inc.","['Pete Docter', 'David Silverman', 'Lee Unkrich']","['Billy Crystal', 'John Goodman', 'Mary Gibbs'...",2001


In [816]:
movies_crewcast.set_index('title', inplace=True) #indexing by movie title

movies_crewcast

Unnamed: 0_level_0,Unnamed: 0,director,actors,years
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Toy Story 4,0,['Josh Cooley'],"['Tom Hanks', 'Tim Allen', 'Annie Potts', 'Ton...",2019
The Lion King,1,"['Roger Allers', 'Rob Minkoff']","['Matthew Broderick', 'Jeremy Irons', 'James E...",1994
Toy Story 3,2,['Lee Unkrich'],"['Tom Hanks', 'Tim Allen', 'Joan Cusack', 'Ned...",2010
Finding Nemo,3,"['Andrew Stanton', 'Lee Unkrich']","['Albert Brooks', 'Ellen DeGeneres', 'Alexande...",2003
"Monsters, Inc.",4,"['Pete Docter', 'David Silverman', 'Lee Unkrich']","['Billy Crystal', 'John Goodman', 'Mary Gibbs'...",2001
...,...,...,...,...
Hannibal Rising,4359,['Peter Webber'],"['Gaspard Ulliel', 'Rhys Ifans', 'Gong Li', 'A...",2007
Cadillac Man,4360,['Roger Donaldson'],"['Robin Williams', 'Tim Robbins', 'Pamela Reed...",1990
Do the Right Thing,4361,['Spike Lee'],"['Danny Aiello', 'Ossie Davis', 'Ruby Dee', 'R...",1989
Poetic Justice,4362,['John Singleton'],"['Janet Jackson', 'Tupac Shakur', 'Regina King...",1993


In [817]:
movies_crewcast.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4364 entries, Toy Story 4 to Three Days of the Condor
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  4364 non-null   int64 
 1   director    4364 non-null   object
 2   actors      4364 non-null   object
 3   years       4364 non-null   object
dtypes: int64(1), object(3)
memory usage: 170.5+ KB


In [818]:
# remove outside brackets and quotes (directors)
movies_crewcast['director'] = movies_crewcast.director.astype(str)
movies_crewcast['director'] = movies_crewcast['director'].str.replace(']','').str.replace('[','').str.replace('\'', '').str.replace('"', '')
movies_crewcast.head()

Unnamed: 0_level_0,Unnamed: 0,director,actors,years
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Toy Story 4,0,Josh Cooley,"['Tom Hanks', 'Tim Allen', 'Annie Potts', 'Ton...",2019
The Lion King,1,"Roger Allers, Rob Minkoff","['Matthew Broderick', 'Jeremy Irons', 'James E...",1994
Toy Story 3,2,Lee Unkrich,"['Tom Hanks', 'Tim Allen', 'Joan Cusack', 'Ned...",2010
Finding Nemo,3,"Andrew Stanton, Lee Unkrich","['Albert Brooks', 'Ellen DeGeneres', 'Alexande...",2003
"Monsters, Inc.",4,"Pete Docter, David Silverman, Lee Unkrich","['Billy Crystal', 'John Goodman', 'Mary Gibbs'...",2001


In [819]:
# remove outside brackets and quotes (actors)
movies_crewcast['actors'] = movies_crewcast.actors.astype(str)
movies_crewcast['actors'] = movies_crewcast['actors'].str.replace(']','').str.replace('[','').str.replace('\'', '').str.replace('"', '')
movies_crewcast.head()

Unnamed: 0_level_0,Unnamed: 0,director,actors,years
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Toy Story 4,0,Josh Cooley,"Tom Hanks, Tim Allen, Annie Potts, Tony Hale",2019
The Lion King,1,"Roger Allers, Rob Minkoff","Matthew Broderick, Jeremy Irons, James Earl Jo...",1994
Toy Story 3,2,Lee Unkrich,"Tom Hanks, Tim Allen, Joan Cusack, Ned Beatty",2010
Finding Nemo,3,"Andrew Stanton, Lee Unkrich","Albert Brooks, Ellen DeGeneres, Alexander Goul...",2003
"Monsters, Inc.",4,"Pete Docter, David Silverman, Lee Unkrich","Billy Crystal, John Goodman, Mary Gibbs, Steve...",2001


In [820]:
movies_crewcast.columns

Index(['Unnamed: 0', 'director', 'actors', 'years'], dtype='object')

In [630]:
"""movies_list = []   #<- make an empty list
for url in urls:   #<- url list
    movies_list.append(get_movie_data(url)) #4:40 - 4:55"""

In [632]:
#movies_list

[{'title': 'Avengers: Endgame',
  'budget': 356000000,
  'runtime': 181,
  'rating': 'PG-13',
  'genres': 'Action Adventure Drama Sci-Fi',
  'distributor': 'Walt Disney Studios Motion Pictures',
  'domestic_gross': 858373000,
  'worldwide_gross': 2797501328,
  'opening_gross': 357115007,
  'widest_release': 4662,
  'release_date': datetime.datetime(2019, 4, 26, 0, 0)},
 {'title': 'The Lion King',
  'budget': 260000000,
  'runtime': 118,
  'rating': 'PG',
  'genres': 'Adventure Animation Drama Family Musical',
  'distributor': 'Walt Disney Studios Motion Pictures',
  'domestic_gross': 543638043,
  'worldwide_gross': 1656943394,
  'opening_gross': 191770759,
  'widest_release': 4802,
  'release_date': datetime.datetime(2019, 7, 19, 0, 0)},
 {'title': 'Toy Story 4',
  'budget': 200000000,
  'runtime': 100,
  'rating': 'G',
  'genres': 'Adventure Animation Comedy Family Fantasy',
  'distributor': 'Walt Disney Studios Motion Pictures',
  'domestic_gross': 434038008,
  'worldwide_gross': 107

In [633]:
#master_df = pd.DataFrame(movies_list)  #convert list of dict to df
#master_df.set_index('title', inplace=True) #indexing by movie title

#master_df

Unnamed: 0_level_0,budget,runtime,rating,genres,distributor,domestic_gross,worldwide_gross,opening_gross,widest_release,release_date
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Avengers: Endgame,356000000.0,181.0,PG-13,Action Adventure Drama Sci-Fi,Walt Disney Studios Motion Pictures,858373000,2797501328,357115007.0,4662,2019-04-26
The Lion King,260000000.0,118.0,PG,Adventure Animation Drama Family Musical,Walt Disney Studios Motion Pictures,543638043,1656943394,191770759.0,4802,2019-07-19
Toy Story 4,200000000.0,100.0,G,Adventure Animation Comedy Family Fantasy,Walt Disney Studios Motion Pictures,434038008,1073394593,120908065.0,4575,2019-06-21
Frozen II,150000000.0,103.0,PG,Adventure Animation Comedy Family Fantasy Musical,Walt Disney Studios Motion Pictures,477373578,1450026933,130263358.0,4440,2019-11-22
Captain Marvel,160000000.0,123.0,PG-13,Action Adventure Sci-Fi,Walt Disney Studios Motion Pictures,426829839,1128274794,153433423.0,4310,2019-03-08
...,...,...,...,...,...,...,...,...,...,...
Truth,,125.0,R,Biography Drama History Thriller,Sony Pictures Classics,2541854,5383097,66232.0,1122,2015-10-16
Mistress America,,84.0,R,Comedy Drama,Fox Searchlight Pictures,2500431,3340737,93206.0,512,2015-08-14
Pawn Sacrifice,,115.0,PG-13,Biography Drama Sport Thriller,Bleecker Street Media,2436633,5578519,202053.0,781,2015-09-16
The Oscar Nominated Short Films 2015: Live Action,,118.0,,Drama,Shorts International,2412593,2412593,416850.0,280,2015-01-30


In [634]:
#df1 = master_df.to_csv('/Users/hernantrujillo/Documents/Metis/NBM_Regression_Student-main/Movies_Project_Hernan/data_project/df1.csv') 

### Daytime (Seasons)

In [42]:
df1 = pd.read_csv('/Users/hernantrujillo/Documents/Metis/NBM_Regression_Student-main/Movies_Project_Hernan/data_project/df1.csv') 

In [821]:
df1.shape

(580, 12)

In [44]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            1000 non-null   object 
 1   budget           596 non-null    float64
 2   runtime          972 non-null    float64
 3   rating           949 non-null    object 
 4   genres           1000 non-null   object 
 5   distributor      999 non-null    object 
 6   domestic_gross   1000 non-null   int64  
 7   worldwide_gross  1000 non-null   int64  
 8   opening_gross    992 non-null    float64
 9   widest_release   1000 non-null   int64  
 10  release_date     1000 non-null   object 
dtypes: float64(3), int64(3), object(5)
memory usage: 86.1+ KB


In [45]:
#using .isna() result as data mask

df1.isna().sum() 

title                0
budget             404
runtime             28
rating              51
genres               0
distributor          1
domestic_gross       0
worldwide_gross      0
opening_gross        8
widest_release       0
release_date         0
dtype: int64

In [46]:
df1.head()

Unnamed: 0,title,budget,runtime,rating,genres,distributor,domestic_gross,worldwide_gross,opening_gross,widest_release,release_date
0,Avengers: Endgame,356000000.0,181.0,PG-13,Action Adventure Drama Sci-Fi,Walt Disney Studios Motion Pictures,858373000,2797501328,357115007.0,4662,2019-04-26
1,The Lion King,260000000.0,118.0,PG,Adventure Animation Drama Family Musical,Walt Disney Studios Motion Pictures,543638043,1656943394,191770759.0,4802,2019-07-19
2,Toy Story 4,200000000.0,100.0,G,Adventure Animation Comedy Family Fantasy,Walt Disney Studios Motion Pictures,434038008,1073394593,120908065.0,4575,2019-06-21
3,Frozen II,150000000.0,103.0,PG,Adventure Animation Comedy Family Fantasy Musical,Walt Disney Studios Motion Pictures,477373578,1450026933,130263358.0,4440,2019-11-22
4,Captain Marvel,160000000.0,123.0,PG-13,Action Adventure Sci-Fi,Walt Disney Studios Motion Pictures,426829839,1128274794,153433423.0,4310,2019-03-08


In [47]:
df1['release_date'] =  pd.to_datetime(df1['release_date'], infer_datetime_format=True)

In [48]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   title            1000 non-null   object        
 1   budget           596 non-null    float64       
 2   runtime          972 non-null    float64       
 3   rating           949 non-null    object        
 4   genres           1000 non-null   object        
 5   distributor      999 non-null    object        
 6   domestic_gross   1000 non-null   int64         
 7   worldwide_gross  1000 non-null   int64         
 8   opening_gross    992 non-null    float64       
 9   widest_release   1000 non-null   int64         
 10  release_date     1000 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(3), int64(3), object(4)
memory usage: 86.1+ KB


In [49]:
#now, let's group datetime by season so we can use it as a feature

def date_to_season(date):
    Winter = [12, 1, 2]
    Spring = [3, 4, 5]
    Summer = [6, 7, 8]
    Fall = [9, 10, 11]
    if date.month in Winter:
        return "Winter"
    elif date.month in Spring:
        return "Spring"
    elif date.month in Summer:
        return "Summer"
    else:
        return "Fall"


In [50]:
df1['season'] = df1['release_date'].apply(date_to_season)

In [51]:
df1.head()

Unnamed: 0,title,budget,runtime,rating,genres,distributor,domestic_gross,worldwide_gross,opening_gross,widest_release,release_date,season
0,Avengers: Endgame,356000000.0,181.0,PG-13,Action Adventure Drama Sci-Fi,Walt Disney Studios Motion Pictures,858373000,2797501328,357115007.0,4662,2019-04-26,Spring
1,The Lion King,260000000.0,118.0,PG,Adventure Animation Drama Family Musical,Walt Disney Studios Motion Pictures,543638043,1656943394,191770759.0,4802,2019-07-19,Summer
2,Toy Story 4,200000000.0,100.0,G,Adventure Animation Comedy Family Fantasy,Walt Disney Studios Motion Pictures,434038008,1073394593,120908065.0,4575,2019-06-21,Summer
3,Frozen II,150000000.0,103.0,PG,Adventure Animation Comedy Family Fantasy Musical,Walt Disney Studios Motion Pictures,477373578,1450026933,130263358.0,4440,2019-11-22,Fall
4,Captain Marvel,160000000.0,123.0,PG-13,Action Adventure Sci-Fi,Walt Disney Studios Motion Pictures,426829839,1128274794,153433423.0,4310,2019-03-08,Spring


In [52]:
#set the title as index

df1.set_index('title', inplace=True)

In [53]:
df1.head()

Unnamed: 0_level_0,budget,runtime,rating,genres,distributor,domestic_gross,worldwide_gross,opening_gross,widest_release,release_date,season
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Avengers: Endgame,356000000.0,181.0,PG-13,Action Adventure Drama Sci-Fi,Walt Disney Studios Motion Pictures,858373000,2797501328,357115007.0,4662,2019-04-26,Spring
The Lion King,260000000.0,118.0,PG,Adventure Animation Drama Family Musical,Walt Disney Studios Motion Pictures,543638043,1656943394,191770759.0,4802,2019-07-19,Summer
Toy Story 4,200000000.0,100.0,G,Adventure Animation Comedy Family Fantasy,Walt Disney Studios Motion Pictures,434038008,1073394593,120908065.0,4575,2019-06-21,Summer
Frozen II,150000000.0,103.0,PG,Adventure Animation Comedy Family Fantasy Musical,Walt Disney Studios Motion Pictures,477373578,1450026933,130263358.0,4440,2019-11-22,Fall
Captain Marvel,160000000.0,123.0,PG-13,Action Adventure Sci-Fi,Walt Disney Studios Motion Pictures,426829839,1128274794,153433423.0,4310,2019-03-08,Spring
