### Import modules

In [1]:
from bs4 import BeautifulSoup
from requests import get
import re
import pandas as pd
import numpy as np
import json
import datetime

### Web scrape IMDB for list of movies being screened locally

In [2]:
moviesScreeningUrl = 'https://www.imdb.com/showtimes/location?'
# A timeout keeps this cell from hanging indefinitely if the server never
# answers, and raise_for_status() stops the notebook on an HTTP error instead
# of quietly parsing an error page into an empty movie list.
moviesScreeningResponse = get(moviesScreeningUrl, timeout=30)
moviesScreeningResponse.raise_for_status()
moviesScreeningSoup = BeautifulSoup(moviesScreeningResponse.text, 'html.parser')

# Selecting all movies on webpage (one "lister-item mode-grid" <div> per movie)
moviesScreening = moviesScreeningSoup.find_all('div',
                                               attrs={'class':'lister-item mode-grid'})

### Create list with data (movie title, user rating, Metascore, genres, release date) scraped from IMDB

In [3]:
moviesScreeningList = []
# Using regex to extract the movie release date from the webpage: two digits,
# a three-letter month and a four-digit year, separated by any single character
# (presumably text such as "15 Aug 2019" - confirm against the live page).
datePattern = re.compile(r'\d{2}.[A-Za-z]{3}.\d{4}')
# The Metascore is the first run of digits inside the Metascore block's text.
scorePattern = re.compile(r'\d+')
for movie in moviesScreening:
    title = movie.find_all('a')[1].string

    # find() returns None when an element is missing, so every optional field
    # below falls back to a placeholder instead of raising and breaking the
    # loop for the remaining movies.

    # The rating is read straight from the tag's "data-value" attribute.
    ratingTag = movie.find('span',
                           attrs={'name':'user_rating'})
    userRating = ratingTag.get('data-value', np.nan) if ratingTag is not None else np.nan

    genreTag = movie.find('span',
                          attrs={'class':'genre'})
    imdbGenres = genreTag.text.strip('\n') if genreTag is not None else None

    releaseTag = movie.find('div',
                            attrs={'id':'release_date'})
    dateMatch = datePattern.search(releaseTag.text) if releaseTag is not None else None
    releaseDate = dateMatch.group(0) if dateMatch else np.nan

    metaTag = movie.find('div',attrs={'class':'inline-block ratings-metascore'})
    scoreMatch = scorePattern.search(metaTag.text) if metaTag is not None else None
    metaScore = scoreMatch.group(0) if scoreMatch else np.nan

    moviesScreeningList.append([title,userRating,metaScore,imdbGenres,releaseDate])

### Web scrape Flixster for Rotten Tomatoes ratings

In [4]:
rtRatingsUrl = 'https://www.flixster.com/top-box-office'
# A timeout keeps this cell from hanging indefinitely if the server never
# answers, and raise_for_status() stops the notebook on an HTTP error instead
# of handing an error page to the JSON extraction further down.
rtRatingsResponse = get(rtRatingsUrl, timeout=30)
rtRatingsResponse.raise_for_status()
rtRatings = BeautifulSoup(rtRatingsResponse.text,
                          'html.parser')

In [5]:
#Using regex to find section of code containing the movie data
rgPattern = re.compile("(\{.*\})")
rtScript = rtRatings.find_all("script")
movieData = rgPattern.search(str(rtScript)).group(0)

# This section of the script is in JSON format,
# so I'm converting the text to JSON to make it easier to extract the data
movieData = json.loads(movieData)
movieData = movieData['props']['initialReduxState']['boxOffice']['top-box-office']['data']['movies']

### Create list with data (movie title, critic score, audience score) scraped from Flixster

In [6]:
def _ratingScore(movie, ratingKey):
    """Return ``movie[ratingKey]['score']``, or NaN when it is unavailable.

    Only the failures a missing rating can cause are caught: KeyError when a
    key is absent, TypeError when the rating entry is not a mapping (for
    example None). Anything else propagates instead of being silently hidden.
    """
    try:
        return movie[ratingKey]['score']
    except (KeyError, TypeError):
        return np.nan


rtMovieList=[]
for movie in movieData:
    title = movie['title']
    criticScore = _ratingScore(movie, 'criticsRating')
    audienceScore = _ratingScore(movie, 'userRating')
    rtMovieList.append([title,criticScore,audienceScore])

### Create dataframes for analysis

In [7]:
# IMDB data: one row per scraped movie, indexed by its title
imdbColumns = ['Title','IMDB user rating','Metascore','Genres','Release date']
screeningsDF = pd.DataFrame(data=moviesScreeningList, columns=imdbColumns)
screeningsDF = screeningsDF.set_index('Title')

# Flixster data: Rotten Tomatoes critic and audience scores, indexed by title
rtColumns = ['Title','RT Critics score','RT Audience score']
ratingsDF = pd.DataFrame(data=rtMovieList, columns=rtColumns)
ratingsDF = ratingsDF.set_index('Title')

# Left merge on the title: every IMDB movie is kept, and the Rotten Tomatoes
# columns are NaN for titles that have no Flixster entry
moviesDF = screeningsDF.merge(ratingsDF, how='left', on='Title')

### Update columns to the correct data type to allow for calculations

In [8]:
# The scraped scores may still be text (or NaN when missing); cast them to
# float so they can be compared against the numeric thresholds.
for scoreColumn in ['RT Critics score', 'RT Audience score', 'IMDB user rating', 'Metascore']:
    moviesDF[scoreColumn] = moviesDF[scoreColumn].astype(float)

# Parse the release date text and keep only the calendar date
releaseTimestamps = pd.to_datetime(moviesDF['Release date'])
moviesDF['Release date'] = releaseTimestamps.dt.date

### Filter the dataframe to select the top movies

<font color=orange size = 3> Update the filter settings in the cell below based on your personal preferences. You can set the threshold scores that are used to classify a movie as critically acclaimed or an audience favorite.</font>

In [9]:
# --- Critically acclaimed: minimum scores a movie must reach ---
RTCriticScore = 85    # compared with the 'RT Critics score' column
Metascore = 80        # compared with the 'Metascore' column

# --- Audience favorite: minimum scores a movie must reach ---
RTAudienceScore = 85  # compared with the 'RT Audience score' column
IMDBRating = 8        # compared with the 'IMDB user rating' column

In [10]:
def _meetsThresholds(rtScore, rtThreshold, otherScore, otherThreshold):
    """Return an array of 1/0 flags, one per movie.

    A movie gets 1 when ``otherScore`` reaches ``otherThreshold`` and its
    Rotten Tomatoes score either reaches ``rtThreshold`` or is missing (NaN);
    otherwise it gets 0.
    """
    rtPassesOrMissing = (rtScore >= rtThreshold) | rtScore.isna()
    return np.where(rtPassesOrMissing & (otherScore >= otherThreshold), 1, 0)


moviesDF['Critically acclaimed'] = _meetsThresholds(moviesDF['RT Critics score'], RTCriticScore,
                                                    moviesDF['Metascore'], Metascore)

moviesDF['Audience favorite'] = _meetsThresholds(moviesDF['RT Audience score'], RTAudienceScore,
                                                 moviesDF['IMDB user rating'], IMDBRating)

# Only movies that are critically acclaimed or audience favorites are included in the list
isAcclaimed = moviesDF['Critically acclaimed'] == 1
isFavorite = moviesDF['Audience favorite'] == 1
topMovies = moviesDF[isAcclaimed | isFavorite].copy()

### Create boolean columns for movie genres and scoring

##### List of available genres displayed below

In [11]:
# Flatten every movie's comma-separated genre string into one list of genre
# names. dropna() skips movies whose genres could not be scraped (None), which
# would otherwise raise on .strip(); each name is stripped once, rather than
# re-stripping the whole accumulated list for every movie.
genres = [
    genre.strip(' ')
    for movieGenres in moviesDF['Genres'].dropna()
    for genre in movieGenres.strip().split(',')
]
set(genres)

{'Action',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'Music',
 'Mystery',
 'Romance',
 'Sport',
 'Thriller',
 'War'}

<font color=orange size=3>Add genres that you like (from the list above) to the list below, in order of preference</font>

In [12]:
# Preferred genres, most preferred first
selectedGenres = [
    'Animation',
    'Comedy',
    'Music',
]

In [13]:
for genre in selectedGenres:
    # Match the genre only as a complete entry of the comma-separated 'Genres'
    # string. A plain substring search would also flag e.g. "Musical" movies
    # when "Music" is selected, and would treat the genre name as a regex.
    wholeGenre = r'(?:^|,)\s*' + re.escape(genre) + r'\s*(?:,|$)'
    # na=False: movies without genre information simply do not match.
    hasGenre = topMovies['Genres'].str.contains(wholeGenre, regex=True, na=False)
    topMovies[genre] = np.where(hasGenre, 1, 0)

# Assigning a total score based on above boolean columns
topMovies['Total score'] = (topMovies['Critically acclaimed']
                            + topMovies['Audience favorite']
                            + topMovies[selectedGenres].sum(axis=1))

### List of top movies

In [14]:
# Column order of the table: raw scores, the two 0/1 flags, one 0/1 column per
# selected genre, and the total score last
scoreColumns = ['RT Audience score','RT Critics score','IMDB user rating','Metascore']
flagColumns = ['Critically acclaimed','Audience favorite']
displayColumns = scoreColumns + flagColumns + selectedGenres + ['Total score']
highlightedColumns = ['Total score','Audience favorite','Critically acclaimed'] + selectedGenres

# Best movies first: sort descending on the total score, then on each flag.
rankedMovies = topMovies[displayColumns].sort_values(highlightedColumns, ascending=False)

# Highlight the maximum of every flag/score column in light blue.
# NOTE(review): highlight_max marks the column maximum, so a flag column that
# is 0 for every movie would be highlighted entirely - confirm this is intended.
rankedMovies.style.highlight_max(subset=highlightedColumns, color='lightblue')

Unnamed: 0_level_0,RT Audience score,RT Critics score,IMDB user rating,Metascore,Critically acclaimed,Audience favorite,Animation,Comedy,Music,Total score
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Toy Story 4,94.0,97.0,8.1,84,1,1,1,1,0,4
Once Upon a Time in Hollywood,,,8.0,83,1,1,0,1,0,3
Parasite,,,8.5,92,1,1,0,1,0,3
The Farewell,87.0,99.0,8.1,89,1,1,0,1,0,3
Apocalypse Now,,,8.4,94,1,1,0,0,0,2
Apollo 11,,,8.3,88,1,1,0,0,0,2
Amazing Grace,,,7.6,94,1,0,0,0,1,2


### Movies loved by critics and audiences

In [15]:
# Movies flagged both critically acclaimed and audience favorite, newest release first
lovedByBoth = (topMovies['Critically acclaimed'] == 1) & (topMovies['Audience favorite'] == 1)
(topMovies.loc[lovedByBoth, ['Genres','Release date']]
          .sort_values('Release date', ascending=False))

Unnamed: 0_level_0,Genres,Release date
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
The Farewell,"Comedy, Drama",2019-09-05
Once Upon a Time in Hollywood,"Comedy, Drama",2019-08-15
Apollo 11,"Documentary, History",2019-06-26
Toy Story 4,"Animation, Adventure, Comedy",2019-06-20
Parasite,"Comedy, Drama, Thriller",2019-05-30
Apocalypse Now,"Drama, Mystery, War",1979-11-15


### Movies loved by critics but not audiences

In [16]:
# Movies flagged critically acclaimed but not audience favorite, newest release first
criticsOnly = (topMovies['Critically acclaimed'] == 1) & (topMovies['Audience favorite'] == 0)
(topMovies.loc[criticsOnly, ['Genres','Release date']]
          .sort_values('Release date', ascending=False))

Unnamed: 0_level_0,Genres,Release date
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Amazing Grace,"Documentary, Music",2019-08-29


### Movies loved by audiences but not critics

In [17]:
# Movies flagged audience favorite but not critically acclaimed, newest release first
audiencesOnly = (topMovies['Critically acclaimed'] == 0) & (topMovies['Audience favorite'] == 1)
(topMovies.loc[audiencesOnly, ['Genres','Release date']]
          .sort_values('Release date', ascending=False))

Unnamed: 0_level_0,Genres,Release date
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
