this Lab illustrates how to use `BeautifulSoup` to scrape the International Movies Database (IMDB) at [imdb.com](https://imdb.com) for top films released in year 2023 with the highest US box office. 

The final dataframe will contains the below elements:

* `name` - title of the movie, 
* `year` - release year of the movie, 
* `imdb` - IMDB score of the movie, 
* `m_score` - meta score of the movie, 
* `vote` - number of votes.

First, we import the requried packages

In [1]:
import bs4
import requests
import time
import random as ran
import sys
import pandas as pd

Now, search the [top 1000 films released in year of 2023 at imdb.com](https://www.imdb.com/search/title/?release_date=2023&sort=boxoffice_gross_us,desc&start=1) and scrape results from the first page

In [2]:
url = 'https://www.imdb.com/search/title?release_date=2023&sort=boxoffice_gross_us,desc&start=1'

source = requests.get(url).text
soup = bs4.BeautifulSoup(source,'html.parser')

Since above code extracts all data on the first page, below code is run only to extract movie information on it.

In [3]:
movie_blocks = soup.findAll('div',{'class':'lister-item-content'})
soup.findAll('div',{'class':'lister-item-content'})[0]

<div class="lister-item-content">
<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">1.</span>
<a href="/title/tt12338584/">R BnB</a>
<span class="lister-item-year text-muted unbold">(2023)</span>
</h3>
<p class="text-muted">
<span class="runtime">94 min</span>
<span class="ghost">|</span>
<span class="genre">
Drama, Romance, Thriller            </span>
</p>
<div class="ratings-bar">
<div class="inline-block ratings-imdb-rating" data-value="9" name="ir">
<span class="global-sprite rating-star imdb-rating"></span>
<strong>9.0</strong>
</div>
<div class="inline-block ratings-user-rating">
<span class="userRatingValue" data-tconst="tt12338584" id="urv_tt12338584">
<span class="global-sprite rating-star no-rating"></span>
<span class="rate" data-no-rating="Rate this" data-value="0" name="ur">Rate this</span>
</span>
<div class="starBarWidget" id="sb_tt12338584">
<div class="rating rating-list" data-csrf-token="" data-ga-identifier="" data-starbar-class="rat

Before extracting information across all movies, try first to examine one of the extracted block to identify the elements that we need to scrape.

Below  the elements from the first movie block are extracted

In [4]:
mname = movie_blocks[0].find('a').get_text() # Name of the movie

m_reyear = int(movie_blocks[0].find('span',{'class': 'lister-item-year'}).contents[0][1:-1]) # Release year
m_rating = float(movie_blocks[0].find('div',{'class':'inline-block ratings-imdb-rating'}).get('data-value')) #rating

#m_mscore = float(movie_blocks[0].find('span',{'class':'metascore favorable'}).contents[0].strip()) #meta score

m_votes = int(movie_blocks[0].find('span',{'name':'nv'}).get('data-value')) # votes
type_mv = movie_blocks[0].find('span',{'class':'genre'}).contents[0][1:-1].strip()
time = movie_blocks[0].find('span',{'class':'runtime'}).contents[0][0:-3].strip()


print("Movie Name: " + mname,
      "\nRelease Year: " + str(m_reyear),
      "\nIMDb Rating: " + str(m_rating),
      "\nVotes: " + '{:,}'.format(m_votes),
      "\nType: " + type_mv,
      "\nTime: "+time

)

Movie Name: R BnB 
Release Year: 2023 
IMDb Rating: 9.0 
Votes: 10 
Type: Drama, Romance, Thriller 
Time: 94


Once you examine the resulting pages of the imbd search that we initially did , it's obvious that by editing the html link it is possible to view all search results. Thus we will be using this feature during the scrape to iterate through all pages.

Now since scraping the data is an iterative process, we define separate functions for each purpose.

First wa are going to define a function which will extract the targeted elements from a 'movie block list' (discussed above)

In [5]:
def scrape_mblock(movie_block):
    movieb_data = {}

    try:
        movieb_data['name'] = movie_block.find('a').get_text() # Name of the movie
    except:
        movieb_data['name'] = None


    try:    
        movieb_data['year'] = str(movie_block.find('span', {'class': 'lister-item-year'}).contents[0][1:-1]) # Release year
    except:
        movieb_data['year'] = None

    try:
        movieb_data['rating'] = float(movie_block.find('div', {'class': 'inline-block ratings-imdb-rating'}).get('data-value')) # Rating
    except:
        movieb_data['rating'] = None

    try:
        movieb_data['m_score'] = float(movie_block.find('span', {'class': 'metascore favorable'}).contents[0].strip()) # Metascore
    except:
        movieb_data['m_score'] = None

    try:
        movieb_data['votes'] = int(movie_block.find('span', {'name': 'nv'}).get('data-value')) # Votes
    except:
        movieb_data['votes'] = None

    try:
        genre_span = movie_block.find('span', {'class': 'genre'})
        if genre_span:
            movieb_data['type'] = genre_span.contents[0][1:-1].strip() # Type/Genre
        else:
            movieb_data['type'] = None
    except:
        movieb_data['type'] = None

    return movieb_data


In [6]:
scrape_mblock(movie_blocks[4])

{'name': 'The Mother',
 'year': '2023',
 'rating': 5.5,
 'm_score': None,
 'votes': 27361,
 'type': 'Action, Thriller'}

Then we create the below function to scrape all movie blocks within a single search result page

In [7]:
def scrape_m_page(movie_blocks):
    
    page_movie_data = []
    num_blocks = len(movie_blocks)
    
    for block in range(num_blocks):
        page_movie_data.append(scrape_mblock(movie_blocks[block]))
    
    return page_movie_data

In [8]:
scrape_m_page(movie_blocks)

[{'name': 'R BnB',
  'year': '2023',
  'rating': 9.0,
  'm_score': None,
  'votes': 10,
  'type': 'Drama, Romance, Thriller'},
 {'name': 'Trinket Box',
  'year': '2023',
  'rating': 7.8,
  'm_score': None,
  'votes': 59,
  'type': 'Drama, Horror'},
 {'name': 'Guardians of the Galaxy Vol. 3',
  'year': '2023',
  'rating': 8.2,
  'm_score': 64.0,
  'votes': 142412,
  'type': 'Action, Adventure, Comedy'},
 {'name': 'Fast X',
  'year': '2023',
  'rating': 6.3,
  'm_score': None,
  'votes': 28482,
  'type': 'Action, Adventure, Crime'},
 {'name': 'The Mother',
  'year': '2023',
  'rating': 5.5,
  'm_score': None,
  'votes': 27361,
  'type': 'Action, Thriller'},
 {'name': 'Silo',
  'year': '2023– ',
  'rating': 8.2,
  'm_score': None,
  'votes': 15473,
  'type': 'Drama, Sci-Fi'},
 {'name': 'Air',
  'year': 'I) (2023',
  'rating': 7.5,
  'm_score': 73.0,
  'votes': 80400,
  'type': 'Drama, Sport'},
 {'name': 'Queen Cleopatra',
  'year': '2023',
  'rating': 1.0,
  'm_score': None,
  'votes': 74

Now we built functions to extract all movie data from a single page.

Next function will be created to iterate the above made function through all pages of the search result untill we scrape data for the targeted number of movies

In [11]:
def scrape_this(link,t_count):
    
    base_url = link
    target = t_count
    
    current_mcount_start = 0
    current_mcount_end = 0
    remaining_mcount = target - current_mcount_end 
    
    new_page_number = 1
    
    movie_data = []
    
    
    while remaining_mcount > 0:

        url = base_url + str(new_page_number)
        
        #set_trace()
        
        source = requests.get(url).text
        soup = bs4.BeautifulSoup(source,'html.parser')
        
        movie_blocks = soup.findAll('div',{'class':'lister-item-content'})
        
        movie_data.extend(scrape_m_page(movie_blocks))   
        
        current_mcount_start = int(soup.find("div", {"class":"nav"}).find("div", {"class": "desc"}).contents[1].get_text().split("-")[0])

        current_mcount_end = int(soup.find("div", {"class":"nav"}).find("div", {"class": "desc"}).contents[1].get_text().split("-")[1].split(" ")[0])

        remaining_mcount = target - current_mcount_end
        
        print('\r' + "currently scraping movies from: " + str(current_mcount_start) + " - "+str(current_mcount_end), "| remaining count: " + str(remaining_mcount), flush=True, end ="")
        
        new_page_number = current_mcount_end + 1
        
    
    return movie_data
    
    

Finally, we put together all functions created above to scrape the top 150 movies on the list

In [12]:
base_scraping_link = "https://www.imdb.com/search/title?release_date=2018-01-01,2018-12-31&sort=boxoffice_gross_us,desc&start="

top_movies = 150 #input("How many movies do you want to scrape?")
films = []

films = scrape_this(base_scraping_link,int(top_movies))

print('\r'+"List of top " + str(top_movies) +" movies:" + "\n", end="\n")
pd.DataFrame(films)

List of top 150 movies:es from: 101 - 150 | remaining count: 0



Unnamed: 0,name,year,rating,m_score,votes,type
0,Black Panther,2018,7.3,88.0,801612,"Action, Adventure, Sci-Fi"
1,Avengers: Infinity War,2018,8.4,68.0,1122455,"Action, Adventure, Sci-Fi"
2,Incredibles 2,2018,7.6,80.0,309271,"Animation, Action, Adventure"
3,Jurassic World: Fallen Kingdom,2018,6.1,,327176,"Action, Adventure, Sci-Fi"
4,Aquaman,2018,6.8,,490971,"Action, Adventure, Fantasy"
...,...,...,...,...,...,...
145,Boy Erased,2018,6.9,69.0,40301,"Biography, Drama"
146,Hotel Artemis,2018,6.1,,55605,"Action, Crime, Drama"
147,A-X-L,2018,5.3,,12550,"Action, Adventure, Drama"
148,Run the Race,2018,5.9,,1621,"Drama, Sport"


### Assignment: 

1. create a web app using Dash and Plotly
2. scrap the content of your choice (example: top 250, Top box office, or the results of your own query)
3. visualize your results through multiple charts as we did with worldometers website 
4. try to create your own charts based on the choosed content 

In [156]:
import bs4
import requests
import time
import random as ran
import pandas as pd
import dash
import dash_core_components as dcc
import dash_html_components as html
import plotly.graph_objects as go

def scrape_mblock(movie_block):
    movieb_data = {}

    try:
        movieb_data['name'] = movie_block.find('a').get_text()  # Name of the movie
    except:
        movieb_data['name'] = None

    try:
        movieb_data['year'] = str(movie_block.find('span', {'class': 'lister-item-year'}).contents[0][1:-1])  # Release year
    except:
        movieb_data['year'] = None

    try:
        movieb_data['rating'] = float(movie_block.find('div', {'class': 'inline-block ratings-imdb-rating'}).get(
            'data-value'))  # Rating
    except:
        movieb_data['rating'] = None

    try:
        movieb_data['m_score'] = float(
            movie_block.find('span', {'class': 'metascore favorable'}).contents[0].strip())  # Metascore
    except:
        movieb_data['m_score'] = None

    try:
        movieb_data['votes'] = int(movie_block.find('span', {'name': 'nv'}).get('data-value'))  # Votes
    except:
        movieb_data['votes'] = None

    try:
        genre_span = movie_block.find('span', {'class': 'genre'})
        if genre_span:
            movieb_data['types'] = [t.strip() for t in genre_span.contents[0][1:-1].split(',')]  # Types/Genres
        else:
            movieb_data['types'] = []
    except:
        movieb_data['types'] = []
    try:
        movieb_data['runtime'] = int(movie_block.find('span', {'class': 'runtime'}).contents[0][0:-3].strip())  # Runtime
    except:
        movieb_data['runtime'] = None

    return movieb_data


def scrape_m_page(movie_blocks):
    page_movie_data = []
    num_blocks = len(movie_blocks)

    for block in range(num_blocks):
        page_movie_data.append(scrape_mblock(movie_blocks[block]))

    return page_movie_data


def scrape_this(link, t_count):
    base_url = link
    target = t_count

    current_mcount_start = 0
    current_mcount_end = 0
    remaining_mcount = target - current_mcount_end
    new_page_number = 1
    movie_data = []

    while remaining_mcount > 0:
        url = base_url + str(new_page_number)
        source = requests.get(url).text
        soup = bs4.BeautifulSoup(source, 'html.parser')
        movie_blocks = soup.findAll('div', {'class': 'lister-item-content'})
        movie_data.extend(scrape_m_page(movie_blocks))
        current_mcount_start = int(
            soup.find("div", {"class": "nav"}).find("div", {"class": "desc"}).contents[1].get_text().split("-")[0])
        current_mcount_end = int(
            soup.find("div", {"class": "nav"}).find("div", {"class": "desc"}).contents[1].get_text().split("-")[1].split(
                " ")[0])
        remaining_mcount = target - current_mcount_end

        print('\r' + "currently scraping movies from: " + str(
            current_mcount_start) + " - " + str(current_mcount_end) + " | remaining count: " + str(
            remaining_mcount), flush=True, end="")

        new_page_number = current_mcount_end + 1
        time.sleep(ran.randint(0, 10))

    return movie_data


base_scraping_link = "https://www.imdb.com/search/title?release_date=2018-01-01,2018-12-31&sort=boxoffice_gross_us,desc&start="
top_movies = 250  
films = scrape_this(base_scraping_link, int(top_movies))
df_movies = pd.DataFrame(films)

# Create a new dataframe to count movie types
type_counts = df_movies['types'].apply(pd.Series).stack().value_counts()

# Calculate the average rating and runtime by film type
avg_rating_by_type = df_movies.explode('types').groupby('types')['rating'].mean().reset_index()
avg_runtime_by_type = df_movies.explode('types').groupby('types')['runtime'].mean().reset_index()



# Calculate the percentage of films based on rating categories
rating_categories = ['<3', '3-6', '6-8', '>=8']
rating_counts = pd.cut(df_movies['rating'], bins=[0, 3, 6, 8, 10], labels=rating_categories).value_counts(normalize=True) * 100

app = dash.Dash(__name__)

app.layout = html.Div(children=[
    html.H1(
        children='Movie Dashboard',
        style={'text-align': 'center', 'font-family': 'Arial', 'font-size': '36px', 'margin-bottom': '20px'}
    ),
    html.H2(children='Select Best Films'),

html.Div(
    children=[
        html.Div(
            children=[
                html.Label('Select Number of Films:'),
                dcc.Dropdown(
                    id='num-films-dropdown',
                    options=[
                        {'label': '5', 'value': 5},
                        {'label': '10', 'value': 10},
                        {'label': '25', 'value': 25},
                        {'label': '50', 'value': 50},
                        {'label': '100', 'value': 100}  
                    ],
                    value=25
                ),
            ],
            style={'margin-right': '20px'}
        ),

        html.Div(
            children=[
                html.Label('Select Sort Order:'),
                dcc.Dropdown(
                    id='sort-order-dropdown',
                    options=[
                        {'label': 'Ascending', 'value': 'ascending'},
                        {'label': 'Descending', 'value': 'descending'}
                    ],
                    value='descending'
                ),
            ]
        ),
    ],
    style={'display': 'flex', 'margin-bottom': '20px'}
),

    dcc.Graph(id='rating-bar-plot'),
    dcc.Graph(
        id='rating-pie-chart',
        figure={
            'data': [go.Pie(
                labels=rating_counts.index,
                values=rating_counts.values,
                marker=dict(colors=['#FF595E', '#FFCA3A', '#8AC926', '#1982C4']),
                hole=0.4,
                textinfo='label+percent',
                hoverinfo='value+percent',
                sort=False
            )],
            'layout': {
                'title': 'Percentage of Films by Rating Category',
                'legend': {'orientation': 'h', 'x': 0.5, 'y': 1.1}
            }
        }
    ),

    dcc.Graph(
        id='type-bar-chart',
        figure={
            'data': [go.Bar(x=type_counts.index, y=type_counts.values)],
            'layout': {
                'title': 'Number of Films by Type',
                'xaxis': {'title': 'Type'},
                'yaxis': {'title': 'Number of Films'}
            }
        }
    ),
        
dcc.Graph(
        id='avg-runtime-by-type-bar',
        figure={
            'data': [
                go.Bar(
                    x=avg_runtime_by_type['types'],
                    y=avg_runtime_by_type['runtime'],
                    name='Average Runtime'
                ),
                go.Bar(
                    x=avg_rating_by_type['types'],
                    y=avg_rating_by_type['rating'],
                    name='Average Rating'
                )
            ],
            'layout': go.Layout(
                title='Average Rating and Runtime by Film Type',
                xaxis=dict(title='Film Type'),
                yaxis=dict(title='Value'),
                barmode='group'
            )
        }
    ),

    dcc.Graph(
        id='pie-chart',
        figure={
            'data': [go.Pie(
                labels=type_counts.index,
                values=type_counts.values,
                hole=0.3,
                hoverinfo='label+percent'
            )],
            'layout': {
                'title': 'Genre Distribution',
            }
        }
    ),
    
    
])


@app.callback(
    dash.dependencies.Output('rating-bar-plot', 'figure'),
    [dash.dependencies.Input('num-films-dropdown', 'value'),
     dash.dependencies.Input('sort-order-dropdown', 'value')]
)

def update_rating_bar_plot(num_films, sort_order):
    if sort_order == 'ascending':
        sorted_df = df_movies.nsmallest(num_films, 'rating')
    else:
        sorted_df = df_movies.nlargest(num_films, 'rating')

    return {
        'data': [go.Bar(x=sorted_df['name'], y=sorted_df['rating'], orientation='v')],
        'layout': {
            'title': 'Top Films by Rating',
            'xaxis': {'title': 'Film'},
            'yaxis': {'title': 'Rating'},
            'barmode': 'group',
            'height': 400
        }
    }



@app.callback(
    dash.dependencies.Output('output-container', 'children'),
    [dash.dependencies.Input('rating-bar-plot', 'selectedData')]
)
def update_output(selected_data):
    if selected_data is not None and 'points' in selected_data:
        selected_films = selected_data['points']
        selected_films = sorted(selected_films, key=lambda x: x['x'], reverse=True)
        selected_films_names = [film['y'] for film in selected_films]

        return html.Div([
            html.H3('Selected Films:'),
            html.Ul([html.Li(film_name) for film_name in selected_films_names])
        ])
    else:
        return ''



if __name__ == '__main__':
    app.run_server(port=8051)


currently scraping movies from: 201 - 250 | remaining count: 000Dash is running on http://127.0.0.1:8051/

 * Serving Flask app '__main__'
 * Debug mode: off
