# Web Scraping Data From AlloCiné.fr

This script builds a DataFrame by web scraping data from AlloCiné — a company which provides information on French cinema. Because scraping every page takes a long time, the data is scraped in two steps:
- First, we scrape the URL of each movie with `getMoviesUrl()`
- Then, we use the URL list to scrape the data for each movie with `ScrapeURL()`

*Note : We use the popular BeautifulSoup package*

## Functions :

### `getMoviesUrl(start_page, end_page)` :

Saves the list of URLs as a CSV file named `movie_url.csv`. Both arguments must be integers; they select the range of pages to scrape. `end_page` itself is not included.

### `ScrapeURL(movie_url)` :

Iterates over the list of URLs generated by `getMoviesUrl()` and scrapes the data for each movie. In the process, we extract:

- `movie_title`: the movie's title (in French)
- `release_date`: the original release date
- `re_release_date`: the re-release date
- `duration`: the movie's length
- `genre`: the movie's genres (as an array, up to three different genres)
- `directors`: the movie's directors (as an array)
- `actors`: the movie's main actors (as an array)
- `nationality`: the movie's nationalities (as an array)
- `press_rating`: press rating (from 0 to 5 stars)
- `nber_press_vote`: number of press votes
- `user_rating`: AlloCiné user rating (from 0 to 5 stars)
- `nber_user_vote`: number of user votes

The function `ScrapeURL()` returns two objects: the data as a DataFrame and the list of URLs that produced an error. In addition, the two objects are saved as `allocine_movies.csv` and `allocine_errors.csv`. You can pass the list of errors back into `ScrapeURL()` to retry those URLs and get the missing data.

In [1]:
# Import libs
import pandas as pd
import numpy as np
from requests import get
from time import time
from time import sleep
from random import randint
from bs4 import BeautifulSoup
import dateparser

from warnings import warn
from IPython.core.display import clear_output
import traceback

In [2]:
# Function to scrape the movies urls from http://www.allocine.fr/films/
# Choose the page range with the two parameters start_page and end_page.
# The url list is saved as a csv file: movie_url.csv
def getMoviesUrl(start_page, end_page):
    """Collect the movie page URLs listed on http://www.allocine.fr/films/.

    Requests every listing page in ``range(start_page, end_page)``, extracts
    the link of each movie heading found on it and writes the collected URLs
    to ``movie_url.csv`` (one URL per line, no header row).

    Parameters
    ----------
    start_page : int
        First listing page to request.
    end_page : int
        Page at which to stop; this page itself is not requested.

    Returns
    -------
    None
        The URLs are only written to ``movie_url.csv``.
    """
    movie_url = []

    # Preparing the monitoring of the loop.
    # Requests are counted separately from the page number so that the
    # reported frequency is correct even when start_page is not 1.
    start_time = time()
    n_requests = 0

    for p in range(start_page, end_page):

        # Get request
        url = 'http://www.allocine.fr/films/?page={}'.format(p)
        response = get(url)
        n_requests += 1

        # Pause the loop
        sleep(randint(1, 2))

        # Monitoring the requests
        elapsed_time = time() - start_time
        print('Page Request: {}; Frequency: {} requests/s'.format(p, n_requests / elapsed_time))
        clear_output(wait=True)

        # A non-200 response is not a listing page: warn and skip it instead
        # of parsing the error body.
        if response.status_code != 200:
            warn('Page Request: {}; Status code: {}'.format(p, response.status_code))
            continue

        # Parse the content of the request with BeautifulSoup
        html_soup = BeautifulSoup(response.text, 'html.parser')

        # Select all the movies url from a single page; headings without a
        # link are ignored rather than raising.
        for movie in html_soup.find_all('h2', 'meta-title'):
            link = movie.a
            if link is not None and link.get('href'):
                movie_url.append('http://www.allocine.fr{}'.format(link['href']))

        # Monitoring the requests
        print('Page Request: {}; Movie Request: {}'.format(p, len(movie_url)))
        clear_output(wait=True)

        # Pause the loop
        sleep(1)

    # Saving the file
    np.savetxt("movie_url.csv", np.asarray(movie_url), delimiter=",", fmt='%s')

In [4]:
# Collect the movie URLs from listing pages 1 to 3999 (end_page is excluded)
getMoviesUrl(start_page=1, end_page=4000)

Page Request: 3999; Movie Request: 59985


In [None]:
# Function to scrape the data from the movies urls
# The function returns a dataframe and the list of urls that failed,
# and saves them into csv files (allocine_movies.csv and allocine_errors.csv)
def _parse_movie_page(movie_html_soup):
    """Build one output row from a parsed movie page.

    Returns a dict keyed by the ScrapeURL column names, or None when the page
    has no ``div.titlebar-title`` element. Attribute lookups on missing nested
    tags raise; ScrapeURL records the URL as an error in that case.
    """
    title_tag = movie_html_soup.find('div', 'titlebar-title')
    if title_tag is None:
        return None

    # Set cursors
    the_movie = movie_html_soup.section.div.div.div
    movie_info = the_movie.select('.meta-body-item')
    rating_info = the_movie.select('.rating-item')

    # Start from the default values; fields absent from the page keep them.
    row = {
        'title': title_tag.text,
        'date_reprise': np.nan,
        'date_sortie': np.nan,
        'duration': np.nan,
        'director': [],
        'actor': [],
        'genre': [],
        'nationality': np.nan,
        'press_rating': np.nan,
        'nb_press': np.nan,
        'spec_rating': np.nan,
        'nb_spec': np.nan,
    }

    # Scrape extra info: the first <span> of each item is its label.
    for item in movie_info:
        if item.span is None:
            continue
        label = item.span.text
        if label == 'Date de reprise':
            row['date_reprise'] = dateparser.parse(item.strong.span.text)
        elif label == 'Date de sortie':
            # Drop the label; if another span remains it holds the date.
            item.span.decompose()
            if item.span:
                row['date_sortie'] = dateparser.parse(item.span.text)
                item.span.decompose()
            # Whatever text is left is kept as the duration, minus parentheses.
            row['duration'] = item.text.strip().replace('(', '').replace(')', '')
        elif label == 'De':
            row['director'] = [t.text for t in item.select("a span")]
        elif label == 'Avec':
            more = item.find('span', 'more')
            if more:
                more.decompose()
            row['actor'] = [t.text for t in item.select(".blue-link")]
        elif label in ('Genre', 'Genres'):
            row['genre'] = [t.text for t in item.select(".blue-link")]
        elif label in ('Nationalité', 'Nationalités'):
            row['nationality'] = [t.text.strip() for t in item.select('.nationality')]

    # Scrape the ratings
    for rating in rating_info:
        if rating.span is None:
            continue
        source = rating.span.text.strip()
        if source == 'Presse':
            prefix = 'press'
        elif source == 'Spectateurs':
            prefix = 'spec'
        else:
            continue
        score = rating.div.span
        # Decimal comma -> decimal point; the vote count is the first word of
        # the node that follows the score.
        row[prefix + '_rating'] = score.text.strip().replace(',', '.')
        row['nb_' + prefix] = score.next_sibling.text.strip().split(' ')[0]

    return row


def ScrapeURL(movie_url, timeout=30):
    """Scrape the data of every movie page in ``movie_url``.

    Parameters
    ----------
    movie_url : iterable of str
        Movie page URLs. Pass a list (or Series) of URLs, not a DataFrame:
        iterating a DataFrame yields its column labels, not its rows.
    timeout : float, optional
        Seconds to wait for each HTTP response before the request fails and
        the URL is recorded as an error.

    Returns
    -------
    df : pandas.DataFrame
        One row per scraped movie with the columns title, date_reprise,
        date_sortie, duration, director, actor, genre, nationality,
        press_rating, nb_press, spec_rating, nb_spec.
    errors : list of str
        URLs that returned a non-200 status or raised while being scraped.

    Both objects are also saved: ``df`` to ``allocine_movies.csv`` and
    ``errors`` to ``allocine_errors.csv`` (one URL per line, no header).
    """
    c = ['title', 'date_reprise', 'date_sortie', 'duration', 'director', 'actor', 'genre', 'nationality',
         'press_rating', 'nb_press', 'spec_rating', 'nb_spec']

    # Rows are collected in a list and turned into a DataFrame once at the
    # end; concatenating a DataFrame per movie copies all previous rows.
    rows = []

    # preparing the setting and monitoring loop
    start_time = time()
    n_request = 0

    # init list to save errors
    errors = []

    # request loop
    for url in movie_url:
        try:
            response = get(url, timeout=timeout)

            # Pause the loop
            sleep(randint(1, 2))

            # Monitoring the requests
            n_request += 1
            elapsed_time = time() - start_time
            print('Request #{}; Frequency: {} requests/s'.format(n_request, n_request / elapsed_time))
            clear_output(wait=True)

            # Pause the loop
            sleep(randint(1, 2))

            # Non-200 status codes: record the url once and move on, so the
            # error page is not parsed and the url cannot be recorded twice.
            if response.status_code != 200:
                warn('Request #{}; Status code: {}'.format(n_request, response.status_code))
                errors.append(url)
                continue

            # Parse the content of the request with BeautifulSoup
            row = _parse_movie_page(BeautifulSoup(response.text, 'html.parser'))
            if row is not None:
                rows.append(row)
        except Exception:
            # Exception (not a bare except) so that KeyboardInterrupt still
            # stops the run.
            errors.append(url)
            warn('Request #{} fail; Total errors : {}'.format(n_request, len(errors)))
            traceback.print_exc()

    df = pd.DataFrame(rows, columns=c)

    # monitoring (no clear_output afterwards, so the summary stays visible)
    elapsed_time = time() - start_time
    print('Done; {} requests in {} seconds with {} errors'.format(n_request, round(elapsed_time, 0), len(errors)))

    df.to_csv("allocine_movies.csv")
    # errors is a plain list (no to_csv method); write it in the same
    # one-url-per-line format as movie_url.csv.
    np.savetxt("allocine_errors.csv", np.asarray(errors), delimiter=",", fmt='%s')

    # return dataframe and errors
    return df, errors

In [None]:
# Load the list of urls
# movie_url.csv is written without a header row, so header=None keeps the
# first url as data; the single column is converted to a plain list because
# iterating a DataFrame yields its column labels, not its rows.
m_url = pd.read_csv("movie_url.csv", header=None)[0].tolist()

In [None]:
# Run the scraper on the loaded urls: d is the dataframe, e the list of urls in error
d, e = ScrapeURL(movie_url=m_url)