In [8]:
import requests
from bs4 import BeautifulSoup
import json

In [13]:
_URL = "https://multimedia.europarl.europa.eu/ga/search?sn=true&st=EPV_EDITED_VIDEOS-WS_VIDEO&ut=EPV_REPLAY-EPV_VIDEO_FOOTAGE-EPV_PHOTO-EPV_AUDIO&ol=EPV_EDITED_VIDEOS&lg=ga_IE&at=1&p_p_id=advanced_search_portlet_AdvancedSearchPortlet&_advanced_search_portlet_AdvancedSearchPortlet_p="

In [102]:
def get_soup(num="1", timeout=30):
    """Download one page of search results and parse it.

    Args:
        num: Page number appended to ``_URL``. May be a string or an int.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``BeautifulSoup`` tree of the response body, parsed with lxml.

    Raises:
        Exception: If the server answers with a status code other than 200.
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    # Accept ints too, so callers iterating over range() need not convert.
    num = str(num)
    # Without an explicit timeout requests may wait forever on a stalled server.
    req = requests.get(_URL + num, timeout=timeout)
    if req.status_code != 200:
        raise Exception("Problem scraping page " + num)
    return BeautifulSoup(req.content, "lxml")

In [85]:
def get_last_page_number(soup):
    """Find the number of the last results page in a parsed search page.

    Looks inside every ``<li class="last">`` element for an anchor whose
    ``href`` mentions ``AdvancedSearchPortlet_p`` and returns whatever follows
    the final ``=`` of that link.

    Args:
        soup: Parsed search-results page (anything offering ``find_all``).

    Returns:
        The page number as a string, or ``None`` when no such link exists.
    """
    for last_candidate in soup.find_all("li", {"class": "last"}):
        for anchor in last_candidate.find_all("a"):
            # .get() yields None for anchors without an href attribute,
            # where anchor["href"] would raise KeyError.
            href = anchor.get("href") or ""
            if "AdvancedSearchPortlet_p" in href:
                return href[href.rfind("=") + 1:]
    return None

In [71]:
def get_video_urls(videos):
    """Collect absolute video-page URLs from media preview elements.

    Args:
        videos: Iterable of parsed elements (each offering ``find_all``).

    Returns:
        A list of URLs, one per ``<a class="europarltv-link">`` found, each
        prefixed with the site origin. Anchors without an ``href`` are skipped.
    """
    video_urls = []
    for video in videos:
        # The attribute filter has to be a dict (attribute name -> value).
        # A set literal {"class", "europarltv-link"} is not an attribute
        # mapping and would only match by accident.
        for url in video.find_all("a", {"class": "europarltv-link"}):
            href = url.get("href")
            if href:
                video_urls.append("https://multimedia.europarl.europa.eu" + href)
    return video_urls

In [98]:
def scrape_video_page(num="1", soup=None):
    """Return the video-page URLs listed on one page of search results.

    Args:
        num: Page number to download when ``soup`` is not supplied. May be a
            string or an int.
        soup: Already parsed results page. When given, nothing is downloaded
            and ``num`` is ignored.

    Returns:
        A list of absolute video-page URLs, as produced by ``get_video_urls``.

    Raises:
        Exception: If the page has to be downloaded and the server does not
            answer with status 200 (raised by ``get_soup``).
    """
    if soup is None:
        # Reuse the shared fetch-and-parse helper instead of repeating it here.
        soup = get_soup(str(num))
    videos = soup.find_all("div", {"class": "media-preview"})
    return get_video_urls(videos)

In [108]:
# Download the first results page once and read the total page count from it.
soup = get_soup()
last = get_last_page_number(soup)
if last is None:
    # Fail with a clear message rather than an opaque TypeError from int(None).
    raise Exception("Could not find the last page number on the first results page")
# Page 1 is already parsed, so hand it over instead of downloading it again.
videos = scrape_video_page(num = "1", soup = soup)
for num in range(2, int(last) + 1):
    videos += scrape_video_page(num = str(num))

In [34]:
# Template for the JSON body POSTed to the multirequest endpoint by
# get_json_body. It bundles four numbered sub-requests: start a widget
# session, list the base entry, get its playback context, and list metadata.
# DUMMY_ENTRY_ID is a placeholder that get_json_body replaces with the real
# entry id; newlines are stripped there as well before sending.
# NOTE(review): tokens such as {1:result:ks} presumably refer to the result of
# an earlier sub-request and are resolved by the server -- confirm against the
# API documentation.
_JSON_REQUEST = """
{
"1":{"service":"session","action":"startWidgetSession","widgetId":"_102"},
"2":{"service":"baseEntry","action":"list","ks":"{1:result:ks}",
"filter":{"redirectFromEntryId":"DUMMY_ENTRY_ID"},
"responseProfile":{"type":1,
"fields":"id,referenceId,name,description,thumbnailUrl,dataUrl,duration,msDuration,flavorParamsIds,mediaType,type,tags,dvrStatus,externalSourceType,status"}},
"3":{"service":"baseEntry","action":"getPlaybackContext",
"entryId":"{2:result:objects:0:id}","ks":"{1:result:ks}",
"contextDataParams":{"objectType":"KalturaContextDataParams","flavorTags":"all"}},
"4":{"service":"metadata_metadata","action":"list",
"filter":{"objectType":"KalturaMetadataFilter",
"objectIdEqual":"DUMMY_ENTRY_ID","metadataObjectTypeEqual":"1"},
"ks":"{1:result:ks}"},"apiVersion":"3.3.0","format":1,"ks":"",
"clientTag":"html5:v0.53.7","partnerId":102
}
"""

In [115]:
_MULT_HEADERS = {
    "Content-Type": "application/json",
    "Origin": "https://multimedia.europarl.europa.eu",
    "Sec-Fetch-Site": "same-site",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Dest": "empty"    
}

In [112]:
def get_vid_id(url, timeout=30):
    """Extract the player entry id from a video page.

    Downloads ``url``, reads the ``content`` of its ``og:video`` meta tag and
    returns the text between ``entryId/`` and ``/v/`` in that value.

    Args:
        url: Address of the video page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The entry id as a string.

    Raises:
        Exception: If the server answers with a status code other than 200.
        ValueError: If the page has no usable ``og:video`` meta tag or its
            content holds no ``entryId/`` segment.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        # Report the URL being fetched; there is no page number in this scope.
        raise Exception("Problem scraping page " + url)
    soup = BeautifulSoup(response.content, "lxml")
    ogvid = soup.find("meta", {"property": "og:video"})
    cont = ogvid.get("content") if ogvid is not None else None
    if not cont:
        raise ValueError("No og:video meta tag found on " + url)
    if "entryId/" not in cont:
        raise ValueError("No entryId in og:video content of " + url)
    cont = cont.split("entryId/")[1]
    cont = cont.split("/v/")[0]
    return cont

In [109]:
def get_json_body(vid_id, timeout=30):
    """Query the multirequest endpoint for one video and decode the answer.

    Fills ``vid_id`` into the ``_JSON_REQUEST`` template (after stripping its
    newlines) and POSTs it with ``_MULT_HEADERS``.

    Args:
        vid_id: Entry id substituted for every ``DUMMY_ENTRY_ID`` placeholder.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON response.

    Raises:
        Exception: If the server answers with a status code other than 200.
        ValueError: If the response body is not valid JSON.
    """
    actual_json = _JSON_REQUEST.replace("\n", "").replace("DUMMY_ENTRY_ID", vid_id)
    response = requests.post(
        "https://kmc.europarltv.europa.eu/api_v3/service/multirequest",
        headers=_MULT_HEADERS,
        data=actual_json,
        timeout=timeout,
    )
    # Same status check as the page-scraping helpers, so an HTTP error page is
    # reported as such instead of surfacing as a JSON decoding failure.
    if response.status_code != 200:
        raise Exception("Problem fetching metadata for video " + vid_id)
    return json.loads(response.content)

In [67]:
def get_subtitles(body):
    """Map caption language codes to WebVTT URLs found in a response.

    Args:
        body: Iterable of response parts. Only dict parts that carry a
            ``playbackCaptions`` list are inspected; everything else is
            ignored.

    Returns:
        A dict ``{languageCode: webVttUrl}`` with one entry per caption that
        has both values. A later caption overwrites an earlier one that has
        the same language code.
    """
    subtitles = {}
    for part in body:
        # On a non-dict part (e.g. a string) the membership test would be a
        # substring check and the lookup would then fail, so skip those.
        if not isinstance(part, dict):
            continue
        # `or []` also covers a key that is present but null.
        for subtitle in part.get('playbackCaptions') or []:
            if not isinstance(subtitle, dict):
                continue
            lang_code = subtitle.get('languageCode')
            webvtt = subtitle.get('webVttUrl')
            if webvtt is not None and lang_code is not None:
                subtitles[lang_code] = webvtt
    return subtitles

In [93]:
def get_video(body):
    """Return the first ``.mp4`` source URL found in a response.

    Args:
        body: Iterable of response parts. Only dict parts that carry a
            ``sources`` list are inspected.

    Returns:
        The ``url`` of the first source whose URL ends in ``.mp4``, or
        ``None`` when there is none.
    """
    for part in body:
        # Skip parts that cannot hold a "sources" key (strings, numbers, ...).
        if not isinstance(part, dict):
            continue
        for source in part.get("sources") or []:
            # A source lacking a "url" entry is ignored rather than raising.
            url = source.get("url") if isinstance(source, dict) else None
            if isinstance(url, str) and url.endswith(".mp4"):
                return url
    return None

In [None]:
# Build one record per video page: its URL, its entry id, the mp4 source URL
# and the subtitle tracks reported for it.
data = []
for vid in videos:
    vid_id = get_vid_id(vid)
    body = get_json_body(vid_id)
    item = {
        "url": vid,
        "id": vid_id,
        "video": get_video(body),
        "vtts": get_subtitles(body),
    }
    data.append(item)