# Web scraper to retrieve information from CORELE corpus
#### IMPORTANT: If you are about to use this corpus, you first need to ask for permissions to corpus responsible

They are working on making the audios and transcription easily accessible and this may imply changing the website structure. This code is working on May 12th.

It will access all the available interviews, download their audio files and extract sentences for the interviewed non-native speakers if and only if they have independent time markers for beggining and end of the sentence.

In [110]:
import requests
from xml.etree import ElementTree
import re
import time
import json

In [47]:
main = requests.get("http://cartago.lllf.uam.es/corele/corpus_es.html")

In [48]:
main_root = ElementTree.fromstring(main.content)

In [136]:
# Obtain all interviews and their URLs
urls = []
for child in main_root.iter('{http://www.w3.org/1999/xhtml}a'):
    if (child.get('href').startswith('./interviews')):
        urls.append(child.get('href').replace('./', 'http://cartago.lllf.uam.es/corele/'))

In [137]:
def get_tree (url):
    resp = requests.get(url)
    return ElementTree.fromstring(resp.content)

In [138]:
def get_audio_file (root):
    for child in root.iter():
        if (child.tag == '{http://www.w3.org/1999/xhtml}object'):
            audio_url = child.get('data').replace('dewplayer.swf?mp3=../', 'http://cartago.lllf.uam.es/corele/interviews/').replace('&showtime=1', '')
            audio_file = requests.get(audio_url)
            print('Getting audio from {} with name {}'.format(audio_url, audio_file))
            interview_name = audio_url.replace('http://cartago.lllf.uam.es/corele/interviews/MP3/', '')
            open('./audios/{}'.format(interview_name), 'wb').write(audio_file.content)
            return interview_name

In [139]:
def obtain_sentences (root):
    time_pattern = re.compile("[0-9]{1,2}\:[0-9]{2}")
    sentences = []

    current_dict = {"start_time": '0:00',
                    "end_time": '0:00',
                    "num_sentences": 0,
                    "sentences": []}

    for child in root.iter():   
        
        if child.text:
            if (child.tag=='{http://www.w3.org/1999/xhtml}span' and time_pattern.match(child.text)):
                if (current_dict['num_sentences']):
                    current_dict['end_time'] = child.text.replace('\n', '')
                    sentences.append(current_dict)

                current_dict = {"start_time": child.text.replace('\n', ''),
                           "num_sentences": 0,
                           "sentences": []}

            elif (child.tag == '{http://www.w3.org/1999/xhtml}p'):
                current_dict['num_sentences']+=1
                current_dict['sentences'].append(child.text.replace('\n', ''))

    return sentences

In [140]:
def filter_sentences (sentences: list):
    filtered_sentences = []

    for s in sentences:
        save = True
        for i in s['sentences']:
            if (i.startswith('*ENT')):
                save=False
        if (save):
            filtered_sentences.append(s)
    
    return filtered_sentences

In [143]:
def store_sentences (sentences, interview_name):
    print('Dumping transcription for {}'.format(interview_name))
    with open('./transcriptions/{}.json'.format(interview_name), 'w') as f:
        json.dump(sentences, f)

In [142]:
def get_interview (root):
    interview_name = get_audio_file(root)
    print('Getting {}'.format(interview_name))
    sentences = obtain_sentences(root)
    filtered_sentences = filter_sentences(sentences)
    store_sentences(filtered_sentences, interview_name)

In [114]:
roots = []

for url in urls:
    print("Visiting [{}/{}]: {}".format(urls.index(url), len(urls), url))
    roots.append(get_tree(url))
    time.sleep(2)

Visiting [0/40]: http://cartago.lllf.uam.es/corele/interviews/ES/PORMA2.html
Visiting [1/40]: http://cartago.lllf.uam.es/corele/interviews/ES/PORWA2_1.html
Visiting [2/40]: http://cartago.lllf.uam.es/corele/interviews/ES/PORWA2_2.html
Visiting [3/40]: http://cartago.lllf.uam.es/corele/interviews/ES/PORWB1.html
Visiting [4/40]: http://cartago.lllf.uam.es/corele/interviews/ES/ITAMA2.html
Visiting [5/40]: http://cartago.lllf.uam.es/corele/interviews/ES/ITAWA2.html
Visiting [6/40]: http://cartago.lllf.uam.es/corele/interviews/ES/ITAMB1.html
Visiting [7/40]: http://cartago.lllf.uam.es/corele/interviews/ES/ITAWB1.html
Visiting [8/40]: http://cartago.lllf.uam.es/corele/interviews/ES/FREMA2.html
Visiting [9/40]: http://cartago.lllf.uam.es/corele/interviews/ES/FREWA2.html
Visiting [10/40]: http://cartago.lllf.uam.es/corele/interviews/ES/FREMB1.html
Visiting [11/40]: http://cartago.lllf.uam.es/corele/interviews/ES/FREWB1.html
Visiting [12/40]: http://cartago.lllf.uam.es/corele/interviews/ES/CHIW

In [144]:
for root in roots:
    print('Visiting {}/{}'.format(roots.index(root), len(roots)))
    try:
        get_interview(root)
    except:
        print('ERROR FOR INDEX {}'.format(roots.index(root)))
    time.sleep(1)

Visiting 0/40
Getting audio from http://cartago.lllf.uam.es/corele/interviews/MP3/PORMA2.mp3 with name <Response [200]>
Getting PORMA2.mp3
Dumping transcription for PORMA2.mp3
Visiting 1/40
Getting audio from http://cartago.lllf.uam.es/corele/interviews/MP3/PORWA2_1.mp3 with name <Response [200]>
Getting PORWA2_1.mp3
Dumping transcription for PORWA2_1.mp3
Visiting 2/40
Getting audio from http://cartago.lllf.uam.es/corele/interviews/MP3/PORWA2_2.mp3 with name <Response [200]>
Getting PORWA2_2.mp3
Dumping transcription for PORWA2_2.mp3
Visiting 3/40
Getting audio from http://cartago.lllf.uam.es/corele/interviews/MP3/PORWB1.mp3 with name <Response [200]>
Getting PORWB1.mp3
Dumping transcription for PORWB1.mp3
Visiting 4/40
Getting audio from http://cartago.lllf.uam.es/corele/interviews/MP3/ITAMA2.mp3 with name <Response [200]>
Getting ITAMA2.mp3
Dumping transcription for ITAMA2.mp3
Visiting 5/40
Getting audio from http://cartago.lllf.uam.es/corele/interviews/MP3/ITAWA2.mp3 with name <Resp