# Open susbtitles parallel dataset

Downloaded the open subtitles dataset from [opus](http://opus.nlpl.eu/OpenSubtitles-v2018.php).
More specifically these where the commands run:

```bash
wget https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/xml/en.zip
wget https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/xml/es.zip
wget https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/xml/en-es.xml.gz
```
These files where uncompresed and arranged, they sum up to ~190G:
```
125G	open_subtitles/en
60G 	open_subtitles/es
3.4G	open_subtitles/en-es.xml
```

In [1]:
import pandas as pd
import xml.etree.ElementTree as et

from pathlib import Path

In [2]:
path = Path.home()/'open_subtitles'

In [3]:
list(path.iterdir())

[PosixPath('/home/javiber/open_subtitles/es'),
 PosixPath('/home/javiber/open_subtitles/README'),
 PosixPath('/home/javiber/open_subtitles/en'),
 PosixPath('/home/javiber/open_subtitles/en-es.xml'),
 PosixPath('/home/javiber/open_subtitles/LICENSE'),
 PosixPath('/home/javiber/open_subtitles/INFO')]

## XCES file

The `en-es.xml` file is the one that links what subtitles should be paired together but it doesn't contain the text, that's on the other two files. The file is in XML and could be parsed with `etree` but that would take almost all of the 63B of memory available, leaving little room for processing loading the two subtitle files that we need to load. For that reason a more low level approach will be made where we can control the amount of memory we are using.

In [3]:
with open(path/'en-es.xml') as ces:
    # readline will return '' when is the end of the file
    lines = ces.readlines()

In [51]:
re.search(r'<link id="SL(\d+)" xtargets="([^;]*);([^"]*)" (?:overlap="([^"]+)" )?/>',
    '<link id="SL123" xtargets="1 2 3;1 2" overlap="0.123" />').groups()

('123', '1 2 3', '1 2', '0.123')

In [55]:
import re

link_grp_re = re.compile(r'^\s*<linkGrp .* fromDoc="([^"]+)" toDoc="([^"]+)".*>\s*$')
link_re = re.compile(r'<link id="SL(\d+)" xtargets="([^;]*);([^"]*)" (?:overlap="([^"]+)" )?/>')

assert link_re.search('<link id="SL2" xtargets="3;" />').groups() == ('2', '3', '', None)
assert link_re.search(
    '<link id="SL123" xtargets="1 2 3;1 2" overlap="0.123" />').groups() == ('123', '1 2 3', '1 2', '0.123')

def link_grp_generator():
    # read all lines on the file
    with open(path/'en-es.xml') as ces:
        lines = ces.readlines()
        
    ignored_grps = 0
    length = len(lines)
    i = 0
    while i < length:
        if lines[i].startswith('<linkGrp'):  # Link group start
            # parse link group
            m = link_grp_re.search(lines[i])
            if m is None:
                raise ValueError(f'could not parse link grp line \'{lines[i]}\'')
            en_file, es_file = m.groups()
            
            # parse links on this group
            links = []
            i += 1  # move to next line
            while '</linkGrp>' not in lines[i]:
                # parse link
                m = link_re.search(lines[i])
                if m is None:
                    raise ValueError(f'could not parse link line \'{lines[i]}\'')
                link_id, en_ids, es_ids, overlap = m.groups()
                # some subtitles don't have a correspondent so we ignore them
                if en_ids and es_ids and overlap is not None:
                    links.append({
                        'id': link_id,
                        'en_ids': en_ids, 
                        'es_ids': es_ids,
                        'overlap': float(overlap[0])
                    })

                i += 1  # move to next line                  

            # filter some link groups where the average overlap is not great
            # in our experience that seems to indicate that the subtitles don't align very well
            # which leads to errors.
            if len(links) and sum([x['overlap'] for x in links])/len(links) > 0.7:
                yield {"en_file": en_file, "es_file": es_file, "links": links}
            else:
                ignored_grps += 1

        i += 1  # move to next line 
    print(f"Ignored {ignored_grps} groups due to low overlap")

In [56]:
def process_lg(link_grp):
    # ignore '.gz' extension
    en_doc = link_grp['en_file'][:-3]
    es_doc = link_grp['es_file'][:-3]
    
    en_root = et.parse(path/en_doc).getroot()
    es_root = et.parse(path/es_doc).getroot()
    
    d = []
    for link in link_grp['links']:
        # split ids by space. the relation between subtitles is many to many
        en_ids = link['en_ids'].split(' ')
        es_ids = link['es_ids'].split(' ')
        
        # get all texts for both languages
        en_texts = []
        for i in en_ids:
            en_texts += [w.text for w in en_root.findall(f'.//s[@id="{i}"]/w')]
            
        es_texts = []
        for i in es_ids:
            es_texts += [w.text for w in es_root.findall(f'.//s[@id="{i}"]/w')]
            
        d.append((
            '#'.join([en_doc, es_doc]),  # link group id
            link.get('id').replace('SL', ''),  # link id
            ' '.join(en_texts),
            ' '.join(es_texts)
        ))
    return d

In [61]:
%%time
import csv

from tqdm import tqdm_notebook as tqdm
from multiprocessing import Pool

def process():
    raw_file = open('subtitles2.csv', 'w')
    csv_file = csv.writer(raw_file)
    csv_file.writerow(['files', 'sub_id', 'en', 'es'])  # headers

    buffer = []
    max_buffer = 10000
    total = 0
    with Pool(12) as p:
        for rs in tqdm(p.imap_unordered(process_lg, link_grp_generator()), total=77652, smoothing=0.1):
            buffer += rs
            if len(buffer) > max_buffer:
                csv_file.writerows(buffer)
                total += len(buffer)
                buffer = []
        if buffer:
            total += len(buffer)
            csv_file.writerows(rs)

    raw_file.close()
    print(f'wrote {total} examples')

process()

HBox(children=(IntProgress(value=0, max=77652), HTML(value='')))

Ignored 65915 groups due to low overlap
wrote 7996831 examples
CPU times: user 2min 36s, sys: 7.92 s, total: 2min 44s
Wall time: 51min 25s


In [63]:
df = pd.read_csv('subtitles2.csv')
len(df)

  interactivity=interactivity, compiler=compiler, result=result)


7990657

In [70]:
for _, x in df.sample(10).iterrows():
    print(x.en)
    print(x.es)
    print('~'*50)

Not it cares the university !
¡ No importa la universidad !
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
I said , please go away
Dije , por favor se vayan .
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Why can 't you clean up my messes ?
¿ Por qué no puedes arreglar mi desastre ?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
That was our new proximity alarm .
Eso fue nuestro nuevo La alarma de proximidad .
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
What did I do ?
¿ Qué he hecho ?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Rachel ?
Rachel ?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Yeah .
Sí .
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Um , I appreciate you seeing me .
Te agradezco que accedieras a verme .
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
With altruistic donors ?
¿ Con donantes altruistas ?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
It enhances my strength ... my focus .
Mejora mi fuerza ... mi concentración .
