In [3]:
import requests, urllib, unicodedata, os.path, glob, pandas as pd
from bs4 import BeautifulSoup
from string import digits, punctuation
from io import StringIO
from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams

In [None]:
# Characters removed during clean-up: ASCII punctuation, decimal digits and the newline.
exclude = "".join((punctuation, digits, "\n"))

In [None]:
# Lingua Franca Nova
#
# Builds 'Lfn.txt' from two sources: the paragraphs of the LFN Wikipedia article
# and the local 'corpo.txt' corpus file.

# Wikipedia article: take every <p> inside the article body.
lfn_wiki = requests.get("https://lfn.wikipedia.org/wiki/Lingua_franca_nova")
lfn_wiki.raise_for_status()  # fail here instead of scraping an HTTP error page
lfn_soup = BeautifulSoup(lfn_wiki.content, 'html.parser')
lfn_paragraphs = lfn_soup.find("div", attrs={"class": "mw-body-content"}).find_all("p")

# Remove every character listed in `exclude` and lower-case the rest.
lfn_res = [
    ''.join(char.lower() for char in p_tag.get_text() if char not in exclude)
    for p_tag in lfn_paragraphs
]
lfn_res = [line for line in lfn_res if line]  # drop paragraphs that ended up empty

# Local corpus: skip lines starting with '@', '=' or a digit, then trim `exclude`
# characters from both ends of each remaining line and lower-case it.
# (The original called the non-existent method `readlfn_corpo()`; the file is now
# simply iterated line by line.)
with open('corpo.txt', 'r', encoding='utf-8') as lfn_corpo_f:
    lfn_corpo = [
        line.strip(exclude).lower()
        for line in lfn_corpo_f
        if not line.startswith(('@', '=')) and not line[0].isdigit()
    ]

lfn_output = lfn_res + lfn_corpo

# One entry per line; the `with` block closes the file on exit.
with open('Lfn.txt', 'w', encoding='utf-8') as lfn_out_f:
    lfn_out_f.write("\n".join(lfn_output))

In [None]:
# Lojban
#
# Builds 'Lojban.txt' from the first column of 'jb2en.tsv' plus the paragraphs of
# the Lojban Wikipedia article.

# Explicit encoding so the result does not depend on the platform default.
# NOTE(review): assumes jb2en.tsv is UTF-8 encoded - confirm.
with open('jb2en.tsv', encoding='utf-8') as lojban_f:
    # Keep only the text before the first tab of each row.
    lojban_text = [line.split('\t')[0].strip() for line in lojban_f]

lojban_wiki = requests.get("https://jbo.wikipedia.org/wiki/lo_jbobau")
lojban_wiki.raise_for_status()  # fail here instead of scraping an HTTP error page
lojban_soup = BeautifulSoup(lojban_wiki.content, 'html.parser')
lojban_paragraphs = lojban_soup.find("div", attrs={"class": "mw-parser-output"}).find_all("p")
# Flatten each paragraph onto a single lower-cased line.
lojban_res = [p_tag.get_text().strip().replace("\n", " ").lower() for p_tag in lojban_paragraphs]
lojban_output = lojban_text + lojban_res

# One entry per line; the `with` block closes the file on exit.
with open('Lojban.txt', 'w', encoding='utf-8') as lojban_out_f:
    lojban_out_f.write("\n".join(lojban_output))

In [None]:
# Interlingua
#
# Builds 'Interlingua.txt' from the non-English lines of the parallel-sentence file.

# Disabled Wikipedia scrape, kept for reference:
# response = requests.get("https://ia.wikipedia.org/wiki/Interlingua")
# soup = BeautifulSoup(response.content, 'html.parser')
# res = soup.find("div", attrs={"class": "mw-body-content"}).findAll("p")
# res = [unicodedata.normalize('NFKC', p_tag.get_text()) for p_tag in res]
# res = [''.join(word.lower() for word in lines) for lines in res]

ia_tag = '[ina]'  # language marker as it appears after lower-casing
ia_sentences = []
with open('English_and_Interlingua_Parallel_Sentences.txt', 'r', encoding='utf-8') as ia_file:
    for line in ia_file:
        if '[ENG]' in line:
            continue  # English half of the sentence pair
        sentence = line.lower().strip()
        # Remove the marker as a whole prefix. str.strip('[ina]') treats its argument
        # as a set of characters and can also eat leading/trailing 'i', 'n' and 'a'
        # letters that belong to the sentence itself.
        if sentence.startswith(ia_tag):
            sentence = sentence[len(ia_tag):].strip()
        if sentence:
            ia_sentences.append(sentence)

# output = res + lines

# One sentence per line; the `with` block closes the file on exit.
with open('Interlingua.txt', 'w', encoding='utf-8') as ia_out_f:
    ia_out_f.write("\n".join(ia_sentences))

In [None]:
# Esperanto
#
# Builds 'Esperanto.txt' from the paragraphs of every HTML file in ./esperanto_corpus.

# Disabled Wikipedia / Bible scrapes, kept for reference:
# response = requests.get("https://eo.wikipedia.org/wiki/Esperanto")
# wiki_soup = BeautifulSoup(response.content, 'html.parser')
# wiki_res = wiki_soup.find("div", attrs={"class": "mw-parser-output"}).findAll("p")
# wiki_res = [p_tag.get_text().strip().replace("\n", " ").lower() for p_tag in wiki_res]

# content = urllib.request.urlopen('https://www.sacred-texts.com/bib/wb/esp/co1.htm')

# read_content = content.read()
# bible_soup = BeautifulSoup(read_content, 'html.parser')

# [a_tag.decompose() for a_tag in bible_soup.find_all('a')]
# bible_res = [p_tag.get_text().strip().replace("\n", " ").lower() for p_tag in bible_soup.find_all("p")]
eo_sentences = []

# Sorted so the output is identical from run to run; glob returns files in arbitrary order.
for eo_file in sorted(glob.glob(os.path.join('./esperanto_corpus', '*.html'))):
    # Read bytes and let BeautifulSoup detect the document encoding, instead of decoding
    # with the platform default. The parser is named explicitly so the result does not
    # depend on which parsers happen to be installed.
    with open(eo_file, 'rb') as eo_html_f:
        eo_soup = BeautifulSoup(eo_html_f, 'html.parser')

    # Drop the 'tekstokapo' div before collecting paragraphs; tolerate files without one.
    remove = eo_soup.find("div", attrs={"class": "tekstokapo"})
    if remove is not None:
        remove.extract()

    eo_body = eo_soup.find("div", attrs={"class": "tekstarteksto"})
    eo_res = [unicodedata.normalize('NFKC', tag.get_text().lower()) for tag in eo_body.find_all('p')]
    eo_sentences.extend(eo_res)

# One paragraph per entry; the `with` block closes the file on exit.
with open('Esperanto.txt', 'w', encoding='utf-8') as eo_out_f:
    eo_out_f.write("\n".join(eo_sentences))

In [None]:
# Lingwa de Planeta
#
# Builds 'LdP.txt' from two translated PDFs. Each PDF is converted to HTML with
# pdfminer, and the text of the <span> elements carrying a given font style is kept.


def _ldp_pdf_to_soup(pdf_path):
    """Convert the PDF at `pdf_path` to HTML with pdfminer and return it parsed."""
    html_buffer = StringIO()
    with open(pdf_path, 'rb') as pdf_f:
        extract_text_to_fp(pdf_f, html_buffer, laparams=LAParams(),
                           output_type='html', codec=None)
    # Explicit parser so the result does not depend on which parsers are installed.
    return BeautifulSoup(html_buffer.getvalue(), 'html.parser')


def _ldp_span_texts(soup, style):
    """Return the flattened, lower-cased text of every <span> whose style equals `style`."""
    return [
        span.get_text().strip().replace("\n", " ").lower()
        for span in soup.find_all("span", style=style)
        if "\nPage" not in span.get_text()  # skip spans containing a "Page" line
    ]


# Humanist Speech PDF
ldp_soup_1 = _ldp_pdf_to_soup('humanist_speech_trans.pdf')
ldp_text_1 = _ldp_span_texts(ldp_soup_1, "font-family: font00000000296ca43f; font-size:12px")

# Animal Farm PDF
# (The original searched for the tag name "spa", which matches nothing, so this
# text was always empty.)
ldp_soup_2 = _ldp_pdf_to_soup('animal_farm_trans.pdf')
ldp_text_2 = _ldp_span_texts(ldp_soup_2, "font-family: font00000000296cac0e; font-size:12px")

# Join both texts with a single space so the last word of the first text is not
# fused with the first word of the second.
with open('LdP.txt', 'w', encoding='utf-8') as ldp_out_f:
    ldp_out_f.write(" ".join(ldp_text_1 + ldp_text_2))

In [None]:
# Klingon
#
# Builds 'Klingon.txt' from the text of <b> elements on the klingon.wiki phrase page.

# Context manager so the HTTP response is closed once its body has been read.
with urllib.request.urlopen('http://klingon.wiki/En/TheKlingonWayPhrases') as klingon_content:
    read_content = klingon_content.read()
klingon_soup = BeautifulSoup(read_content, 'html.parser')

with open('Klingon.txt', 'w', encoding='utf-8') as klingon_f:
    # NOTE(review): presumably only the first 156 <b> tags are phrases - confirm
    # against the current page before relying on this limit.
    for sen in klingon_soup.find_all("b", limit=156):
        klingon_f.write(sen.get_text() + " ")

In [None]:
# Dothraki
#
# Builds 'Dothraki.txt' from the bold 10px spans of the dictionary PDF.

# Convert the PDF to HTML so that font information is available as span styles.
dothraki_outstr = StringIO()
with open('Dothraki_dic.pdf', 'rb') as dothraki_f:
    extract_text_to_fp(dothraki_f, dothraki_outstr, laparams=LAParams(),
                       output_type='html', codec=None)

# Explicit parser, matching the other cells, so the result does not depend on
# which parsers happen to be installed.
dothraki_soup = BeautifulSoup(dothraki_outstr.getvalue(), 'html.parser')

# NOTE(review): the limit of 1455 spans presumably marks the end of the word list -
# confirm against the PDF. Spans whose text contains a newline are skipped.
dothraki_spans = dothraki_soup.find_all(
    ["span"],
    style="font-family: URWPalladioL-Bold; font-size:10px",
    limit=1455,
)
dothraki_res = [word.get_text() for word in dothraki_spans if "\n" not in word.get_text()]

with open('Dothraki.txt', 'w', encoding='utf-8') as dothraki_out_f:
    dothraki_out_f.write(" ".join(dothraki_res))