# Scrape Data Sources

This notebook combines several separate scripts that scrape the data sources used for our model, followed by the translation workflow that is applied to all files. The data sources covered in this notebook are:

Sources scraped direct from web/csv:
* Amis ILRDF.org Videos
* Paiwan ILRDF.org Videos
* Amis Glosbe
* Amis Virginia Fey

Sources that were transposed from existing XML or .txt files:
* Amis ePark
* Paiwan ePark
* Amis Bible
* Amis 2022 Study

Sources that were manually aligned:
* Amis Apology
* Paiwan Apology

Hunter Scheppat | scheppat@bc.edu


#### Preliminaries

In [None]:
# Install necessary libraries
!pip install requests beautifulsoup4

In [None]:
from google.colab import drive
import requests
from bs4 import BeautifulSoup
import os

# Mount Google Drive
drive.mount('/content/drive')

## 1. Scrape the Amis & Paiwan Videos

When scraping the Amis/Paiwan videos, we decided to simply download the entire HTML of the site page. Why? Because we could not get our script to work with the 'load more' button at the bottom of the page, which caused us to miss many videos. This script could probably be improved to work with that button.

To start, we parse the downloaded HTML of the site for all the video IDs, then navigate to each ID and scrape it.

#### 1a. Parse HTML

In [None]:
from google.colab import drive
import requests
from bs4 import BeautifulSoup
import os

# Mount Google Drive
drive.mount('/content/drive')

# Base URL for scraping; the numeric page id is appended to it for each video
base_url = 'https://ailt.ilrdf.org.tw/colloquial/'

# Set the directory within Google Drive for files and HTML source
drive_base_path = '/content/drive/MyDrive/formosan_mt_project/translations/'
save_directory = os.path.join(drive_base_path, 'amis_videos')

# Ensure the save directory exists. exist_ok=True does this in one call and
# avoids the gap between a separate exists() check and makedirs().
os.makedirs(save_directory, exist_ok=True)

# Collect the page ids listed in a saved copy of the video index page
def get_pages_to_scrape(html_file_path):
    """Return the ``data-id`` value of every tagged element in a saved HTML file.

    Args:
        html_file_path: Path to the UTF-8 HTML dump of the listing page.

    Returns:
        A list of ints, one per element that carries a ``data-id`` attribute.
        The number of ids found is also printed.
    """
    with open(html_file_path, 'r', encoding='utf-8') as handle:
        soup = BeautifulSoup(handle.read(), 'html.parser')

    data_ids = []
    for tag in soup.find_all(attrs={'data-id': True}):
        data_ids.append(int(tag['data-id']))

    print('Total files: ' + str(len(data_ids)))
    return data_ids

# Tell whether a page is one of the known reversed-format video pages
def is_reversed_format(page_number):
    """Return True when ``page_number`` is a known reversed-format page."""
    # Known reversed format video page numbers
    return page_number in (686, 708, 724, 738)

# Modified scraping function
def scrape_translations(page_number):
    """Download one video page and write its sentence pairs to two parallel files.

    Writes ``<page_number>-indigenous.txt`` and ``<page_number>-chinese.txt``
    into ``save_directory``, one sentence per line, so that line i of one file
    corresponds to line i of the other. Pages whose two files already exist
    are skipped, which lets an interrupted scrape be resumed.

    Args:
        page_number: Id appended to ``base_url`` to form the page URL.

    Returns:
        None. Failures are reported with ``print`` and the page is skipped.
    """
    ind_filename = os.path.join(save_directory, f"{page_number}-indigenous.txt")
    zh_filename = os.path.join(save_directory, f"{page_number}-chinese.txt")

    # Skip if files exist
    if os.path.exists(ind_filename) and os.path.exists(zh_filename):
        print(f"Skipping {page_number}: files already exist.")
        return

    url = f"{base_url}{page_number}"
    try:
        # Without a timeout a stalled connection would hang the whole run.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch {url}: {exc}")
        return
    if response.status_code != 200:
        print(f"Failed to fetch {url}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    translation_inds = soup.find_all('div', class_='translation_ind')
    translation_zhs = soup.find_all('div', class_='translation_zh')

    if not translation_inds or not translation_zhs:
        # Do not create empty files: the "files already exist" check above
        # would then hide this page on every later run.
        print(f"No translations found at {url}; nothing written.")
        return

    # zip() below stops at the shorter list, so make a mismatch visible.
    if len(translation_inds) != len(translation_zhs):
        print(
            f"Warning: {url} has {len(translation_inds)} indigenous blocks and "
            f"{len(translation_zhs)} Chinese blocks; unpaired blocks are dropped."
        )

    reversed_format = is_reversed_format(page_number)

    def one_line(text):
        # An embedded line break would shift every following pair by one line.
        return ' '.join(text.splitlines())

    # Build all pairs before touching the output files so that a parsing error
    # cannot leave a half-written pair behind.
    pairs = []
    for ind_div, zh_div in zip(translation_inds, translation_zhs):
        if reversed_format:
            # In reversed format, swap the roles of ind and zh
            ind_line = ' '.join(zh_div.get_text(strip=True).split())
            zh_line = ind_div.get_text(strip=True)
        else:
            # Normal format processing
            ind_line = ' '.join(
                span.text for span in ind_div.find_all('span', class_='ind_dictionary')
            )
            zh_line = zh_div.get_text(strip=True)
        pairs.append((one_line(ind_line), one_line(zh_line)))

    with open(ind_filename, 'w', encoding='utf-8') as ind_file, \
            open(zh_filename, 'w', encoding='utf-8') as zh_file:
        for ind_line, zh_line in pairs:
            ind_file.write(ind_line + '\n')
            zh_file.write(zh_line + '\n')

def main():
    """Scrape every page id found in the saved ``html.txt`` listing."""
    listing_path = os.path.join(drive_base_path, 'html.txt')
    for page_id in get_pages_to_scrape(listing_path):
        scrape_translations(page_id)


if __name__ == '__main__':
    main()


#### 1b. Clean files to remove code switching

In [None]:
# Clean the files and maintain the parallel structure

import os
import re

def contains_english_characters(string):
    """Return True if ``string`` contains at least one ASCII letter (a-z, A-Z)."""
    return re.search('[a-zA-Z]', string) is not None

def contains_chinese_characters(string):
    """Return True if ``string`` has any character in the range U+4E00..U+9FFF."""
    match = re.search('[\u4e00-\u9fff]', string)
    return match is not None

def process_file_pair(amis_file_path, chinese_file_path):
    """Drop code-switched or blank line pairs from two parallel files, in place.

    A pair (line i of each file) is removed when the Chinese line contains
    ASCII letters, the indigenous line contains Chinese characters, or either
    line is blank. Both files are then overwritten with the surviving pairs,
    so they stay line-aligned.

    Args:
        amis_file_path: Path to the indigenous-language file (UTF-8).
        chinese_file_path: Path to the matching Chinese file (UTF-8).

    Raises:
        AssertionError: If the two files do not have the same number of lines.
    """
    with open(amis_file_path, 'r', encoding='utf-8') as amis_file, \
            open(chinese_file_path, 'r', encoding='utf-8') as chinese_file:
        amis_lines = amis_file.readlines()
        chinese_lines = chinese_file.readlines()

    # Raised explicitly rather than with `assert` so the check still runs under
    # `python -O`; it guards the destructive overwrite below. The exception
    # type is kept as AssertionError for existing callers.
    if len(amis_lines) != len(chinese_lines):
        raise AssertionError("Files are not parallel!")

    # Determine which pairs to keep
    kept_pairs = [
        (amis_line, chinese_line)
        for amis_line, chinese_line in zip(amis_lines, chinese_lines)
        if not (
            contains_english_characters(chinese_line)
            or contains_chinese_characters(amis_line)
            or amis_line.strip() == ""
            or chinese_line.strip() == ""
        )
    ]

    # Rewrite the files without the removed lines
    with open(amis_file_path, 'w', encoding='utf-8') as amis_file, \
            open(chinese_file_path, 'w', encoding='utf-8') as chinese_file:
        for amis_line, chinese_line in kept_pairs:
            amis_file.write(amis_line)
            chinese_file.write(chinese_line)

def main():
    """Clean every ``<n>-indigenous.txt`` / ``<n>-chinese.txt`` pair in the Amis video folder."""
    directory = '/content/drive/MyDrive/formosan_mt_project/translations/amis_videos/'

    # Assumes a consistent naming convention with all files in one directory.
    # Pairs are collected first and processed afterwards.
    file_pairs = []
    for filename in os.listdir(directory):
        if 'indigenous' not in filename:
            continue
        num = filename.split('-')[0]
        chinese_file_path = os.path.join(directory, f'{num}-chinese.txt')
        if not os.path.exists(chinese_file_path):
            continue
        amis_file_path = os.path.join(directory, f'{num}-indigenous.txt')
        file_pairs.append((amis_file_path, chinese_file_path))

    for amis_file, chinese_file in file_pairs:
        process_file_pair(amis_file, chinese_file)
        print(f'Processed {amis_file} and {chinese_file}')


if __name__ == '__main__':
    main()


#### 1c. Validate files are still parallel after cleaning

In [None]:
# Validate that files are still parallel after cleaning
def validate_file_pairs(directory):
    """Print, for each file pair in ``directory``, whether the line counts match.

    A pair is ``<n>-indigenous.txt`` plus ``<n>-chinese.txt``, where ``<n>`` is
    the text before the first ``-`` of any file name containing ``indigenous``.
    Names whose Chinese counterpart does not exist are ignored.

    Args:
        directory: Folder holding the parallel text files.

    Returns:
        None; results are reported with ``print`` only.
    """
    def count_lines(path):
        # Number of lines as seen by readlines() on the UTF-8 decoded file
        with open(path, 'r', encoding='utf-8') as handle:
            return len(handle.readlines())

    prefixes = [name.split('-')[0] for name in os.listdir(directory) if 'indigenous' in name]

    file_pairs = []
    for num in prefixes:
        chinese_file_path = os.path.join(directory, f'{num}-chinese.txt')
        if os.path.exists(chinese_file_path):
            file_pairs.append((os.path.join(directory, f'{num}-indigenous.txt'), chinese_file_path))

    for amis_file, chinese_file in file_pairs:
        amis_count = count_lines(amis_file)
        chinese_count = count_lines(chinese_file)
        if amis_count == chinese_count:
            print(f'Validation passed for files: {amis_file} and {chinese_file}. Both files have {amis_count} lines.')
        else:
            print(f'Validation failed for files: {amis_file} and {chinese_file}. Line count mismatch: {amis_count} (Amis) vs {chinese_count} (Chinese)')

def main():
    """Run the line-count validation on the Amis video folder."""
    # Cleaning (section 1b) is expected to have run already; this step only
    # validates the result.
    validate_file_pairs('/content/drive/MyDrive/formosan_mt_project/translations/amis_videos/')


if __name__ == '__main__':
    main()



**THE PROCESS ABOVE CAN BE REPEATED FOR THE PAIWAN VIDEOS**

## 2. Scrape the Amis Glosbe.com dictionary
Glosbe is tricky: it does not let you view all the sentences at once; you have to query a word. So we query the 50 most common Amis words, collect all the example sentences returned for each, and then remove duplicates.

In [None]:
# Top 50 Amis words as (word, count) pairs, listed in descending count order.
# NOTE(review): the counts are presumably corpus frequencies - confirm the source.
word_freq_list = [('to', 10711), ('i', 10252), ('ko', 10188), ('a', 8331), ('no', 7030), ('o', 5346), ('sa', 3452), ('haw', 3330), ('ku', 3062), ('han', 2867), ('tu', 2765), ('ako', 2564), ('ira', 2501), ('kako', 2426), ('nu', 2137), ('u', 2132), ('ho', 2074), ('hay', 1577), ('ano', 1516), ('kora', 1483), ('caay', 1423), ('itini', 1379), ('itiya', 1350), ('kiya', 1335), ('nira', 1251), ('kami', 1218), ('mako', 1144), ('kira', 1133), ('niyam', 1108), ('saan', 1058), ('wawa', 1019), ('ci', 1017), ('san', 994), ('saka', 980), ('sato', 972), ('ya', 969), ('awaay', 938), ('sanay', 917), ('ka', 898), ('hananay', 884), ('kita', 827), ('mita', 781), ('ta', 771), ('matoʼasay', 768), ('sowal', 761), ('niyaro', 742), ('aca', 730), ('anini', 674), ('tayra', 670), ('ha', 665)]
# Only the words are kept. This is a set, so the counts and the list order are
# discarded and iteration order is not guaranteed to follow the list.
top_words = {word for word, freq in word_freq_list}

#### Query these and save

In [None]:
import requests
from bs4 import BeautifulSoup
import os
from google.colab import drive

drive.mount('/content/drive', force_remount=True)

# Base URL for Glosbe Amis -> Chinese lookups; scrape_for_word appends the query word to it
base_url = "https://glosbe.com/ami/zh/"

# Paths for the two parallel output files (opened in append mode by save_sentences)
indigenous_path = '/content/drive/MyDrive/formosan_mt_project/translations/amis_glosbe/glosbe-indigenous.txt'
chinese_path = '/content/drive/MyDrive/formosan_mt_project/translations/amis_glosbe/glosbe-chinese.txt'

# Helper that appends sentence pairs to both output files in lockstep
def save_sentences(indigenous_sentences, chinese_sentences):
    """Append paired sentences to the two Glosbe output files.

    Args:
        indigenous_sentences: Amis sentences, one per pair.
        chinese_sentences: Chinese sentences, in the same order.

    Pairs are formed with ``zip``, so each appended line in one file has its
    counterpart on the matching line of the other.
    """
    sentence_pairs = zip(indigenous_sentences, chinese_sentences)
    with open(indigenous_path, 'a', encoding='utf-8') as ind_out, \
            open(chinese_path, 'a', encoding='utf-8') as zh_out:
        for ami_sentence, zh_sentence in sentence_pairs:
            ind_out.write(ami_sentence + '\n')
            zh_out.write(zh_sentence + '\n')

# Function to process each word and maintain parallel structure
def scrape_for_word(word):
    """Collect the example sentence pairs Glosbe lists for ``word`` and save them.

    Fetches ``base_url + word``, then follows the page's "load more" fragment
    links until none is left. Each (Amis, Chinese) pair is kept once per call
    and the result is passed to ``save_sentences``.

    Args:
        word: Query word appended to ``base_url``.

    Returns:
        None. Network or HTTP failures are printed; whatever was collected
        before the failure is still saved.
    """
    # Track sentences and ensure uniqueness within this word's results
    seen_pairs = set()
    indigenous_sentences = []
    chinese_sentences = []

    def fetch(url):
        # Return the parsed page, or None when the request fails.
        try:
            # A timeout keeps one stalled request from hanging the whole run.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Failed to fetch {url}: {exc}")
            return None
        return BeautifulSoup(response.text, 'html.parser')

    # Internal function to process a page
    def process_page(soup):
        for div in soup.find_all('div', class_='flex'):
            ami_text = div.find('div', attrs={'lang': 'ami'})
            zh_text = div.find('div', attrs={'lang': 'zh'})
            if ami_text and zh_text:
                pair = (ami_text.text.strip(), zh_text.text.strip())
                if pair not in seen_pairs:
                    seen_pairs.add(pair)
                    indigenous_sentences.append(pair[0])
                    chinese_sentences.append(pair[1])

    # Manage "Load More" functionality
    visited_urls = set()
    soup = fetch(base_url + word)
    while soup is not None:
        process_page(soup)
        load_more = soup.find('button', attrs={'data-element': 'fragment-loader'})
        fragment_url = load_more.get('data-fragment-url') if load_more else None
        if not fragment_url:
            break
        more_url = 'https://glosbe.com' + fragment_url
        # Stop if the site hands back a link already followed; otherwise the
        # loop would never terminate.
        if more_url in visited_urls:
            break
        visited_urls.add(more_url)
        soup = fetch(more_url)

    # Save the sentences ensuring parallel structure
    save_sentences(indigenous_sentences, chinese_sentences)

# Query Glosbe once per word. top_words is a set, so the query order (and
# therefore the order of lines appended to the output files) is not fixed.
for query_word in top_words:
    scrape_for_word(query_word)

print("Scraping complete. Files saved.")


## 3. Scrape the Amis Virginia Fey

This part was fairly simple, as it was already in a csv

In [None]:
# pandas was used here without being imported anywhere in the notebook
import pandas as pd

# Path to your CSV file
csv_file_path = '/content/drive/MyDrive/formosan_mt_project/translations/virginia_fey.csv'

# Paths where the text files will be saved
indigenous_file_path = '/content/drive/MyDrive/formosan_mt_project/translations/virginia-fey-indigenous.txt'
english_file_path = '/content/drive/MyDrive/formosan_mt_project/translations/virginia-fey-english.txt'

# Read the CSV file
df = pd.read_csv(csv_file_path)

# The CSV must have an 'Amis' column (indigenous sentences) and an 'English'
# column; adjust the names below if your file differs.
# Rows missing either side are dropped as a pair: a NaN cell would raise a
# TypeError on `sentence + '\n'`, and dropping only one side would shift the
# two files out of alignment.
complete_rows = df.dropna(subset=['Amis', 'English'])
indigenous_sentences = complete_rows['Amis'].astype(str)
english_sentences = complete_rows['English'].astype(str)

# Write both files in lockstep so line i of one always matches line i of the other
with open(indigenous_file_path, 'w', encoding='utf-8') as indigenous_file, \
        open(english_file_path, 'w', encoding='utf-8') as english_file:
    for indigenous_sentence, english_sentence in zip(indigenous_sentences, english_sentences):
        indigenous_file.write(indigenous_sentence + '\n')
        english_file.write(english_sentence + '\n')

print("Files have been saved successfully.")

## 4. Scrape the Amis & Paiwan ePark

These files were already in XML, so we took them out of XML format

In [None]:
import os
import xml.etree.ElementTree as ET
import random

def generate_unique_id(used_ids):
    """Pick a random unused three-digit id, record it in ``used_ids`` and return it.

    Args:
        used_ids: Mutable set of ids already handed out; updated in place.

    Returns:
        An int between 100 and 999 (inclusive) that was not in ``used_ids``.

    Raises:
        RuntimeError: If all 900 ids are already used. A retry-until-unused
            loop would never terminate in that case.
    """
    available = [candidate for candidate in range(100, 1000) if candidate not in used_ids]
    if not available:
        raise RuntimeError("No unused IDs left in the range 100-999")
    unique_id = random.choice(available)
    used_ids.add(unique_id)
    return unique_id

def parse_and_save(xml_file_path, save_directory, used_ids):
    """Extract sentence/translation pairs from one XML file into two parallel text files.

    For every ``S`` element, the text of its ``FORM`` child and of its
    descendant ``TRANSL`` element with ``xml:lang="en"`` are written to
    ``<id>-<xml basename>-paiwan.txt`` and ``<id>-<xml basename>-english.en``
    in ``save_directory``, one entry per line. Entries with a missing or empty
    side are skipped in both files and reported with ``print``.

    Args:
        xml_file_path: Path of the XML file to convert.
        save_directory: Output folder; created if it does not exist.
        used_ids: Set of ids already used for file names, passed on to
            ``generate_unique_id``.
    """
    unique_id = generate_unique_id(used_ids)
    root_name = os.path.splitext(os.path.basename(xml_file_path))[0]
    amis_file_path = os.path.join(save_directory, f"{unique_id}-{root_name}-paiwan.txt")
    english_file_path = os.path.join(save_directory, f"{unique_id}-{root_name}-english.en")

    root = ET.parse(xml_file_path).getroot()
    namespaces = {'xml': 'http://www.w3.org/XML/1998/namespace'}

    os.makedirs(save_directory, exist_ok=True)

    with open(amis_file_path, 'w', encoding='utf-8') as amis_file, \
            open(english_file_path, 'w', encoding='utf-8') as english_file:
        for s in root.findall('.//S'):
            form = s.find('FORM')
            english = s.find('.//TRANSL[@xml:lang="en"]', namespaces=namespaces)
            # .get() so an <S> without an id attribute cannot crash the report
            sentence_id = s.get('id')

            if form is None or english is None or not form.text or not english.text:
                print(f"Skipped due to missing translation at S ID {sentence_id}")
                continue

            amis_text = form.text.strip()
            english_text = english.text.strip()
            if not (amis_text and english_text):
                print(f"Skipped empty text entry at S ID {sentence_id}")
                continue

            # Write only if content exists on both sides
            amis_file.write(amis_text + "\n")
            english_file.write(english_text + "\n")

    # Cross-check for line correspondence. Only the line counts are compared:
    # a sentence and its translation are never expected to be identical text,
    # so comparing line contents would flag every file. Counts can still
    # differ when an entry contains an embedded line break.
    with open(amis_file_path, encoding='utf-8') as amis_file, \
            open(english_file_path, encoding='utf-8') as english_file:
        amis_count = sum(1 for _ in amis_file)
        english_count = sum(1 for _ in english_file)

    if amis_count != english_count:
        print(f"Error: Line mismatch in {amis_file_path} and {english_file_path}")

    print(f"Written files: {amis_file_path} and {english_file_path}")


def process_directory(directory_path, save_directory):
    """Run ``parse_and_save`` on every ``.xml`` file directly inside ``directory_path``.

    Args:
        directory_path: Folder to scan (not recursive).
        save_directory: Output folder handed to ``parse_and_save``.

    Uses the module-level ``used_ids`` set, so it must be defined before this
    function is called.
    """
    xml_names = [name for name in os.listdir(directory_path) if name.endswith('.xml')]
    for xml_name in xml_names:
        parse_and_save(os.path.join(directory_path, xml_name), save_directory, used_ids)

# Example directory structure and paths
# Source folders that contain the .xml files to convert
directories = [
    '/content/drive/MyDrive/formosan_mt_project/xml/paiwan_videos/',
]
# Destination folder for the generated parallel text files
save_directory = '/content/drive/MyDrive/formosan_mt_project/translations/paiwan/paiwan_videos/'
# Ids already used in output file names; process_directory reads this
# module-level set, so it is shared across all source folders
used_ids = set()

# Convert every source folder in turn
for directory in directories:
    process_directory(directory, save_directory)

print("Processing complete.")


**THIS CODE CAN BE REUSED FOR ANY XML, JUST CHANGE THE LANGUAGE TAG**

## 5. Scrape the 2022 Study

This study was already aligned but the dictionary was not parallel yet

#### Split the dictionary portion, which was individual words

In [None]:
import os

# Amis -> Chinese dictionary file to split, and the folder for the two outputs.
# NOTE(review): these paths start with '/content/gdrive', while the
# drive.mount() calls visible in this notebook use '/content/drive' - confirm
# the mount point before running.
current_dict = '/content/gdrive/MyDrive/formosan_mt_project/amis_dictionary/ami-cmn.txt'
save_location = '/content/gdrive/MyDrive/formosan_mt_project/translations/amis/dictionary'

# Create the output folder on first use
if not os.path.exists(save_location):
    os.makedirs(save_location)

# Output files: one Amis word per line, and its Chinese translation on the
# same line number of the other file
amis_file_path = os.path.join(save_location, 'amis-dict.txt')
chinese_file_path = os.path.join(save_location, 'chinese-dict.txt')

with open(current_dict, 'r', encoding='utf-8') as file, \
     open(amis_file_path, 'w', encoding='utf-8') as amis_file, \
     open(chinese_file_path, 'w', encoding='utf-8') as chinese_file:

    for line in file:
        # The first whitespace-separated token is taken as the Amis word and
        # the rest of the line as the Chinese translation.
        parts = line.strip().split(maxsplit=1)
        if len(parts) != 2:
            # Lines without both a word and a translation are left out of both files
            continue
        amis_word, chinese_translation = parts
        amis_file.write(amis_word + '\n')
        chinese_file.write(chinese_translation + '\n')

print("Splitting complete! Files saved at:", save_location)


## 6. Translate using DeepL

In this script we translate by calling the DeepL API. The script can be modified to work with pretty much any data source.

In [None]:
!pip install deepl

In [None]:
# Translate with DeepL
import os
import deepl

# Replace with the path to your directory containing the files to be translated
directory_path = '/content/gdrive/MyDrive/formosan_mt_project/translations/amis/dictionary/'

# The DeepL key is read from the environment so that no secret is stored in
# the notebook. Set it before running this cell, for example:
#   os.environ["DEEPL_AUTH_KEY"] = "<your key>"
# SECURITY: an API key was previously hard-coded on this line. Treat that key
# as leaked and revoke/rotate it in the DeepL account.
auth_key = os.environ.get("DEEPL_AUTH_KEY")
if not auth_key:
    raise RuntimeError("Set the DEEPL_AUTH_KEY environment variable to your DeepL API key.")
translator = deepl.Translator(auth_key)

def translate_file(file_path):
    """Translate a Chinese text file to English with DeepL, line for line.

    The output is written to ``<prefix>-english.txt`` in ``directory_path``,
    where ``<prefix>`` is the part of the input file name before the first
    ``-``. If that file already exists the input is skipped.

    Output line i is the translation of input line i. Blank input lines are
    not sent to the API and are written back as blank lines, so the output
    keeps the same number of lines as the input.

    Args:
        file_path: Path to the UTF-8 Chinese text file.
    """
    base_name = os.path.basename(file_path)
    file_number = base_name.split('-')[0]  # text before the first '-' in the file name
    output_file_path = os.path.join(directory_path, f"{file_number}-english.txt")

    # Check if the English translation file already exists
    if os.path.exists(output_file_path):
        print(f"Skipping translation for {file_number}: English file already exists.")
        return

    with open(file_path, 'r', encoding='utf-8') as input_file:
        source_lines = [line.strip() for line in input_file]

    # Remember each non-empty line's position so that its translation lands on
    # the same line number. Dropping blank lines outright would shift every
    # later line relative to the parallel source files.
    pending = [(index, text) for index, text in enumerate(source_lines) if text]
    translated_lines = [''] * len(source_lines)

    # DeepL API supports up to 50 texts in one request
    batch_size = 50
    for start in range(0, len(pending), batch_size):
        batch = pending[start:start + batch_size]
        results = translator.translate_text(
            [text for _, text in batch], source_lang="ZH", target_lang="EN-US"
        )
        for (index, _), result in zip(batch, results):
            translated_lines[index] = result.text

    # End every line with '\n' (like the other writers in this notebook) so
    # trailing blank lines survive and line counts match the input exactly.
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.writelines(line + '\n' for line in translated_lines)

# Translate every Chinese file found directly inside directory_path.
# NOTE(review): only names ending in "-chinese.zh" are picked up, while other
# cells in this notebook write "-chinese.txt" and "chinese-dict.txt" - confirm
# the suffix matches the files you intend to translate.
for entry in os.listdir(directory_path):
    if not entry.endswith("-chinese.zh"):
        continue
    print(f"Processing {entry}...")
    translate_file(os.path.join(directory_path, entry))
    print(f"Finished processing {entry}.")
