Scraping data from https://wiki.ivao.aero/en/home/training/documentation/Complete_phraseology with LangChain

In [47]:
from langchain.document_loaders import AsyncHtmlLoader

# URLs handed to the loader; currently only the IVAO phraseology page.
list_url = [
    'https://wiki.ivao.aero/en/home/training/documentation/Complete_phraseology']
loader = AsyncHtmlLoader(list_url)
# `docs` is consumed by the Html2TextTransformer cell further down.
docs = loader.load()

Fetching pages:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.81it/s]


In [56]:
def str_to_txt(str, filename, append=True):
    """Write the text ``str`` to the file ``filename`` using UTF-8.

    Parameters
    ----------
    str : str
        Text to write. (The name shadows the builtin ``str`` inside this
        function; it is kept because callers may pass it by keyword.)
    filename : str
        Path of the file to write to.
    append : bool, default True
        When true the file is opened in ``'a'`` mode, so the text is added
        after any existing content; otherwise it is opened in ``'w'`` mode
        and existing content is replaced.
    """
    open_mode = 'a' if append else 'w'
    with open(filename, open_mode, encoding="utf-8") as out:
        out.write(str)

In [48]:
# Show what the loader returned (rendered as this cell's output).
docs



In [49]:
from langchain.document_transformers import Html2TextTransformer

# Run the loaded documents through Html2TextTransformer; the result is
# inspected and written to disk by the following cells.
html2text = Html2TextTransformer()
docs_transformed = html2text.transform_documents(docs)


In [50]:
# Show the transformed documents (rendered as this cell's output).
docs_transformed



In [53]:
# Result: page_content of the first (and, with a single URL, only) transformed document
docs_transformed[0].page_content



In [57]:
# Save the first transformed document's content to a text file.
# NOTE: str_to_txt defaults to append=True, so re-running this cell adds
# another copy of the content to the end of the file.
str_to_txt(docs_transformed[0].page_content, 'ivao_phraseology_unstructured.txt')

Scraping data from https://contentzone.eurocontrol.int/phraseology/ with Selenium WebDriver

In [46]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import time


def parse_data(html_content):
    """Extract the phrase fields from the HTML of a phrase detail page.

    Parameters
    ----------
    html_content : str
        HTML markup to parse (the scraping loop passes ``driver.page_source``).

    Returns
    -------
    dict
        A dict with the keys, in this order:

        * ``'syntax'``, ``'when_used'``, ``'category'``, ``'who_says'`` --
          stripped text of the first ``div`` with class ``phraseSyntax``,
          ``phraseWhenUsed``, ``phraseCategoryData`` and ``phraseWhoSays``
          respectively, or the string ``'Not found'`` when no such ``div``
          exists.
        * ``'examples'`` -- list of the stripped ``li`` texts inside
          ``ul#phraseExamplesList`` within the ``phraseExamples`` div; an
          empty list when that ``ul`` is missing; the empty string ``''``
          when the ``phraseExamples`` div itself is missing.
    """
    # Imported locally because no bs4 import is visible alongside the other
    # imports of this cell; without it the function raises NameError on a
    # fresh kernel. Harmless if BeautifulSoup is also imported elsewhere.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html_content, 'html.parser')

    def _div_text(css_class):
        # Stripped text of the first div with this class, or 'Not found'.
        node = soup.find('div', class_=css_class)
        return node.text.strip() if node else 'Not found'

    # Key order is kept as-is because callers write str(data) to disk.
    data = {
        'syntax': _div_text('phraseSyntax'),
        'when_used': _div_text('phraseWhenUsed'),
        'category': _div_text('phraseCategoryData'),
        'examples': '',
        'who_says': _div_text('phraseWhoSays'),
    }

    # Examples are a list of <li> items rather than a single text blob.
    examples_div = soup.find('div', class_='phraseExamples')
    if examples_div:
        examples_list = examples_div.find('ul', id='phraseExamplesList')
        if examples_list:
            data['examples'] = [
                li.text.strip() for li in examples_list.find_all('li')]
        else:
            data['examples'] = []

    return data

# URL to open
url = "https://contentzone.eurocontrol.int/phraseology/"
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Everything that uses the browser runs inside try/finally so the Chrome
# process is shut down even when a click or a page parse raises.
try:
    # Open the URL
    driver.get(url)

    # Allow time for the page to load
    time.sleep(4)

    # Index of the first "pLList" entry to scrape; earlier entries are skipped.
    # NOTE(review): 256 looks like a resume point from an earlier partial
    # run -- confirm, and set to 0 to scrape the whole list.
    i = 256

    # Find all the list elements from index i onwards
    list_elements = driver.find_elements(By.CLASS_NAME, "pLList")[i:]

    # Process one entry per iteration until no entries remain past index i
    while list_elements:
        # Progress output: number of remaining entries and the current phrase
        print(len(list_elements))
        print(list_elements[0].text)

        # Click on the first remaining list element
        list_elements[0].click()

        # Wait for the new content to load
        time.sleep(2)

        # Scrape information from the clicked element
        scraped_data = parse_data(driver.page_source)
        print(scraped_data)

        # Append the record as one line; the with-block closes the file even
        # if the write fails
        with open("scraped_data.txt", "a", encoding="utf-8") as output_file:
            output_file.write(str(scraped_data) + "\n")

        # Navigate back to the original page
        driver.back()

        # Wait for the original page to load
        time.sleep(2)

        i += 1

        # Look the list elements up again after navigating back, skipping
        # the ones that have already been processed
        list_elements = driver.find_elements(By.CLASS_NAME, "pLList")[i:]
finally:
    # Close the browser
    driver.quit()

254
LANDING SURFACE (condition).
{'syntax': 'Syntax\nLANDING SURFACE (condition).', 'when_used': 'When Used\nTo pass information on aerodrome conditions.', 'category': 'Aerodrome and Vicinity', 'examples': ['Landing surface dry.'], 'who_says': 'Who Says\nThe Controller'}
253
LEAVE (significant point) HEADING (three digits).
{'syntax': 'Syntax\nLEAVE (significant point) HEADING (three digits).', 'when_used': 'When Used\nVectoringApplicable when an ATS surveillance system is used in the provision of ATSNote.— When it is necessary to specify a reason for vectoring or for the above manoeuvres, the following phraseologies should be used:a) DUE TRAFFIC.b) FOR SPACING.c) FOR DELAY.d) FOR DOWNWIND (or BASE, or FINAL).', 'category': 'General ATS Surveillance Service Phraseologies', 'examples': ['LGL123 leave BELLO heading zero two zero.'], 'who_says': 'Who Says\nThe Controller'}
252
LEAVE CONTROLLED AIRSPACE (or CONTROL ZONE) [VIA (significant point or route)] AT (level) (or CLIMBING, or DESCEN

Gathering a dataset from the Hugging Face Datasets library

In [1]:
from datasets import load_dataset


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the "jlvdoorn/atcosim" dataset; the cell below shows it holds a
# 'train' and a 'validation' split, each with 'audio' and 'text' features.
dataset = load_dataset("jlvdoorn/atcosim")

Downloading readme: 100%|██████████| 698/698 [00:00<?, ?B/s] 
Downloading data: 100%|██████████| 495M/495M [02:20<00:00, 3.52MB/s] 
Downloading data: 100%|██████████| 492M/492M [02:15<00:00, 3.63MB/s] 
Downloading data: 100%|██████████| 432M/432M [01:59<00:00, 3.62MB/s] 
Downloading data: 100%|██████████| 501M/501M [02:24<00:00, 3.47MB/s] 
Downloading data: 100%|██████████| 479M/479M [02:21<00:00, 3.38MB/s] 
Generating train split: 100%|██████████| 7646/7646 [00:10<00:00, 707.47 examples/s]
Generating validation split: 100%|██████████| 1913/1913 [00:01<00:00, 998.43 examples/s] 


In [3]:
# Show the splits, features and row counts of the loaded dataset.
dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'text'],
        num_rows: 7646
    })
    validation: Dataset({
        features: ['audio', 'text'],
        num_rows: 1913
    })
})

In [18]:
# Keep only the 'text' column of each split; the 'audio' column is not used below.
dataset_train = dataset['train']['text']
dataset_val = dataset['validation']['text']

In [20]:
# Preview the first five training text entries.
dataset_train[0:5]

[' contact geneva one two eight decimal one five good bye ',
 'swissair six six zero romeo contact marseille one two five decimal eight five good bye ',
 'bonjour luxair five five one identified cleared passeiry torino flight level three three zero ',
 'lufthansa four seven two three resume own navigation to willisau ',
 'alitalia two zero one set course direct to torino ']

In [21]:
# Output locations for the text dumps.
# NOTE(review): these are absolute paths on one specific machine; consider a
# configurable data directory so the notebook runs elsewhere.
atcosim_train_txt_path = 'C:\\Users\\justa\\OneDrive\\Desktop\\Developer\\Thesis_ATM_with_LLM\\data\\unstructured\\atcosim_train.txt'
atcosim_val_txt_path = 'C:\\Users\\justa\\OneDrive\\Desktop\\Developer\\Thesis_ATM_with_LLM\\data\\unstructured\\atcosim_val.txt'

# Write each split to its own text file, one entry per line.
# The encoding is set explicitly so the result does not depend on the
# platform's default locale encoding and matches the UTF-8 used by str_to_txt.
with open(atcosim_train_txt_path, 'w', encoding='utf-8') as train_file, open(atcosim_val_txt_path, 'w', encoding='utf-8') as val_file:
    for line in dataset_train:
        train_file.write(line + '\n')
    for line in dataset_val:
        val_file.write(line + '\n')