# Dataset Generation

In [71]:
import csv
import random

import pandas as pd
import requests
from bs4 import BeautifulSoup


Set the random seed, then scrape a website of example English sentences that contain the word "mountains".

In [72]:
# Seed the stdlib RNG so the random mountain-name substitution performed
# further down is reproducible.
random.seed(42)

url = "https://gikken.co/mate-translate/sentences/english/mountains"

# A timeout keeps the notebook from hanging indefinitely if the site
# never answers.
response = requests.get(url, timeout=30)

# Fail loudly on a bad response: merely printing a message would leave
# `sentences` undefined (or stale from an earlier run) for the cells below.
if response.status_code != 200:
    raise RuntimeError(
        f"Request to {url} failed with HTTP status {response.status_code}"
    )

soup = BeautifulSoup(response.content, "html.parser")

# `find` returns None when the page layout changes; report that explicitly
# instead of crashing with an AttributeError on `.find_all`.
sentence_container = soup.find("div", class_="example-sentences")
if sentence_container is None:
    raise ValueError(
        f'No <div class="example-sentences"> element found at {url}'
    )

# Each <p> holds one example sentence followed by a translation-link label,
# which is stripped out here.
sentences = [
    p.get_text(strip=True).replace("Translate from English to English", "").strip()
    for p in sentence_container.find_all("p")
]

print(sentences)

['I was in the mountains.', 'Mountains in the ocean and even whole ranges are still being discovered, and most of them have not yet been explored in detail.', 'I would rather go to the mountains than to the beach.', 'Which do you like better, the sea or the mountains?', 'Some people like the sea; others prefer the mountains.', 'We watched the sun setting behind the mountains.', 'The mountains are a lush green in summer.', 'I was roaming over the mountains all through the night.', 'The mountains in the Himalayas are higher than those in the Andes.', 'It was so nice a day that we went for a hike in the mountains.', 'The girl is getting over mountains of difficulties.', 'The path twists through the mountains.', 'The mountains look nicer from a distance.', 'The two mountains are of equal height.', 'The mountains in this part of the country are full of variety.', 'This river rises in the mountains in Nagano.', "It's very pleasant here in the mountains.", 'This dog is trained to save people 

Then we create a list of mountain names and a function that replaces the word "mountains" in each sentence with a randomly chosen mountain name.

In [73]:
# Names substituted for the word "mountains" in the scraped sentences and
# later looked up again when the BIO annotations are generated.
# Every entry must be a separate string: a missing comma fuses adjacent
# names into a single bogus "mountain".
mountain_list = [
    "Mount Everest",
    "K2",
    "Mount Kilimanjaro",
    "Denali",
    "Mont Blanc",
    "Mount Whitney",
    "Mount Rainier",
    "Mount Elbert",
    "Mount Logan",
    "Mount Saint Elias",
    "Matterhorn",
    "Ben Nevis",
    "Mount Elbrus",
    "Grossglockner",
    "Annapurna",
    "Kangchenjunga",
    "Nanga Parbat",
    "Mount Fuji",
    "Aconcagua",
    "Huascarán",
    "Chimborazo",
    "Mount Kenya",
    "Simien Mountains",
    "Drakensberg",
]

def replace_mountains(sentences, mountain_names):
    """Return copies of ``sentences`` with "Mountains"/"mountains" swapped for names.

    Two names are drawn with ``random.choice`` for every sentence, whether or
    not the sentence contains either spelling: the first draw replaces every
    capitalised "Mountains", the second replaces every lowercase "mountains".

    Args:
        sentences: Iterable of sentence strings.
        mountain_names: Non-empty sequence of names to draw from.

    Returns:
        A new list with one rewritten sentence per input sentence, in order.
    """

    def _substitute(text):
        # Draw order matters for reproducibility under a fixed seed:
        # capitalised pick first, lowercase pick second.
        capitalised_pick = random.choice(mountain_names)
        lowercase_pick = random.choice(mountain_names)
        with_capitalised = text.replace("Mountains", capitalised_pick)
        return with_capitalised.replace("mountains", lowercase_pick)

    return [_substitute(text) for text in sentences]

# Swap every "mountains" placeholder in the scraped sentences for a
# randomly chosen name from `mountain_list`.
replaces_sentences = replace_mountains(sentences, mountain_list)

# Write a one-column CSV: a "Sentence" header row followed by one row per
# generated sentence.
csv_file_path = "data/dataset.csv"
with open(csv_file_path, mode="w", newline="", encoding="utf-8") as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerows([["Sentence"]] + [[text] for text in replaces_sentences])

Save the dataset as a CSV file, then load it back with pandas to check the result.

In [74]:
# Read the generated sentences back from disk and preview the first rows.
df = pd.read_csv(filepath_or_buffer="data/dataset.csv")
df.head(n=5)

Unnamed: 0,Sentence
0,I was in the Denali.
1,Mount Everest in the ocean and even whole rang...
2,I would rather go to the Mount Elbert than to ...
3,"Which do you like better, the sea or the Denali?"
4,Some people like the sea; others prefer the Mo...


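Then we create annotations in BIO format: `B-MNTN` marks the first word of a mountain name, `I-MNTN` marks each following word of the same name, and `O` marks every other word.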

In [75]:
def create_annotations(sentence, mountain_names=None):
    """Build a BIO tag string for the mountain names found in ``sentence``.

    The sentence is tokenised on whitespace, so punctuation stays attached to
    its word (``"Denali."`` is one token). Every occurrence of every name is
    tagged: the token holding the start of the name gets ``B-MNTN``, the
    remaining tokens the name covers get ``I-MNTN``, all other tokens ``O``.
    A name embedded in a longer alphanumeric word (``"K2"`` inside ``"AK2"``)
    is not treated as a match.

    Args:
        sentence: The sentence to annotate.
        mountain_names: Optional iterable of names to look for. When omitted,
            the module-level ``mountain_list`` is used.

    Returns:
        The tags joined by single spaces, one tag per whitespace-separated
        token; an empty string for an empty sentence.
    """
    if mountain_names is None:
        mountain_names = mountain_list

    words = sentence.split()
    annotations = ["O"] * len(words)

    # Record the character span of every token so that a character-level
    # match can be mapped onto the tokens it overlaps. Counting the words in
    # the prefix instead is off by one whenever the match does not start
    # exactly at a token boundary, e.g. "(Denali)".
    token_spans = []
    cursor = 0
    for word in words:
        begin = sentence.index(word, cursor)
        cursor = begin + len(word)
        token_spans.append((begin, cursor))

    for mountain_name in mountain_names:
        if not mountain_name:
            continue  # an empty name would match everywhere and tag nothing

        search_from = 0
        while True:
            start = sentence.find(mountain_name, search_from)
            if start == -1:
                break
            end = start + len(mountain_name)

            # Ignore matches glued to other letters or digits on either side.
            glued_before = start > 0 and sentence[start - 1].isalnum()
            glued_after = end < len(sentence) and sentence[end].isalnum()
            if glued_before or glued_after:
                search_from = start + 1
                continue

            # Tag every token that overlaps the matched character span.
            is_first_token = True
            for index, (token_begin, token_end) in enumerate(token_spans):
                if token_begin < end and token_end > start:
                    annotations[index] = "B-MNTN" if is_first_token else "I-MNTN"
                    is_first_token = False

            # Continue after this match so repeated names are tagged as well.
            search_from = end

    return " ".join(annotations)  # Return annotations as a space-separated string


# Compute the BIO tag string for every sentence and store it alongside the
# sentence in a new "Annotation" column.
df["Annotation"] = df["Sentence"].apply(func=create_annotations)

# Preview the first rows with their annotations.
df.head(n=5)

Unnamed: 0,Sentence,Annotation
0,I was in the Denali.,O O O O B-MNTN
1,Mount Everest in the ocean and even whole rang...,B-MNTN I-MNTN O O O O O O O O O O O O O O O O ...
2,I would rather go to the Mount Elbert than to ...,O O O O O O B-MNTN I-MNTN O O O O
3,"Which do you like better, the sea or the Denali?",O O O O O O O O O B-MNTN
4,Some people like the sea; others prefer the Mo...,O O O O O O O O B-MNTN I-MNTN


In [76]:
# Persist sentences together with their BIO annotations; the DataFrame index
# is left out of the file.
output_csv_name = "data/annotated_dataset.csv"
df.to_csv(path_or_buf=output_csv_name, index=False)