In [2]:
import re
import random
import os
import pandas as pd

In [3]:
def remove_substring_inside_parentheses(s):
    # Use regular expression to find and remove substring inside parentheses
    s = re.sub(r'\s*\([^)]*\)\s*', '', s)
    # Remove any preceding or trailing whitespaces
    s = s.strip()
    return s

In [4]:
def get_dictionary(folder):
    articles = {}

    for subdir, dirs, files in os.walk(folder):
        for filename in files:
            file_path = os.path.join(subdir, filename)
            if "wiki" in file_path:
                with open(file_path, "r") as f:
                    file_contents = f.read()

                doc_pattern = re.compile(r'<doc id="\d+" url=".*?" title="(.*?)">(.*?)</doc>', re.DOTALL)
                doc_matches = doc_pattern.findall(file_contents)

                for match in doc_matches:
                    title = remove_substring_inside_parentheses(match[0])
                    text = match[1].strip()
                    articles[title] = text
    return articles

In [74]:
szl_articles = get_dictionary("text-szl")
pl_articles = get_dictionary("text-pl")

In [5]:
def get_shared_titles():
    # Read the txt file into a pandas dataframe with two columns
    df = pd.read_csv('titles.txt', sep=' \|\|\| ', engine='python', header=None, names=['szl', 'pl'])
    return df

In [77]:
# shared_titles = get_shared_titles()
# szl_shared_titles = set(shared_titles['szl'].to_list())
# szl_all_titles = set(szl_articles.keys())
# pl_all_titles = set(pl_articles.keys())
# szl_only_titles = list(szl_all_titles - szl_shared_titles)
# pl_only_titles = list(szl_shared_titles - szl_all_titles)
# szl_truly_shared_titles = szl_shared_titles.intersection(szl_all_titles)

In [6]:
def merge_article_dicts():
    article_szl = []
    article_pl = []

    # Loop through each row of shared_titles
    for index, row in shared_titles.iterrows():
        # Get the pl value from shared_titles
        pl_value = row["pl"]
        # Get the corresponding value from articles_pl and append it to article_pl list
        article_pl.append(pl_articles.get(pl_value, "") if pl_value in pl_articles.keys() else float('nan'))

        # Get the szl value from shared_titles
        szl_value = row["szl"]
        # Get the corresponding value from articles_szl and append it to article_szl list
        article_szl.append(szl_articles.get(szl_value, "") if szl_value in szl_articles.keys() else float('nan'))

    # Add article_szl and article_pl as new columns to shared_titles DataFrame
    shared_titles["article_szl"] = article_szl
    shared_titles["article_pl"] = article_pl
    shared_titles.dropna(inplace=True)

In [88]:
merge_article_dicts()
print(len(shared_titles))
print(shared_titles.sample(n=20))
shared_titles.to_csv('pl-szl_articles.tsv', sep='\t', index=False)

13777
                      szl                  pl  \
416                  1844                1844   
12499        Pleiskirchen        Pleiskirchen   
8898         Kůnstantynůw        Konstantynów   
15705          Sůmaliland          Somaliland   
7662             Jaślikůw            Jaślików   
15943           The Corrs           The Corrs   
2996      Charlottesville     Charlottesville   
9645              Lübesse             Lübesse   
7908              Kakulin             Kakulin   
4969            Gaziantep           Gaziantep   
18772            Śwjyntno             Świętno   
15608         Szwajcaryjo          Szwajcaria   
4766   Franciszek Pieczka  Franciszek Pieczka   
2260           Boisseuilh          Boisseuilh   
1154              Allauch             Allauch   
6479                 Gowa               Głowa   
18896                Żydy               Żydzi   
6474              Gotůwka             Gotówka   
1348   Antoni Piechniczek  Antoni Piechniczek   
14785         

In [11]:
# random_sample = random.sample(szl_unshared_titles, 50)
# print(len(szl_truly_shared_titles))
# print(random_sample)
shared_titles = pd.read_csv('pl-szl_articles.tsv', sep='\t')

In [12]:
shared_titles

Unnamed: 0,szl,pl,article_szl,article_pl
0,Silesia,Silesia,(257) Silesia\n\n(257) Silesia – planetoida z ...,Silesia (czasopismo)\n\n„Silesia” – niemieckoj...
1,.ac,.ac,".ac\n\n.ac je to internetowŏ dōmyna, kerŏ je z...",.ac\n\n.ac – krajowa domena internetowa najwyż...
2,.ad,.ad,".ad\n\n.ad je to internetowŏ dōmyna, kerŏ je z...",.ad\n\n.ad – krajowa domena internetowa najwyż...
3,.ae,.ae,".ae\n\n.ae je to internetowŏ dōmyna, kerŏ je z...",.ae\n\n.ae – krajowa domena internetowa najwyż...
4,.af,.af,".af\n\n.af je to internetowŏ dōmyna, kerŏ je z...",.af\n\n.af – krajowa domena internetowa najwyż...
...,...,...,...,...
13772,Ю,Ю,Ю\n\nЮ ю — trzidźestodrugo i przeduostatńo buc...,"Ю\n\nЮ, Юю – podstawowa litera cyrylicy używan..."
13773,Я,Я,Я\n\nЯ я — trzidźestotrzećo i uostatńo buchszt...,"Я\n\nЯ я – ostatnia litera cyrylicy, 33. liter..."
13774,Խ,Խ,"Խ\n\nԽ, խ (che) – trzinŏstŏ buchsztaba alfabyt...","Խ\n\nԽ, խ (che) – trzynasta litera alfabetu or..."
13775,Ẋ,Ẋ,Ẋ\n\nẊẋ – buchsztaba łaćińskiij wersyje czeczy...,Ẋ\n\nẊ ẋ – litera łacińskiej wersji alfabetu c...


In [57]:
char_count = 400
count = sum(1 for value in articles.values() if len(value) > char_count)
print(f"Articles in szl wiki with more than {char_count} chars: {count}")  # Output: 2

Articles in szl wiki with more than 400 chars: 1878


In [7]:
# Get right input format
import nltk
nltk.download('punkt')

def format_input(input_filename, output_filename):
    sentences = []
    # read in the input file and remove empty lines
    with open(input_filename, 'r') as f:
        lines = [line.strip() for line in f if line.strip()]

    for line in lines:
        # split the text into sentences
        sentences.extend(nltk.sent_tokenize(line))

    # write each sentence to the output file on a separate line
    with open(output_filename, 'w') as f:
        for sentence in sentences:
            tokens = nltk.word_tokenize(sentence)
            f.write(' '.join(tokens) + '\n')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bartekjezierski/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
# import nltk

# def format_input(input_filename, output_filename):
#     sentences = []
#     # read in the input file and remove empty lines
#     with open(input_filename, 'r') as f:
#         lines = [line.strip() for line in f if line.strip()]

#     # abbreviations to ignore full stops after
#     ignore_abbrev = ["rodz.", "um.", "ur.", "zm."]

#     for line in lines:
#         # split the text into sentences
#         new_sentences = nltk.sent_tokenize(line)
#         for i in range(len(new_sentences) - 1):
#             current_sentence = new_sentences[i]
#             next_sentence = new_sentences[i+1]

#             # check if the current sentence ends with an abbreviation to ignore
#             if current_sentence.strip().endswith(tuple(ignore_abbrev)):
#                 sentences.append(current_sentence + ' ' + next_sentence)
#                 i += 1
#             else:
#                 sentences.append(current_sentence)

#         # add the last sentence of the line
#         sentences.append(new_sentences[-1])

#     # write each sentence to the output file on a separate line
#     with open(output_filename, 'w') as f:
#         for sentence in sentences:
#             tokens = nltk.word_tokenize(sentence)
#             f.write(' '.join(tokens) + '\n')


In [14]:
format_input('source.pl', 'source_token.pl')
format_input('target.szl', 'target_token.szl')

In [10]:
import requests
from bs4 import BeautifulSoup

# specify the URL of the website
url = "https://www.silling.org/slownik/index.php?title=Kategoryj%C5%8F:polsko-%C5%9Bl%C5%8Dnski&pagefrom=a"
pl_words = []
while True:
#     print(url)
    # send a GET request to the website
    response = requests.get(url)

    # parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # find the list element with the class 'mw-category'
    list_element = soup.find('div', {'class': 'mw-category'})

    # find all list items that start with "Polsko-ślōnski:"
    items = list_element.find_all('li', text=lambda t: t.startswith('Polsko-ślōnski:'))

    # print the items
    last_item = ""
    
    for item in items:
        last_item = item.text.split(":")[-1]
        pl_words.append(last_item)
#     print(last_item)
    if items[0] == items[-1]:
        break
        
    url = "https://www.silling.org/slownik/index.php?title=Kategoryj%C5%8F:polsko-%C5%9Bl%C5%8Dnski&pagefrom="+last_item



  items = list_element.find_all('li', text=lambda t: t.startswith('Polsko-ślōnski:'))


AttributeError: 'NoneType' object has no attribute 'find_all'

In [18]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import csv

pl_sil_dict = {}

# for i, pl_word in enumerate(pl_words):
for i in tqdm(range(len(pl_words))):
    pl_word = pl_words[i]
#     print("word", pl_word)

#     if i % 100 == 0:
#     print(f"{i}/{len(pl_words)}")
    url = 'https://www.silling.org/slownik/Polsko-%C5%9Bl%C5%8Dnski:'+pl_word
#     print("url", url)

    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    if soup.find('div', {'class': 'mw-parser-output'}):
        list_items = soup.find('div', {'class': 'mw-parser-output'}).find_all('li')

        for item in list_items:
            pl_sil_dict[pl_word] = item.text
#             print(item.text)
with open("output.tsv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    for key, value in pl_sil_dict.items():
        writer.writerow([key, value])


100%|██████████████████████████████████████████████████████| 21186/21186 [1:17:16<00:00,  4.57it/s]


In [14]:
pl_sil_dict
# len(pl_words)

{'a': 'a',
 'abdukcja': 'abdukcyjŏ',
 'abdykacja': 'abdykacyjŏ',
 'aberracja': 'aberracyjŏ',
 'abiologia': 'abiologijŏ',
 'abiologicznie': 'abiologicznie',
 'abiologiczny': 'abiologiczny',
 'abiotycznie': 'abiotycznie',
 'abiotyczny': 'abiotyczny',
 'ablacja': 'ablacyjŏ'}

In [15]:
with open("output.tsv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    for key, value in pl_sil_dict.items():
        writer.writerow([key, value])


In [1]:
import pandas as pd

# read the file as dataframe
df = pd.read_csv('dictionary.tsv', delimiter='\t')

# reverse columns 0 and 1
df = df.iloc[:, ::-1]

# save the modified dataframe
df.to_csv('reversed_dictionary.tsv', sep='\t', index=False)


In [18]:
# Save as article per file 
parallel_articles = pd.read_csv('pl-szl_articles.tsv', sep='\t')

# Get right input format
import nltk
nltk.download('punkt')

def format_input(input_filename, output_filename):
    sentences = []
    # read in the input file and remove empty lines
    with open(input_filename, 'r') as f:
        lines = [line.strip() for line in f if line.strip()]

    for line in lines:
        # split the text into sentences
        sentences.extend(nltk.sent_tokenize(line))

    # write each sentence to the output file on a separate line
    with open(output_filename, 'w') as f:
        for sentence in sentences:
            tokens = nltk.word_tokenize(sentence)
            f.write(' '.join(tokens) + '\n')

import os

# create the parallel_articles directory if it doesn't exist
if not os.path.exists('parallel_articles'):
    os.mkdir('parallel_articles')

# loop over each row of the dataframe
for idx, row in parallel_articles.iterrows():
    # get the values of article_szl and article_pl
    szl = row['article_szl']
    pl = row['article_pl']
    
    # create the filename and path for the szl file
    szl_filename = f"{idx}.szl"
    szl_filepath = os.path.join('parallel_articles', szl_filename)
    
    # create the filename and path for the pl file
    pl_filename = f"{idx}.pl"
    pl_filepath = os.path.join('parallel_articles', pl_filename)
    
    # write the szl and pl values to their respective files
    with open(szl_filepath, 'w') as f:
        f.write(szl)
    with open(pl_filepath, 'w') as f:
        f.write(pl)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bartekjezierski/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [19]:
# tokenize all the files
import os

dir_path = 'parallel_articles'

for filename in os.listdir(dir_path):
    input_path = os.path.join(dir_path, filename)
    format_input(input_path, input_path)

In [20]:
# truncate pl files which are too long

import os

def truncate_pl_file(szl_file, pl_file):
    with open(szl_file, 'r', encoding='utf-8') as f:
        szl_lines = f.readlines()
    with open(pl_file, 'r', encoding='utf-8') as f:
        pl_lines = f.readlines()

    if len(szl_lines) * 5 < len(pl_lines):
        num_pl_lines = len(szl_lines) * 3
        with open(pl_file, 'w', encoding='utf-8') as f:
            f.writelines(pl_lines[:num_pl_lines])

for i in range(13777):
    szl_file = f'parallel_articles/{i}.szl'
    pl_file = f'parallel_articles/{i}.pl'
    if os.path.isfile(szl_file) and os.path.isfile(pl_file):
        truncate_pl_file(szl_file, pl_file)


In [22]:
# Create a list of tuples containing the required file names

out_dir_path = "alignments"
file_tuples = [(os.path.join(dir_path, f"{i}.pl"), os.path.join(dir_path, f"{i}.szl"), os.path.join(out_dir_path, f"{i}.align")) for i in range(51)]

# Open the batch_job file in write mode
with open("batch_job", "w") as f:
    # Write each file tuple to a separate line in the file
    for file_tuple in file_tuples:
        f.write("\t".join(file_tuple) + "\n")
