In [1]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install pandas

Defaulting to user installation because normal site-packages is not writeable


In [5]:
import json
import pandas as pd
import csv

def convert_json_to_alignment(input_file, output_file):
    """Write the sentence pairs of a JSON file as ``hut ||| ukr`` lines.

    Args:
        input_file: Path of a UTF-8 JSON file. Its top-level value is iterated
            and every item is indexed with the keys ``'hut'`` and ``'ukr'``.
        output_file: Path of the UTF-8 text file to (over)write. Exactly one
            line is written per item.

    Raises:
        KeyError: If an item has no ``'hut'`` or ``'ukr'`` key.
    """
    with open(input_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    def one_line(text):
        # A line break inside a text would spread one pair over several
        # output lines and shift every following line, so line breaks are
        # replaced by single spaces.
        return ' '.join(str(text).splitlines())

    # Build all lines before opening the output so that a malformed item
    # fails without truncating an existing output file.
    aligned_sentences = [
        f"{one_line(item['hut'])} ||| {one_line(item['ukr'])}\n"
        for item in data
    ]

    with open(output_file, 'w', encoding='utf-8') as file:
        file.writelines(aligned_sentences)

def convert_csv_to_alignment(input_file, output_file):
    """Write the sentence pairs of a CSV file as ``source ||| target`` lines.

    Args:
        input_file: Path of a UTF-8 CSV file with a header row containing the
            columns ``Sentence`` and ``Synthetic_Sentence``.
        output_file: Path of the UTF-8 text file to (over)write. Exactly one
            line is written per CSV record.

    Raises:
        KeyError: If the ``Sentence`` or ``Synthetic_Sentence`` column is
            missing.
    """
    def one_line(text):
        # Quoted CSV fields may contain line breaks; written verbatim they
        # would spread one pair over several output lines, so they are
        # replaced by single spaces.
        return ' '.join(str(text).splitlines())

    # newline='' hands line-ending handling to the csv module, which it needs
    # in order to parse quoted fields with embedded newlines correctly.
    with open(input_file, 'r', encoding='utf-8', newline='') as csv_file:
        aligned_sentences = [
            f"{one_line(row['Sentence'])} ||| {one_line(row['Synthetic_Sentence'])}\n"
            for row in csv.DictReader(csv_file)
        ]

    with open(output_file, 'w', encoding='utf-8') as file:
        file.writelines(aligned_sentences)

# Build the "hut ||| ukr" corpus file from the JSON sentence pairs.
convert_json_to_alignment(
    input_file='ivanchyk.json',
    output_file='corpus_ivanchyk.txt',
)

In [None]:
# Tally the links of the alignment file by index distance: a link goes to the
# "same position" bucket when its two indices differ by at most one, and to
# the "different positions" bucket otherwise.
same_position_count = 0
different_position_count = 0

with open('symmetrized.align', 'r') as align_file:
    for row in align_file:
        for link in row.split():
            src_pos, tgt_pos = (int(part) for part in link.split('-'))
            if abs(src_pos - tgt_pos) <= 1:
                same_position_count += 1
            else:
                different_position_count += 1

print(f"Words aligned in the same position: {same_position_count}")
print(f"Words aligned in different positions: {different_position_count}")

In [None]:
# Read the "source ||| target" corpus and keep, for every line, the character
# length of each whitespace-separated word on both sides.
sentence_pairs = []

with open('corpus_ivanchyk.txt', 'r', encoding='utf-8') as file:
    for line_number, line in enumerate(file, start=1):
        # Remove only the line terminator: a full strip() would also eat the
        # spaces around "|||" when one side is empty, making the separator
        # impossible to find. partition() splits on the first separator only,
        # so a later " ||| " inside the text does not break the unpacking.
        source, separator, target = line.rstrip('\r\n').partition(' ||| ')
        if not separator:
            raise ValueError(
                f"corpus_ivanchyk.txt, line {line_number}: missing ' ||| ' separator"
            )

        source_words = source.split()
        target_words = target.split()

        source_lengths = [len(word) for word in source_words]
        target_lengths = [len(word) for word in target_words]

        sentence_pairs.append((source_lengths, target_lengths))

In [22]:
# First entry: (source word lengths, target word lengths) of the first corpus line.
sentence_pairs[0]

([3, 2, 4, 6], [4, 3, 2, 7])

In [16]:
# Parse the alignment file: each line becomes a list of integer tuples, one
# tuple per whitespace-separated "i-j" token.
with open('symmetrized.align', 'r', encoding='utf-8') as align_file:
    alignments = [
        [tuple(int(index) for index in link.split('-')) for link in row.split()]
        for row in align_file
    ]

In [17]:
# Corpus-level alignment statistics; `sentence_pairs[i]` is paired with
# `alignments[i]`.
#   U-src / U-tgt: share of source / target characters that belong to words
#                  without any in-range link.
#   X:             crossing count divided by the total number of links.
total_src_chars = 0
total_tgt_chars = 0
unaligned_src_chars = 0
unaligned_tgt_chars = 0
crossing_pairs = 0
total_aligned_pairs = 0

for pair_idx, (src_lengths, tgt_lengths) in enumerate(sentence_pairs):
    links = alignments[pair_idx]
    aligned_src_words = set()
    aligned_tgt_words = set()
    total_src_chars += sum(src_lengths)
    total_tgt_chars += sum(tgt_lengths)

    for src_idx, tgt_idx in links:
        # Links pointing past the end of either sentence are ignored.
        if src_idx >= len(src_lengths) or tgt_idx >= len(tgt_lengths):
            continue
        aligned_src_words.add(src_idx)
        aligned_tgt_words.add(tgt_idx)

        # Two links cross when their source order and target order disagree.
        # Every in-range link is compared with all links of the sentence, so
        # a crossing between two in-range links is counted once from each side.
        for other_src_idx, other_tgt_idx in links:
            if (src_idx < other_src_idx and tgt_idx > other_tgt_idx) or (src_idx > other_src_idx and tgt_idx < other_tgt_idx):
                crossing_pairs += 1

    # Words are linked as a whole, so the unaligned characters of a sentence
    # are the summed lengths of its unlinked words.
    unaligned_src_chars += sum(
        length for idx, length in enumerate(src_lengths) if idx not in aligned_src_words
    )
    unaligned_tgt_chars += sum(
        length for idx, length in enumerate(tgt_lengths) if idx not in aligned_tgt_words
    )
    total_aligned_pairs += len(links)

# Guard every ratio against an empty corpus or an empty alignment file.
u_src = unaligned_src_chars / total_src_chars if total_src_chars > 0 else 0
u_tgt = unaligned_tgt_chars / total_tgt_chars if total_tgt_chars > 0 else 0
x = crossing_pairs / total_aligned_pairs if total_aligned_pairs > 0 else 0

print(f"U-src (Proportion of unaligned source characters): {u_src:.3f}")
print(f"U-tgt (Proportion of unaligned target characters): {u_tgt:.3f}")
print(f"X (Proportion of crossing alignment pairs): {x:.3f}")

U-src (Proportion of unaligned source characters): 0.038
U-tgt (Proportion of unaligned target characters): 0.023
X (Proportion of crossing alignment pairs): 0.092
