In [None]:
import random
import numpy as np
from collections import defaultdict
from google.colab import drive

# Make Google Drive visible to the Colab runtime
drive.mount('/content/drive')

# Location of the source dictionary on the mounted drive
file_path = '/content/drive/My Drive/papers/structuring_dictionaries/ru_kg_udahin.txt'

# Load the file one line per entry; each entry keeps its trailing newline
with open(file_path, mode='r', encoding='utf-8') as file:
    entries = list(file)

Mounted at /content/drive


In [None]:
# Record the character length of every entry and, in the same pass,
# collect the entries that share an exact length into one group
entry_lengths = []
length_groups = defaultdict(list)
for entry in entries:
    length = len(entry)
    entry_lengths.append(length)
    length_groups[length].append(entry)


In [None]:

# Determine the total number of entries
total_entries = len(entries)

# Determine the number of entries to sample
sample_size = 520

# Share of the corpus held by each exact-length group
proportions = {length: len(group) / total_entries for length, group in length_groups.items()}

# Sample proportionally from each group. int() truncates, so a group whose
# share times sample_size is below 1 contributes nothing in this pass.
sampled_entries = []
for length, group in length_groups.items():
    n_samples = int(proportions[length] * sample_size)
    sampled_entries.extend(random.sample(group, n_samples))

# Truncation leaves the sample short of sample_size. Top it up WITHOUT
# replacement from entries that are not in the sample yet, so that no entry
# is included twice (drawing with random.choice could repeat entries).
shortfall = sample_size - len(sampled_entries)
if shortfall > 0:
    already_sampled = set(sampled_entries)
    # dict.fromkeys drops repeated lines while keeping file order
    remaining = [entry for entry in dict.fromkeys(entries) if entry not in already_sampled]
    if len(remaining) < shortfall:
        raise ValueError(
            f"cannot draw {sample_size} distinct entries: "
            f"only {len(already_sampled) + len(remaining)} are available"
        )
    sampled_entries.extend(random.sample(remaining, shortfall))

# Shuffle so the output order does not reveal which group an entry came from
random.shuffle(sampled_entries)

# Show the first 10 sampled entries as a quick sanity check
print(sampled_entries[:10])

['чукчанка\tженск. р. к чукча.\n', 'чесальный,\t\xadая, -ое\\nтароочу, тарагыч;\\nчесальная машина тарагыч машина.\n', 'босиком\tнареч.\\nжыңайлак, жыңалаяк.\n', 'легально\tнареч.\\nлегалдуу (закондуу, законго ылайык, ашкере).\n', 'выбывать\tнесов.\\nсм. выбыть.\n', 'потворщица\tженск. р. к потворщик.\n', 'маргаритка\tж.\\nмаргаритка (гүлү жылдызга окшош өсүмдүктүн бир түрү).\n', 'игольник\tм.\\nийне сайгыч же ийне салгыч куту.\n', 'непривычка\tж. разг.\\nкөнбөгөндүк, адаттанбагандык, тажрыйбасыздык;\\nс непривычки көнбөгөндүктөн, адаттанбагандыктан.\n', 'вполголоса\tнареч.\\nкүбүрөп, акырын үн менен, кыңылдап;\\nпеть вполголоса кыңылдап ырдоо.\n']


In [None]:
# Peek at the second-to-last sampled entry to inspect the raw entry format
sampled_entries[-2]

'отсохнуть\tсов.\\n1. куурап түшүү;\\nветка отсохла бутак куурап түштү;\\n2. разг. сенек болуп кууроо;\\nу него рука отсохла анын колу сенек болуп, куурап калды.\n'

In [None]:
# Split the list into two halves (an odd leftover entry goes to part 2)
half_size = len(sampled_entries) // 2
sampled_entries_part1 = sampled_entries[:half_size]
sampled_entries_part2 = sampled_entries[half_size:]

# Save the sampled entries to two different files
output_path_part1 = '/content/drive/My Drive/papers/structuring_dictionaries/sample_part1.txt'
output_path_part2 = '/content/drive/My Drive/papers/structuring_dictionaries/sample_part2.txt'

# Entries may still carry the trailing '\n' left by readlines(). Drop it
# before joining with '\n'; otherwise every entry is followed by a blank line
# in the output file.
with open(output_path_part1, 'w', encoding='utf-8') as file1:
    file1.write('\n'.join(entry.rstrip('\n') for entry in sampled_entries_part1))

with open(output_path_part2, 'w', encoding='utf-8') as file2:
    file2.write('\n'.join(entry.rstrip('\n') for entry in sampled_entries_part2))

print(f"Sampled entries saved to {output_path_part1} and {output_path_part2}")


Sampled entries saved to /content/drive/My Drive/papers/structuring_dictionaries/sample_part1.txt and /content/drive/My Drive/papers/structuring_dictionaries/sample_part2.txt


In [None]:
# Number of distinct entry lengths, i.e. how many exact-length groups exist
len(set(entry_lengths))

935

In [None]:
# Shortest and longest entry length, in characters
min(entry_lengths), max(entry_lengths)

(12, 6603)

## Stratified buckets

We create buckets based on length ranges and then perform stratified sampling within these buckets.
This ensures that each bucket is represented in the sample in proportion to its size, so the sample preserves the overall distribution of entry lengths.

Algorithmic steps to do that:

    Calculate the length of each entry.
    Define 100 buckets based on the range of lengths.
    Group entries into these buckets.
    Calculate the proportion of entries in each bucket.
    Sample proportionally from each bucket.

Complete script implementing this:

In [None]:
import random
import numpy as np
from collections import defaultdict
from google.colab import drive

# Make Google Drive visible to the Colab runtime
drive.mount('/content/drive')

# Location of the source dictionary on the mounted drive
file_path = '/content/drive/My Drive/papers/structuring_dictionaries/ru_kg_udahin.txt'

# Load one entry per line, trimming leading/trailing whitespace as we read
with open(file_path, mode='r', encoding='utf-8') as file:
    entries = [line.strip() for line in file]

# Calculate lengths of each entry
entry_lengths = [len(entry) for entry in entries]

# Determine min and max lengths
min_length = min(entry_lengths)
max_length = max(entry_lengths)

# Define 100 equal-width buckets spanning [min_length, max_length]
num_buckets = 100
bucket_size = (max_length - min_length) / num_buckets
buckets = defaultdict(list)

for entry, length in zip(entries, entry_lengths):
    if bucket_size == 0:
        # All entries have the same length: a single bucket, and no division by zero
        bucket_index = 0
    else:
        # The longest entry would compute index == num_buckets (a 101st bucket);
        # clamp it into the last bucket so indices stay in 0..num_buckets-1
        bucket_index = min(int((length - min_length) // bucket_size), num_buckets - 1)
    buckets[bucket_index].append(entry)

# Determine the total number of entries
total_entries = len(entries)

# Determine the number of entries to sample
sample_size = 500

# Share of the corpus held by each bucket
proportions = {bucket: len(group) / total_entries for bucket, group in buckets.items()}

# Sample proportionally from each bucket. int() truncates, so small buckets
# may contribute nothing and the total can fall short of sample_size.
sampled_entries = []
for bucket, group in buckets.items():
    n_samples = int(proportions[bucket] * sample_size)
    sampled_entries.extend(random.sample(group, min(n_samples, len(group))))

# Top up to sample_size with distinct entries that are not in the sample yet.
# A set gives O(1) membership checks, and drawing the whole shortfall at once
# cannot loop forever when too few unique entries are left.
shortfall = sample_size - len(sampled_entries)
if shortfall > 0:
    already_sampled = set(sampled_entries)
    # dict.fromkeys drops repeated lines while keeping file order
    remaining = [entry for entry in dict.fromkeys(entries) if entry not in already_sampled]
    if len(remaining) < shortfall:
        raise ValueError(
            f"cannot draw {sample_size} distinct entries: "
            f"only {len(already_sampled) + len(remaining)} are available"
        )
    sampled_entries.extend(random.sample(remaining, shortfall))

# Shuffle so the output order does not reveal which bucket an entry came from
random.shuffle(sampled_entries)

Mounted at /content/drive


# 500 entries

In [None]:
# Destination files for the two halves of the sample
output_path_part1 = '/content/drive/My Drive/papers/structuring_dictionaries/sample_part1.txt'
output_path_part2 = '/content/drive/My Drive/papers/structuring_dictionaries/sample_part2.txt'

# Cut the shuffled sample at its midpoint; an odd leftover entry goes to part 2
half_size = len(sampled_entries) // 2
sampled_entries_part1, sampled_entries_part2 = (
    sampled_entries[:half_size],
    sampled_entries[half_size:],
)

# Write each half as newline-separated text (no trailing newline)
text_part1 = '\n'.join(sampled_entries_part1)
with open(output_path_part1, mode='w', encoding='utf-8') as file1:
    file1.write(text_part1)

text_part2 = '\n'.join(sampled_entries_part2)
with open(output_path_part2, mode='w', encoding='utf-8') as file2:
    file2.write(text_part2)

print(f"Sampled entries saved to {output_path_part1} and {output_path_part2}")

# 12 000 entries

In [None]:
import random
import numpy as np
from collections import defaultdict
from google.colab import drive

def mount_drive():
    """Mount Google Drive at ``/content/drive`` using the Colab ``drive`` helper."""
    mount_point = '/content/drive'
    drive.mount(mount_point)

def read_file(file_path):
    """Read a UTF-8 text file and return its lines as a list of entries.

    Each line has leading and trailing whitespace (including the newline)
    removed. Blank lines are kept and come back as empty strings.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return [line.strip() for line in handle]

def calculate_entry_lengths(entries):
    """Return the length of every entry, in the same order as ``entries``."""
    return list(map(len, entries))

def create_buckets(entries, entry_lengths, num_buckets=100):
    """Group entries into equal-width length buckets.

    The range ``[min(entry_lengths), max(entry_lengths)]`` is divided into
    ``num_buckets`` intervals of equal width, and each entry is placed in the
    interval that contains its length.

    Args:
        entries: The entries to group.
        entry_lengths: Length of each entry, aligned with ``entries``.
        num_buckets: Number of equal-width buckets to divide the range into.

    Returns:
        A ``defaultdict(list)`` mapping bucket index (``0`` to
        ``num_buckets - 1``) to the entries in that bucket. Buckets that
        receive no entry are absent. Empty input yields an empty mapping.
    """
    buckets = defaultdict(list)
    if not entry_lengths:
        # Nothing to group; min()/max() would raise on an empty sequence
        return buckets

    min_length = min(entry_lengths)
    max_length = max(entry_lengths)
    bucket_size = (max_length - min_length) / num_buckets
    last_bucket = num_buckets - 1

    for entry, length in zip(entries, entry_lengths):
        if bucket_size == 0:
            # All entries share one length: single bucket, no division by zero
            bucket_index = 0
        else:
            # The longest entry would compute index == num_buckets; clamp it
            # into the last bucket so exactly num_buckets buckets are possible
            bucket_index = min(int((length - min_length) // bucket_size), last_bucket)
        buckets[bucket_index].append(entry)

    return buckets

def calculate_proportions(buckets, total_entries):
    """Return, for each bucket, the fraction of ``total_entries`` it holds."""
    proportions = {}
    for bucket_id, members in buckets.items():
        proportions[bucket_id] = len(members) / total_entries
    return proportions

def sample_proportionally(buckets, proportions, sample_size):
    """Draw a random sample from each bucket, sized by the bucket's proportion.

    Each bucket's quota is ``int(proportions[bucket] * sample_size)``
    (truncated), capped at the number of entries in the bucket. Entries are
    drawn without replacement within a bucket, and the draws are concatenated
    in bucket iteration order.
    """
    picked = []
    for bucket_id in buckets:
        members = buckets[bucket_id]
        quota = int(proportions[bucket_id] * sample_size)
        if quota > len(members):
            quota = len(members)
        picked += random.sample(members, quota)
    return picked

def adjust_sample_size(sampled_entries, entries, target_size):
    """Top up ``sampled_entries`` to ``target_size`` with new, distinct entries.

    The missing entries are drawn at random, without replacement, from the
    values in ``entries`` that are not already in ``sampled_entries``. The
    list is extended in place and also returned. If it already holds at least
    ``target_size`` items, it is returned unchanged.

    Args:
        sampled_entries: Current sample; mutated in place.
        entries: Pool of candidate entries (must be hashable, e.g. strings).
        target_size: Desired length of the sample.

    Returns:
        The same ``sampled_entries`` list, now of length ``target_size`` or more.

    Raises:
        ValueError: If ``entries`` does not hold enough distinct unsampled
            values to reach ``target_size`` (retrying random draws in that
            situation would never terminate).
    """
    shortfall = target_size - len(sampled_entries)
    if shortfall <= 0:
        return sampled_entries

    # Set membership is O(1); scanning the growing list on every draw is O(n)
    already_sampled = set(sampled_entries)
    # dict.fromkeys drops repeated values while keeping the original order
    remaining = [entry for entry in dict.fromkeys(entries) if entry not in already_sampled]
    if len(remaining) < shortfall:
        raise ValueError(
            f"cannot reach target_size={target_size}: only "
            f"{len(remaining)} unsampled distinct entries are available, "
            f"{shortfall} needed"
        )

    sampled_entries.extend(random.sample(remaining, shortfall))
    return sampled_entries

def main(file_path, sample_size, seed=None):
    """Draw a length-stratified random sample of entries from a text file.

    Mounts Google Drive, reads the file, groups the entries into length
    buckets, samples from each bucket in proportion to its size, tops the
    sample up to ``sample_size`` and shuffles it.

    Args:
        file_path: Path of the text file holding one entry per line.
        sample_size: Number of entries to sample.
        seed: Optional seed for the global ``random`` generator. Pass a value
            to make the sample reproducible across runs. ``None`` (the
            default) leaves the generator state untouched.

    Returns:
        A shuffled list of sampled entries.
    """
    if seed is not None:
        # Seeds the module-level generator used by all sampling helpers
        random.seed(seed)
    mount_drive()
    entries = read_file(file_path)
    entry_lengths = calculate_entry_lengths(entries)
    buckets = create_buckets(entries, entry_lengths)
    total_entries = len(entries)
    proportions = calculate_proportions(buckets, total_entries)
    sampled_entries = sample_proportionally(buckets, proportions, sample_size)
    # Truncated per-bucket quotas usually leave the sample short; fill the gap
    sampled_entries = adjust_sample_size(sampled_entries, entries, sample_size)
    random.shuffle(sampled_entries)
    return sampled_entries

In [None]:
# Source dictionary and requested sample size for the large sample
file_path = '/content/drive/My Drive/papers/structuring_dictionaries/ru_kg_udahin.txt'
sample_size = 12000

# Run the full pipeline and report how many entries were sampled
# (the printed label is Kyrgyz for "Sample size")
result = main(file_path=file_path, sample_size=sample_size)
print(f"Үлгүнүн өлчөмү: {len(result)}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Үлгүнүн өлчөмү: 12000


In [None]:
# Destination file for the 12 000-entry sample
output_path = '/content/drive/My Drive/papers/structuring_dictionaries/12000.txt'

# Write the sample as newline-separated text (no trailing newline)
joined_entries = '\n'.join(result)
with open(output_path, mode='w', encoding='utf-8') as file1:
    file1.write(joined_entries)