<a href="https://colab.research.google.com/github/isabelklint/scrapers/blob/main/create_subsampled_stopwords_list.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import csv
import math
from collections import Counter

# Mount Google Drive at /content/drive so files stored there can be read
# and written through ordinary file paths (Colab-only import).
from google.colab import drive
drive.mount('/content/drive')

# Read in corpus from CSV file stored on the mounted Drive
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/corpus.csv")

# Extract the 'text' column from the corpus DataFrame; each entry is treated
# as one document downstream.
# NOTE(review): pandas normally loads empty cells as NaN (a float, not a str)
# -- confirm the 'text' column has no blanks, or that consumers tolerate them.
corpus = df['text']

def generate_stopword_list(corpus, frequency_threshold, probability_threshold):
    """
    Generate a stop word list using subsampling.

    Words are the whitespace-delimited tokens of each document. A word is
    reported as a stop word when its corpus-wide count is greater than
    ``frequency_threshold`` and its subsampling probability,
    ``1 - sqrt(frequency_threshold / count)``, is greater than
    ``probability_threshold``.

    Args:
    corpus (iterable): Documents in the corpus. Entries that are not strings
        (for example NaN from empty cells in a pandas column, or None) are
        skipped instead of raising.
    frequency_threshold (int): Frequency threshold for subsampling.
    probability_threshold (float): Probability threshold for subsampling.

    Returns:
    list: List of stop words, in order of first appearance in the corpus.
    """
    # Count word frequencies in the corpus
    word_counts = Counter()
    for document in corpus:
        # Missing values have no .split(); ignore anything that is not text
        # so one blank row cannot abort the whole run.
        if isinstance(document, str):
            word_counts.update(document.split())

    # Keep words that pass both thresholds. The count check comes first so the
    # square root is only evaluated for words above the frequency threshold.
    return [
        word
        for word, count in word_counts.items()
        if count > frequency_threshold
        and 1 - math.sqrt(frequency_threshold / count) > probability_threshold
    ]

# Build the stop word list from the corpus with the subsampling criteria
stopword_list = generate_stopword_list(
    corpus,
    frequency_threshold=2,
    probability_threshold=0.5,
)

# Persist the stop words to Drive; all words go into a single CSV row
with open(
    '/content/drive/My Drive/amharic_subsampled_stop_words_2023.csv',
    'w',
    newline='',
    encoding='utf-8',
) as file:
    csv.writer(file).writerow(stopword_list)
