In [32]:
#import libraries
import os
import nltk
from nltk import bigrams
from collections import Counter, defaultdict
import pandas as pd
import numpy as np

In [33]:
# read text files from a directory
def read_text_files(directory):
    """Load every ``.txt`` file located directly inside ``directory``.

    Each file is read as UTF-8, lower-cased, and has its newline characters
    replaced by single spaces.

    Parameters
    ----------
    directory : str
        Path of the folder to scan (not recursive).

    Returns
    -------
    dict
        Maps each file's joined path (``os.path.join(directory, name)``) to
        its normalised text. Entries are inserted in sorted filename order.
    """
    texts = {}
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # resulting dict (and everything derived from it) reproducible.
    for name in sorted(os.listdir(directory)):
        if not name.endswith('.txt'):
            continue
        path = os.path.join(directory, name)
        # Skip sub-directories that merely happen to be named "*.txt".
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as handle:
            # Lowercase and remove newlines
            texts[path] = handle.read().lower().replace('\n', ' ')
    return texts

In [34]:
# tokenize and extract bigrams
def tokenize_and_extract_bigrams(texts):
    """Count word bigrams overall and per entry of ``texts``.

    Parameters
    ----------
    texts : dict
        Maps an identifier (used here as the "author" key) to a text string.

    Returns
    -------
    tuple
        ``(overall, per_author)`` where ``overall`` is a ``Counter`` of every
        bigram across all texts and ``per_author`` is a
        ``defaultdict(Counter)`` holding the bigram counts of each key.
    """
    per_author = defaultdict(Counter)
    overall = Counter()
    for source_id, raw_text in texts.items():
        # Materialise the bigram generator once so it can feed both counters.
        pairs = list(bigrams(nltk.word_tokenize(raw_text)))
        overall.update(pairs)
        per_author[source_id].update(pairs)
    return overall, per_author

In [35]:
# calculate statistics
def calculate_statistics(bigram_freq, author_bigram_freq):
    """Summarise how each bigram is distributed across authors.

    Only authors whose counter actually contains a bigram contribute to that
    bigram's statistics; authors lacking it are ignored rather than counted
    as zero.

    Parameters
    ----------
    bigram_freq : Counter
        Overall bigram counts; its keys define which bigrams are reported
        and in which order.
    author_bigram_freq : mapping
        Maps each author key to a ``Counter`` of that author's bigrams.

    Returns
    -------
    tuple
        ``(overall_frequencies, num_author_samples, mean_frequency_per_author,
        std_dev_frequency)``:

        * ``overall_frequencies`` - the ``bigram_freq`` object itself.
        * ``num_author_samples`` - ``Counter`` of how many authors contain
          each bigram.
        * ``mean_frequency_per_author`` - dict of ``np.mean`` over the
          per-author counts.
        * ``std_dev_frequency`` - dict of ``np.std`` over the per-author
          counts (NumPy default, i.e. ``ddof=0``).

        A bigram that no author contains gets a count of 0 and NaN for both
        mean and standard deviation.
    """
    # One pass over the per-author counters instead of re-scanning every
    # author for every bigram. Pre-seeding keeps the key order of bigram_freq.
    counts_by_bigram = {bigram: [] for bigram in bigram_freq}
    for author_counts in author_bigram_freq.values():
        for bigram, count in author_counts.items():
            bucket = counts_by_bigram.get(bigram)
            if bucket is not None:
                bucket.append(count)

    num_author_samples = Counter()
    mean_frequency_per_author = {}
    std_dev_frequency = {}
    for bigram, counts in counts_by_bigram.items():
        num_author_samples[bigram] = len(counts)
        if counts:
            mean_frequency_per_author[bigram] = np.mean(counts)
            std_dev_frequency[bigram] = np.std(counts)
        else:
            # np.mean/np.std on an empty list yield NaN plus a RuntimeWarning;
            # produce the same NaN result without the warning.
            mean_frequency_per_author[bigram] = np.float64(np.nan)
            std_dev_frequency[bigram] = np.float64(np.nan)

    return bigram_freq, num_author_samples, mean_frequency_per_author, std_dev_frequency

In [36]:
# functions for building the results DataFrame and saving it to CSV
def create_dataframe(overall_frequencies, num_author_samples, mean_frequency_per_author, std_dev_frequency):
    """Assemble the per-bigram statistics into one table.

    Rows follow the iteration order of ``overall_frequencies``. Columns:

    * ``ngram`` - the two bigram elements joined by a single space
    * ``f``     - value from ``overall_frequencies``
    * ``a``     - value from ``num_author_samples``
    * ``m``     - value from ``mean_frequency_per_author``
    * ``sd``    - value from ``std_dev_frequency``

    Returns
    -------
    pandas.DataFrame
    """
    labels, totals, author_counts, means, deviations = [], [], [], [], []
    for pair, total in overall_frequencies.items():
        labels.append("{} {}".format(pair[0], pair[1]))
        totals.append(total)
        author_counts.append(num_author_samples[pair])
        means.append(mean_frequency_per_author[pair])
        deviations.append(std_dev_frequency[pair])

    return pd.DataFrame({
        'ngram': labels,
        'f': totals,
        'a': author_counts,
        'm': means,
        'sd': deviations,
    })

def save_to_csv(df, filename):
    """Write ``df`` to ``filename`` in CSV format, leaving out the index column."""
    df.to_csv(path_or_buf=filename, index=False)
    
def save_sorted_by_authors(df, sorted_filename):
    """Write ``df`` to CSV ordered by the ``a`` column, largest first.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing an ``a`` column; it is not modified.
    sorted_filename : str or file-like
        Destination passed to ``DataFrame.to_csv``; the index is not written.
    """
    # The default sort algorithm is not stable. A stable sort keeps rows that
    # share the same 'a' value in their original relative order, so the
    # output is reproducible.
    sorted_df = df.sort_values(by='a', ascending=False, kind='mergesort')
    sorted_df.to_csv(sorted_filename, index=False)
    
def save_filtered_by_threshold(df, threshold, filtered_filename):
    """Write the rows whose ``a`` value is at most ``threshold`` to CSV.

    The surviving rows are ordered by ``a``, largest first.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing an ``a`` column; it is not modified.
    threshold : number
        Inclusive upper bound applied to the ``a`` column.
    filtered_filename : str or file-like
        Destination passed to ``DataFrame.to_csv``; the index is not written.
    """
    # NOTE(review): this keeps rows with a <= threshold (the rarer bigrams);
    # confirm that this direction, rather than >=, is what is intended.
    filtered_df = df[df['a'] <= threshold]
    # A stable sort keeps tied rows in their original relative order.
    sorted_filtered_df = filtered_df.sort_values(by='a', ascending=False, kind='mergesort')
    sorted_filtered_df.to_csv(filtered_filename, index=False)


In [37]:
# main function
def main(directory, output_file, sorted_output_file, threshold, filtered_output_file):
    """Run the whole pipeline: read texts, count bigrams, write three CSVs.

    Parameters
    ----------
    directory : str
        Folder handed to ``read_text_files``.
    output_file : str
        Destination of the unsorted statistics table.
    sorted_output_file : str
        Destination of the table written by ``save_sorted_by_authors``.
    threshold : number
        Cut-off handed to ``save_filtered_by_threshold``.
    filtered_output_file : str
        Destination of the table written by ``save_filtered_by_threshold``.
    """
    corpus = read_text_files(directory)
    totals, per_author = tokenize_and_extract_bigrams(corpus)

    overall, author_counts, means, deviations = calculate_statistics(totals, per_author)
    table = create_dataframe(overall, author_counts, means, deviations)

    # The same table is written three times: as is, sorted, and filtered.
    save_to_csv(table, output_file)
    save_sorted_by_authors(table, sorted_output_file)
    save_filtered_by_threshold(table, threshold, filtered_output_file)

# Folder that is scanned for input text files
directory = 'source'
# CSV receiving the full, unsorted table
output_file = 'output.csv'
# CSV receiving the table sorted by the 'a' column
sorted_output_file = 'output_sorted.csv'
# Cut-off on the 'a' column used for the filtered CSV
threshold = 5
# CSV receiving the filtered table
filtered_output_file = 'output_filtered.csv'

# Execute the pipeline with the settings above
main(
    directory=directory,
    output_file=output_file,
    sorted_output_file=sorted_output_file,
    threshold=threshold,
    filtered_output_file=filtered_output_file,
)