In [20]:
#import libraries
import os
import nltk
from nltk import bigrams
from collections import Counter, defaultdict
import pandas as pd
import numpy as np

In [21]:
# read text files from a directory
def read_text_files(directory):
    """Load every ``.txt`` file directly inside ``directory`` as normalized text.

    Each file is read as UTF-8, lowercased, and has every newline character
    replaced by a single space.

    Args:
        directory: Path of the folder to scan (sub-folders are not visited).

    Returns:
        dict mapping each file's path (``directory`` joined with the file
        name) to its normalized text, inserted in sorted file-name order.
    """
    texts = {}
    # os.listdir returns names in arbitrary order; sorting keeps the dict
    # order, and anything downstream that depends on iteration order,
    # reproducible across runs and platforms.
    for name in sorted(os.listdir(directory)):
        if not name.endswith('.txt'):
            continue
        path = os.path.join(directory, name)
        with open(path, 'r', encoding='utf-8') as handle:
            raw = handle.read()
        # Lowercase and flatten line breaks so the text is one long line.
        texts[path] = raw.lower().replace('\n', ' ')
    return texts

In [22]:
# tokenize and extract bigrams
def tokenize_and_extract_bigrams(texts):
    """Count word bigrams across all texts and separately per text.

    Args:
        texts: dict mapping a key (used here as the "author" identifier) to
            the text to analyse.

    Returns:
        A pair ``(bigram_freq, author_bigram_freq)``: a Counter of bigram
        occurrences over every text, and a defaultdict mapping each key of
        ``texts`` to the Counter of bigrams found in that text alone.
    """
    corpus_counts = Counter()
    per_source_counts = defaultdict(Counter)
    for source, content in texts.items():
        # Tokenize with NLTK, then pair each token with its successor.
        pairs = list(bigrams(nltk.word_tokenize(content)))
        per_source_counts[source].update(pairs)
        corpus_counts.update(pairs)
    return corpus_counts, per_source_counts

In [23]:
# calculate statistics
def calculate_statistics(bigram_freq, author_bigram_freq):
    """Compute per-bigram statistics over the authors that use each bigram.

    Only authors whose counter contains a bigram contribute to that bigram's
    statistics; authors that never use it are ignored rather than counted
    as zero.

    Args:
        bigram_freq: Mapping of bigram -> total count; its keys decide which
            bigrams are reported.
        author_bigram_freq: Mapping of author -> mapping of bigram -> count
            for that author.

    Returns:
        A 4-tuple ``(overall_frequencies, num_author_samples,
        mean_frequency_per_author, std_dev_frequency)``:
          * ``overall_frequencies`` is ``bigram_freq`` itself, unchanged;
          * ``num_author_samples`` is a Counter of how many authors have the
            bigram;
          * ``mean_frequency_per_author`` is the mean of those authors'
            counts;
          * ``std_dev_frequency`` is their population standard deviation
            (``np.std`` default, ddof=0).
        A bigram that no author contains gets 0 authors and NaN for both the
        mean and the standard deviation.
    """
    # Gather each bigram's per-author counts in one pass over the author
    # counters, instead of probing every author for every bigram
    # (which costs bigrams x authors lookups).
    counts_by_bigram = defaultdict(list)
    for author_counts in author_bigram_freq.values():
        for bigram, count in author_counts.items():
            counts_by_bigram[bigram].append(count)

    num_author_samples = Counter()
    mean_frequency_per_author = {}
    std_dev_frequency = {}
    for bigram in bigram_freq:
        frequencies = counts_by_bigram.get(bigram, [])
        num_author_samples[bigram] = len(frequencies)
        if frequencies:
            mean_frequency_per_author[bigram] = np.mean(frequencies)
            std_dev_frequency[bigram] = np.std(frequencies)
        else:
            # np.mean/np.std of an empty list also give NaN but emit a
            # RuntimeWarning; assign NaN directly to stay quiet.
            mean_frequency_per_author[bigram] = np.nan
            std_dev_frequency[bigram] = np.nan

    return bigram_freq, num_author_samples, mean_frequency_per_author, std_dev_frequency

In [24]:
# build the results DataFrame and write it to CSV files
def create_dataframe(overall_frequencies, num_author_samples, mean_frequency_per_author, std_dev_frequency):
    """Assemble the per-bigram statistics into a single DataFrame.

    One row is produced per key of ``overall_frequencies``, in its iteration
    order, with these columns:
      * ``ngram`` - the two tokens of the bigram joined by a space
      * ``f``     - value from ``overall_frequencies``
      * ``a``     - value from ``num_author_samples``
      * ``m``     - value from ``mean_frequency_per_author``
      * ``sd``    - value from ``std_dev_frequency``
    """
    labels, totals, author_counts, means, deviations = [], [], [], [], []
    for pair, total in overall_frequencies.items():
        labels.append(f"{pair[0]} {pair[1]}")
        totals.append(total)
        author_counts.append(num_author_samples[pair])
        means.append(mean_frequency_per_author[pair])
        deviations.append(std_dev_frequency[pair])

    return pd.DataFrame({
        'ngram': labels,
        'f': totals,
        'a': author_counts,
        'm': means,
        'sd': deviations,
    })

def save_to_csv(df, filename):
    """Write ``df`` to ``filename`` as CSV, leaving out the index column."""
    df.to_csv(path_or_buf=filename, index=False)
    
def save_sorted_by_authors(df, sorted_filename):
    """Write ``df`` to ``sorted_filename`` ordered by column ``a``, largest first.

    ``df`` itself is not modified; the index is not written to the file.
    """
    df.sort_values(by='a', ascending=False).to_csv(sorted_filename, index=False)
    
def save_filtered_by_threshold(df, threshold, filtered_filename):
    """Save the rows whose ``a`` value is at most ``threshold``.

    The kept rows are ordered by ``a`` from largest to smallest and written
    to ``filtered_filename`` without the index column.
    """
    within_limit = df['a'] <= threshold
    kept = df.loc[within_limit]
    kept.sort_values(by='a', ascending=False).to_csv(filtered_filename, index=False)


In [25]:
def merge_csv_files(directory_or_file):
    """Load one CSV file, or concatenate every ``.csv`` file in a directory.

    Args:
        directory_or_file: Path to a single CSV file, or to a directory whose
            ``.csv`` files (sub-folders are not visited) are read and stacked
            row-wise.

    Returns:
        A DataFrame. For a directory, the files are concatenated in sorted
        file-name order with a fresh 0..n-1 index.

    Raises:
        ValueError: if the directory contains no ``.csv`` files.
    """
    if not os.path.isdir(directory_or_file):
        return pd.read_csv(directory_or_file)

    # os.listdir returns names in arbitrary order; sort so the merged row
    # order is the same on every run and platform.
    csv_paths = [
        os.path.join(directory_or_file, name)
        for name in sorted(os.listdir(directory_or_file))
        if name.endswith('.csv')
    ]
    if not csv_paths:
        # pd.concat([]) would raise the same exception type with an opaque
        # "No objects to concatenate" message; name the real cause instead.
        raise ValueError(f"No .csv files found in directory: {directory_or_file!r}")

    return pd.concat([pd.read_csv(path) for path in csv_paths], ignore_index=True)

In [26]:
def compare_file(output_csv_path, user_input_path):
    """Compare a generated statistics CSV against user-supplied CSV data.

    Both inputs are joined on ``ngram`` (inner join); rows whose ``a`` value
    is equal on both sides are kept, and the most frequent ``a`` value among
    those rows is reported.

    Args:
        output_csv_path: Path to the CSV produced by this notebook.
        user_input_path: A CSV file, or a directory of CSV files, providing
            the columns ``ngram``, ``f``, ``a``, ``m`` and ``sd``.

    Returns:
        A pair ``(most_matched_author, most_matched_count)``. Despite the
        name, the first element is the most common value of the ``a``
        column among the matching rows, not an author identifier
        (NOTE(review): confirm this is the intended meaning). The second is
        how many matching rows carry that value. When no rows match, the
        result is ``(None, 0)``.
    """
    output_df = pd.read_csv(output_csv_path)
    user_df = merge_csv_files(user_input_path)

    # Keep only the statistics columns so the merge suffixes stay predictable.
    user_df = user_df[['ngram', 'f', 'a', 'm', 'sd']]

    merged_df = pd.merge(output_df, user_df, on='ngram', suffixes=('_output', '_user'))

    common_bigrams = merged_df[merged_df['a_output'] == merged_df['a_user']]
    if common_bigrams.empty:
        # idxmax() raises on an empty Series; report "no match" instead.
        return None, 0

    author_counts = common_bigrams['a_user'].value_counts()

    most_matched_author = author_counts.idxmax()
    most_matched_count = author_counts.max()

    return most_matched_author, most_matched_count

In [27]:
def sort_bigrams_by_unique_authors(df, sorted_filename):
    """Save ``df`` to ``sorted_filename`` with rows ordered by ``a``, descending.

    The index column is not written and ``df`` is left untouched.
    """
    # Same operation as save_sorted_by_authors; kept as a separate entry point.
    by_author_count = df.sort_values('a', ascending=False)
    by_author_count.to_csv(path_or_buf=sorted_filename, index=False)

In [28]:
def filter_bigrams_by_author_range(df, u, v, filtered_filename):
    """Save the rows whose ``a`` value lies in the closed range ``[u, v]``.

    The selected rows are ordered by ``a`` from largest to smallest and
    written to ``filtered_filename`` without the index column.
    """
    # Series.between includes both endpoints by default.
    in_range = df['a'].between(u, v)
    selected = df.loc[in_range]
    selected.sort_values(by='a', ascending=False).to_csv(filtered_filename, index=False)

In [29]:
# main function
def main(directory, output_file, sorted_output_file, threshold, filtered_output_file, user_input_path, sorted_unique_authors_file, u, v, author_range_file):
    """Run the whole bigram pipeline and write every output file.

    Steps, in order:
      1. Read the ``.txt`` files in ``directory`` and count their bigrams.
      2. Compute per-bigram statistics and build the results DataFrame.
      3. Write it to ``output_file``, sorted by ``a`` to
         ``sorted_output_file``, and filtered by ``threshold`` to
         ``filtered_output_file``.
      4. Compare ``sorted_output_file`` with the data at ``user_input_path``
         and print the best match.
      5. Write the ``a``-sorted table to ``sorted_unique_authors_file`` and
         the rows with ``u <= a <= v`` to ``author_range_file``.
    """
    corpus = read_text_files(directory)
    counts = tokenize_and_extract_bigrams(corpus)
    stats = calculate_statistics(*counts)
    df = create_dataframe(*stats)

    save_to_csv(df, output_file)
    save_sorted_by_authors(df, sorted_output_file)
    save_filtered_by_threshold(df, threshold, filtered_output_file)

    # The comparison reads the sorted CSV back from disk, so it has to run
    # after save_sorted_by_authors has written it.
    best_match, match_total = compare_file(sorted_output_file, user_input_path)
    print(f"Author with the most matches: {best_match} (Matches: {match_total})")

    # Remaining exports: ordering by author count, and the [u, v] author range.
    sort_bigrams_by_unique_authors(df, sorted_unique_authors_file)
    filter_bigrams_by_author_range(df, u, v, author_range_file)

# Folder scanned for the source .txt files
directory = 'source'
# CSV holding the full bigram statistics table
output_file = 'output.csv'
# Same table, ordered by author count (column 'a'), largest first
sorted_output_file = 'output_sorted.csv'
# Upper bound on the author count for rows kept in the filtered CSV
threshold = 5
# CSV holding only the rows that pass the threshold filter
filtered_output_file = 'output_filtered.csv'
# Data to compare against: a single CSV file or a directory of CSV files
user_input_path = 'test_inp/'

# CSV of bigrams ordered by author count
sorted_unique_authors_file = 'bigram_sorted_by_unique_authors.csv'

# CSV of bigrams whose author count lies in the inclusive range [u, v]
author_range_file = 'bigram_author_range.csv'
# Inclusive lower (u) and upper (v) bounds of that author-count range
u = 2
v = 5

# Run the full pipeline with the settings above
main(
    directory=directory,
    output_file=output_file,
    sorted_output_file=sorted_output_file,
    threshold=threshold,
    filtered_output_file=filtered_output_file,
    user_input_path=user_input_path,
    sorted_unique_authors_file=sorted_unique_authors_file,
    u=u,
    v=v,
    author_range_file=author_range_file,
)

Author with the most matches: 1 (Matches: 6)
