In [9]:
import os
import glob
import re
from collections import Counter
from langdetect import detect
import langdetect

In [12]:
# English function words (pronouns, auxiliaries, prepositions, conjunctions, ...)
# that the stopword statistics below treat as carrying little meaning.
# Written as one whitespace-delimited block and split into a set for fast
# membership tests.
STOPWORDS = set(
    """
    i me my myself we our ours ourselves you your yours
    he him his himself she her hers it its itself they
    them their theirs themselves what which who whom this
    that these those am is are was were be been being
    have has had having do does did doing a an the
    and but if or because as until while of at by
    for with about against between into through during
    before after above below to from up down in out
    on off over under again further then once here
    there when where why how all any both each few
    more most other some such no nor not only own
    same so than too very s t can will just don
    should now
    """.split()
)

def count_words_and_sentences(content):
    """Tokenize ``content`` into lowercase words and rough sentence fragments.

    Args:
        content: The text to analyse.

    Returns:
        A ``(words, sentences)`` tuple. ``words`` is the list of runs of word
        characters found in the lowercased text. ``sentences`` is the list of
        non-blank pieces obtained by splitting the original text on runs of
        ``.``, ``!`` and ``?``; the pieces keep their original case and any
        surrounding whitespace.
    """
    lowered = content.lower()
    # Words: every run of word characters delimited by word boundaries
    word_tokens = re.findall(r'\b\w+\b', lowered)

    # Sentences: cut at sentence-ending punctuation, then drop blank pieces
    sentence_pieces = []
    for piece in re.split(r'[.!?]+', content):
        if piece.strip():
            sentence_pieces.append(piece)

    return word_tokens, sentence_pieces

def get_language_distribution(text):
    """Return the language label that langdetect assigns to ``text``.

    Despite the name, a single label is returned per call rather than a
    distribution.

    Args:
        text: The text to classify.

    Returns:
        The value returned by ``langdetect.detect`` for ``text``, or the
        string ``"unknown"`` when detection raises ``LangDetectException``.
    """
    # NOTE(review): langdetect is reported to be non-deterministic unless
    # DetectorFactory.seed is fixed before the first detection -- confirm
    # whether a seed should be set in the notebook's setup cell.
    try:
        detected = detect(text)
    except langdetect.lang_detect_exception.LangDetectException:
        detected = "unknown"
    return detected

def count_words_in_directory(directory_path):
    """Aggregate word, sentence and language statistics over Markdown files.

    Every regular file matching ``*.md`` under ``directory_path`` (searched
    recursively) is read as UTF-8, tokenized with
    ``count_words_and_sentences`` and classified with
    ``get_language_distribution``.

    Args:
        directory_path: Root directory to search.

    Returns:
        A 4-tuple ``(total_words, total_sentences, file_count,
        language_counter)``: the concatenated word tokens of all files, the
        summed sentence count, the number of files processed, and a
        ``Counter`` mapping each language label to the number of files that
        received it.

    Raises:
        OSError: If a matched file cannot be opened.
        UnicodeDecodeError: If a matched file is not valid UTF-8.
    """
    pattern = os.path.join(directory_path, '**/*.md')
    # glob yields matches in arbitrary, filesystem-dependent order; sorting
    # makes the word list (and therefore tie-breaking in later frequency
    # rankings) reproducible. Directories whose name ends in ".md" are
    # filtered out so open() is only ever called on regular files.
    markdown_files = sorted(
        path for path in glob.glob(pattern, recursive=True) if os.path.isfile(path)
    )

    total_words = []
    total_sentences = 0
    language_counter = Counter()

    for md_file in markdown_files:
        with open(md_file, 'r', encoding='utf-8') as file:
            content = file.read()

        # The handle is already closed here; the analysis needs only the text.
        words, sentences = count_words_and_sentences(content)
        total_words.extend(words)
        total_sentences += len(sentences)

        # One language vote per file
        language_counter[get_language_distribution(content)] += 1

    return total_words, total_sentences, len(markdown_files), language_counter

def get_file_size_in_mb(directory_path):
    """Return the combined size of all files under ``directory_path`` in MB.

    The directory tree is walked recursively and every file is counted,
    whatever its extension. One MB is taken to be 1024 * 1024 bytes.

    Args:
        directory_path: Root directory to measure.

    Returns:
        The total size as a float; ``0.0`` when no files are found.
    """
    total_size = 0
    for dirpath, _dirnames, filenames in os.walk(directory_path):
        for filename in filenames:
            try:
                total_size += os.path.getsize(os.path.join(dirpath, filename))
            except FileNotFoundError:
                # Dangling symlink, or a file removed while walking: there is
                # nothing to measure, so skip it instead of aborting the
                # whole measurement.
                continue
    return total_size / (1024 * 1024)  # bytes -> MB

def get_top_frequent_words(words, top_n=10):
    """Return the ``top_n`` most common entries of ``words`` with their counts.

    Args:
        words: Iterable of tokens to tally.
        top_n: Maximum number of entries to return.

    Returns:
        A list of ``(word, count)`` tuples as produced by
        ``Counter.most_common``, most frequent first.
    """
    return Counter(words).most_common(top_n)

def get_word_frequency_distribution(words):
    """Tally how many times each entry of ``words`` occurs.

    Args:
        words: Iterable of tokens to tally.

    Returns:
        A ``Counter`` mapping each distinct word to its number of occurrences.
    """
    frequencies = Counter()
    frequencies.update(words)
    return frequencies

def stopword_usage(words):
    """Measure how many tokens in ``words`` are stopwords.

    Args:
        words: Iterable of tokens to check against ``STOPWORDS``.

    Returns:
        A ``(occurrences, vocabulary_size)`` tuple: the number of tokens in
        ``words`` that are members of ``STOPWORDS`` (repeats are counted each
        time), and the number of entries in ``STOPWORDS`` itself.
    """
    occurrences = len([token for token in words if token in STOPWORDS])
    return occurrences, len(STOPWORDS)

# Root directory of the Markdown corpus. Set the RITSU_MARKDOWN_DIR
# environment variable to analyse a different location without editing the
# notebook; when it is unset the original hard-coded path is used.
directory_path = os.environ.get(
    'RITSU_MARKDOWN_DIR',
    '/home/subin/Desktop/subin/ritsu_bot/markdown_files',
)

# Gather corpus-wide statistics
all_words, total_sentences, file_count, language_counter = count_words_in_directory(directory_path)
total_word_count = len(all_words)
# Guard against division by zero when no Markdown files were found
average_word_count = total_word_count / file_count if file_count > 0 else 0
# Size on disk covers every file under the directory, not only *.md
total_file_size_mb = get_file_size_in_mb(directory_path)
unique_word_count = len(set(all_words))
# stopword_count is the number of stopword occurrences among all tokens;
# total_stopwords is the size of the stopword list
stopword_count, total_stopwords = stopword_usage(all_words)

In [13]:
# Build the full frequency table and rank the ten most common tokens
word_freq_distribution = get_word_frequency_distribution(all_words)
top_10_words = get_top_frequent_words(all_words, top_n=10)

# Emit the summary report, one statistic per line
print(
    f"Total number of document files: {file_count}",
    f"Total word count: {total_word_count}",
    f"Total sentence count: {total_sentences}",
    f"Average word count per document: {average_word_count:.2f}",
    f"Total storage size of dataset: {total_file_size_mb:.2f} MB",
    f"Unique words: {unique_word_count}",
    f"Top 10 most frequent words: {top_10_words}",
    f"Language distribution: {language_counter}",
    f"Stopword usage: {stopword_count} stopwords used out of {total_stopwords} available stopwords",
    sep="\n",
)

Total number of document files: 196
Total word count: 196835
Total sentence count: 10387
Average word count per document: 1004.26
Total storage size of dataset: 1.26 MB
Unique words: 8126
Top 10 most frequent words: [('td', 19134), ('1', 11276), ('colspan', 9902), ('the', 6046), ('of', 3787), ('and', 3489), ('to', 3353), ('in', 2719), ('valign', 2718), ('tr', 2312)]
Language distribution: Counter({'en': 196})
Stopword usage: 43544 stopwords used out of 124 available stopwords
