In [9]:
import pandas as pd
import numpy as np
import re
from collections import Counter
from typing import List, Dict, Tuple
import os
from pathlib import Path

# --- Configuration ---
# Set the current directory as the document source
DOCUMENTS_DIR = Path('.')
# Define the Query and Vocabulary Size limit
QUERY_TERMS = ["computer", "game", "china","software"]
VOCABULARY_SIZE = 10
# ---------------------

def load_documents_from_disk(directory: Path) -> Dict[str, str]:
    """Dynamically finds and loads content from all .txt files in the specified directory."""
    docs_content = {}

    # Iterate through all .txt files in the directory
    for file_path in directory.glob('*.txt'):
        try:
            # Skip files that might be part of an operating system or non-article files
            if file_path.name.startswith('article'):
                with open(file_path, 'r', encoding='utf-8') as f:
                    docs_content[file_path.name] = f.read()
        except Exception as e:
            print(f"Error reading file {file_path.name}: {e}")

    # If fewer than 10 documents are found, adjust the count
    if len(docs_content) == 0:
        print("Error: No .txt files found. Please ensure your files are named 'articleX.txt' and are in the same directory.")
    elif len(docs_content) < 10:
        print(f"Warning: Only {len(docs_content)} documents found. Calculations will use this number (N={len(docs_content)}).")

    return docs_content

def preprocess(text: str) -> List[str]:
    """Tokenizes text to lowercase words, removing punctuation."""
    # Remove punctuation and convert to lowercase
    text = re.sub(r'[^\w\s]', ' ', text.lower())
    # Split by whitespace
    return text.split()

def build_tf_matrix(docs_content: Dict[str, str], vocabulary_size: int, query_terms: List[str]) -> Tuple[pd.DataFrame, List[str]]:
    """Generates the raw TF matrix and selects the top 'vocabulary_size' terms, ensuring query terms are included."""
    doc_tfs = {}

    # 1. Calculate raw TF for all words in all documents
    for doc_name, content in docs_content.items():
        tokens = preprocess(content)
        doc_tfs[doc_name] = Counter(tokens)

    # 2. Determine the overall vocabulary (top terms)
    all_words = Counter()
    for tf_counts in doc_tfs.values():
        all_words.update(tf_counts)

    # Define common stopwords to exclude from the meaningful vocabulary
    stopwords = {'the', 'a', 'of', 'and', 'to', 'in', 'is', 'that', 'with', 'or', 'by', 'as', 'it', 'for', 'are', 'which', 'its', 'from', 'on', 'can','an'}

    # Get all words sorted by frequency, excluding stopwords
    meaningful_words = [word for word, count in all_words.most_common() if word not in stopwords and word not in query_terms]

    # Final Vocabulary: Query terms + top N-K meaningful words (where K is number of query terms)
    num_other_words = vocabulary_size - len(query_terms)
    final_vocab = query_terms + meaningful_words[:num_other_words]

    # 3. Build the final TF DataFrame
    df_tf_data = {}
    for term in final_vocab:
        df_tf_data[term] = [doc_tfs[doc_name].get(term, 0) for doc_name in docs_content.keys()]

    df_tf = pd.DataFrame(df_tf_data, index=list(docs_content.keys()))

    return df_tf, final_vocab

# TF-IDF Calculation Functions (same as before)
def calculate_log_tf(tf_matrix: pd.DataFrame) -> pd.DataFrame:
    """Calculates Log Term Frequency: log(1 + TF)"""
    return np.log1p(tf_matrix)

def calculate_idf(tf_matrix: pd.DataFrame, N: int) -> pd.Series:
    """Calculates Inverse Document Frequency: log(N / DF)"""
    df = (tf_matrix > 0).sum(axis=0)
    idf = np.log(N / df)
    return idf

def calculate_tfidf_scores(log_tf_matrix: pd.DataFrame, idf_series: pd.Series) -> pd.DataFrame:
    """Calculates TF-IDF: log_tf * idf"""
    return log_tf_matrix * idf_series

def calculate_query_score(tfidf_matrix: pd.DataFrame, query_terms: List[str]) -> pd.Series:
    """Calculates the Document Score (Sum of TF-IDF for query terms, assuming Query TF = 1)."""
    relevant_terms = [t for t in query_terms if t in tfidf_matrix.columns]

    if not relevant_terms:
        print("Warning: None of the query terms were found in the calculated vocabulary.")
        return pd.Series(0.0, index=tfidf_matrix.index)

    tfidf_query_subset = tfidf_matrix.loc[:, relevant_terms]
    scores = tfidf_query_subset.sum(axis=1)

    return scores

# --- Main Execution Block ---

# 1. Load Documents
DOCUMENTS_CONTENT = load_documents_from_disk(DOCUMENTS_DIR)
N = len(DOCUMENTS_CONTENT)

if N > 0:
    # 2. Build the Raw TF Matrix
    df_tf_raw, calculated_vocabulary = build_tf_matrix(
        DOCUMENTS_CONTENT,
        vocabulary_size=VOCABULARY_SIZE,
        query_terms=QUERY_TERMS
    )

    # 3. Perform TF-IDF Calculations
    df_log_tf = calculate_log_tf(df_tf_raw)
    idf_scores = calculate_idf(df_tf_raw, N)
    df_tfidf = calculate_tfidf_scores(df_log_tf, idf_scores)
    query_scores = calculate_query_score(df_tfidf, QUERY_TERMS)

    # 4. Prepare Final Output
    df_output = df_tfidf.copy()
    df_output.insert(0, 'Query_Score', query_scores)
    df_output_sorted = df_output.sort_values(by='Query_Score', ascending=False)


    # --- 5. Presentation ---

    print("## Input Table: Raw Term Frequency (TF)")
    print(f"**Total Documents (N):** {N}")
    print(f"**Query:** {' + '.join(QUERY_TERMS)}")
    print(f"**Vocabulary (10 terms):** {calculated_vocabulary}")
    print("---")
    print(df_tf_raw)

    print("\n" + "="*80 + "\n")

    print("## Intermediate Results: IDF Scores")
    print("IDF (Inverse Document Frequency): High scores indicate rare terms.")
    print("---")
    print(idf_scores.sort_values(ascending=False))

    print("\n" + "="*80 + "\n")

    print("## Final Output Table: Log TF-IDF and Query Score")
    print("Table is sorted by 'Query_Score' (relevance).")
    print("---")
    print(df_output_sorted)

## Input Table: Raw Term Frequency (TF)
**Total Documents (N):** 10
**Query:** computer + game + china + software
**Vocabulary (10 terms):** ['computer', 'game', 'china', 'software', 'data', 'football', 'games', 'was', 'video', 's']
---
               computer  game  china  software  data  football  games  was  \
article1.txt          0     1      0         0     0        32      3    1   
article3.txt          1     0      0         3     0         0      0    0   
article5.txt          1     0      0         0     0         0      0    0   
article4.txt          7     0      0         1     0         0      0    0   
article2.txt          5    14      0         0     0         0     18    2   
article6.txt          0     0      0         4    44         0      0    2   
article9.txt          0     0      0         0     0         0      0    1   
article8.txt          0     0     14         0     0         0      0    7   
article10.txt         0     3      0         0     0         