# TF-IDF Analysis Results

This notebook visualizes the results from TF-IDF baseline experiments.

**Configurations tested:**
- Text types: `original` (raw MinerU markdown) vs `clean` (normalized markdown)
- Targets:
  - `binary`: accept (1) vs reject (0)
  - `decision`: 4-class (reject, poster, spotlight, oral)
  - `citation`: citations_normalized_by_year (percentile rank within year)

In [None]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

# Root directory of the TF-IDF outputs; results are read from
# <output_base>/<text_type>/<target>/ by the cells below.
output_base = Path("generated_tf_idf_analysis")

# Every (text_type, target) pair to report on, grouped by text type.
configs = [
    (text_type, target)
    for text_type in ("original", "clean")
    for target in ("binary", "decision", "citation")
]

## 1. Load All Results

In [None]:
# Collect one DataFrame per configuration whose results file exists on disk,
# reporting each configuration as either loaded or missing.
all_results = []
for text_type, target in configs:
    results_path = output_base / text_type / target / "test_results.csv"
    if not results_path.exists():
        print(f"Missing: {text_type}/{target}")
        continue
    df = pd.read_csv(results_path)
    all_results.append(df)
    print(f"Loaded: {text_type}/{target}")

# Stack everything into one table. `combined` is only defined when at least
# one file was loaded, which is why later cells check `all_results` first.
if not all_results:
    print("No results found. Run sbatch/tf_idf_grid_search.sbatch first.")
else:
    combined = pd.concat(all_results, ignore_index=True)
    print(f"\nTotal experiments: {len(combined)}")

## 2. Results Table

In [None]:
# Render the combined results table, but only if at least one file was loaded
# (otherwise `combined` does not exist).
if len(all_results) > 0:
    display(combined)

## 3. Classification Results (Binary & Decision)

For the binary and decision targets, we report precision, recall, and F1 using a binary mapping:
- accept (poster, spotlight, oral) = 1
- reject = 0

In [None]:
if all_results:
    # Keep only the rows belonging to the two classification targets.
    is_classification = combined['target'].isin(['binary', 'decision'])
    classification_results = combined.loc[is_classification]
    if classification_results.shape[0] > 0:
        print("Classification Results (accept vs reject metrics):")
        print("=" * 60)
        # Identifier columns, then metrics, then split sizes.
        classification_columns = [
            'text_type', 'target',
            'accuracy', 'precision', 'recall', 'f1',
            'train_size', 'test_size',
        ]
        display(classification_results[classification_columns])

## 4. Regression Results (Citation)

For citation prediction, we report the Pearson correlation coefficient and its p-value.

In [None]:
if all_results:
    # Keep only the rows for the citation (regression) target.
    is_citation = combined['target'].eq('citation')
    regression_results = combined.loc[is_citation]
    if regression_results.shape[0] > 0:
        print("Citation Prediction Results:")
        print("=" * 60)
        # Identifier columns, then correlation statistics, then split sizes.
        regression_columns = [
            'text_type', 'target',
            'correlation', 'p_value',
            'train_size', 'test_size',
        ]
        display(regression_results[regression_columns])

## 5. Top Features for Binary Classification (Accept vs Reject)

In [None]:
# Print the top accept/reject features saved for the binary target, once per
# text type. Text types whose features file is absent are skipped silently.
for text_type in ['original', 'clean']:
    features_path = output_base / text_type / "binary" / "generated_features.csv"
    if features_path.exists():
        df = pd.read_csv(features_path)
        print(f"\n{'='*60}")
        print(f"Top Features - {text_type.upper()} text (Binary)")
        print(f"{'='*60}")
        for col in ('accept', 'reject'):
            print(f"\nTop 20 {col.upper()} features:")
            # Drop NaN entries (present when the two columns differ in length),
            # matching how the decision and citation cells print their columns.
            print(df[col].dropna().tolist())

## 6. Top Features for Decision Classification (4-class)

In [None]:
# Print every column of the decision-target features file, once per text type.
separator = '=' * 60
for text_type in ['original', 'clean']:
    features_path = output_base / text_type / "decision" / "generated_features.csv"
    if not features_path.exists():
        # Nothing was generated for this text type; move on quietly.
        continue
    df = pd.read_csv(features_path)
    print(f"\n{separator}")
    print(f"Top Features - {text_type.upper()} text (Decision)")
    print(f"{separator}")
    for col in df.columns:
        # NaN entries are dropped before the column is printed as a list.
        feature_list = df[col].dropna().tolist()
        print(f"\nTop 20 {col.upper()} features:")
        print(feature_list)

## 7. Top Features by Citation Quartile

In [None]:
# Print every column of the citation-target features file, once per text type.
# Each column is labelled as a quartile in the output.
separator = '=' * 60
for text_type in ['original', 'clean']:
    features_path = output_base / text_type / "citation" / "generated_features.csv"
    if not features_path.exists():
        # No features file for this text type; skip it without output.
        continue
    df = pd.read_csv(features_path)
    print(f"\n{separator}")
    print(f"Top Features by Citation Quartile - {text_type.upper()} text")
    print(f"{separator}")
    for col in df.columns:
        # NaN entries are dropped before the column is printed as a list.
        quartile_features = df[col].dropna().tolist()
        print(f"\nQuartile {col} (top 15):")
        print(quartile_features)

## 8. Comparison: Original vs Clean Text

In [None]:
# Compare F1 between text types: one row per target, one column per text type.
if all_results and len(combined) > 0:
    # Check for the needed columns *before* pivoting. Pivoting on a missing
    # 'f1' column raises KeyError, which happens when none of the loaded
    # result files provides that column.
    required_columns = {'target', 'text_type', 'f1'}
    if required_columns.issubset(combined.columns):
        # NOTE: DataFrame.pivot raises ValueError if the same
        # (target, text_type) pair appears in more than one row.
        comparison = combined.pivot(index='target', columns='text_type', values='f1')
        print("F1 Score Comparison: Original vs Clean")
        print("="*40)
        display(comparison)

        # Bar chart: leave out targets that have no F1 value for any text
        # type, so they do not show up as empty groups on the x axis.
        plottable = comparison.dropna(how='all')
        if len(plottable) > 0:
            fig, ax = plt.subplots(figsize=(8, 5))
            plottable.plot(kind='bar', ax=ax, rot=0)
            ax.set_title('F1 Score: Original vs Clean Text')
            ax.set_ylabel('F1 Score')
            ax.set_xlabel('Target')
            ax.legend(title='Text Type')
            fig.tight_layout()
            plt.show()
    else:
        print("No F1 scores available to compare.")