In [1]:
import json
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm
import spacy
import textstat

In [2]:
# --- Configuration ---
# 1. Define the base paths for your project
BASE_DIR = Path('/clwork/kexin/tsar_st/')
DATA_DIR = BASE_DIR / 'data'
RESULTS_DIR = BASE_DIR / 'results'
OUTPUT_FILE = BASE_DIR / 'proxy_training_data.csv'

# 2. Define the target variable (the real reference score we want to predict)
# Choose between 'bertscore_f1_ref' or 'meaningbert_ref'
TARGET_SCORE_COLUMN = 'meaningbert_ref'

In [3]:
def load_jsonl_to_dict(filepath, key_field, value_field):
    """Loads a .jsonl file into a dictionary for quick lookups."""
    data_dict = {}
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            record = json.loads(line)
            data_dict[record[key_field]] = record[value_field]
    return data_dict

def calculate_features(source_text, candidate_text, nlp_model):
    """Calculates features for a (source, candidate) pair."""
    # Handle potential empty strings to avoid errors
    if not source_text or not candidate_text:
        return {}

    source_doc = nlp_model(source_text)
    candidate_doc = nlp_model(candidate_text)

    features = {
        # Length-based features
        'len_ratio_chars': len(candidate_text) / len(source_text),
        'len_ratio_words': len(candidate_doc) / len(source_doc),
        'abs_len_words': len(candidate_doc),
        
        # Candidate-intrinsic complexity features
        'flesch_reading_ease': textstat.flesch_reading_ease(candidate_text),
        'avg_syl_per_word': textstat.syllable_count(candidate_text, lang='en_US') / max(1, len(candidate_doc)),
        'sentence_count': len(list(candidate_doc.sents)),
    }
    return features

In [4]:
print("🚀 Starting dataset construction...")

# Load the spaCy model once for efficiency
print("Loading spaCy model (this might take a moment)...")
nlp = spacy.load("en_core_web_sm")

# Load the static source documents into a dictionary
print("Loading source documents...")
sources = load_jsonl_to_dict(
    DATA_DIR / 'input/documents.jsonl',
    key_field='text_id',
    value_field='original'
)

all_records = []
run_dirs = [d for d in RESULTS_DIR.iterdir() if d.is_dir()]
print(f"Found {len(run_dirs)} potential experiment directories. Processing...")

for run_dir in tqdm(run_dirs, desc="Processing Runs"):
    simplifications_file = run_dir / 'simplifications.jsonl'
    scores_file = run_dir / 'individual_scores.jsonl'

    if not simplifications_file.exists() or not scores_file.exists():
        continue

    scores_map = {json.loads(line)['text_id']: json.loads(line) for line in open(scores_file, 'r', encoding='utf-8')}
    
    with open(simplifications_file, 'r', encoding='utf-8') as f:
        for line in f:
            candidate_data = json.loads(line)
            text_id = candidate_data['text_id']

            source_text = sources.get(text_id)
            scores = scores_map.get(text_id)
            candidate_text = candidate_data.get('simplified')

            if not all([source_text, scores, candidate_text]):
                continue
            
            target_score = scores.get(TARGET_SCORE_COLUMN)
            if target_score is None:
                continue

            features = calculate_features(source_text, candidate_text, nlp)
            sts_with_source = scores.get('meaningbert_orig')

            record = {
                'text_id': text_id,
                'source': source_text,
                'candidate': candidate_text,
                'target_score': target_score, # Our 'y' variable
                'sts_with_source': sts_with_source, # A powerful feature
                **features
            }
            all_records.append(record)

print(f"\n✅ Processing complete! Total records created: {len(all_records)}")

🚀 Starting dataset construction...
Loading spaCy model (this might take a moment)...
Loading source documents...
Found 41 potential experiment directories. Processing...


Processing Runs:   0%|          | 0/41 [00:00<?, ?it/s]

  'avg_syl_per_word': textstat.syllable_count(candidate_text, lang='en_US') / max(1, len(candidate_doc)),



✅ Processing complete! Total records created: 1601


In [5]:
# Create the final DataFrame
df = pd.DataFrame(all_records)

# Display the first 5 rows
df.head()

# Display summary information about the DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1601 entries, 0 to 1600
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   text_id              1601 non-null   object 
 1   source               1601 non-null   object 
 2   candidate            1601 non-null   object 
 3   target_score         1601 non-null   float64
 4   sts_with_source      1601 non-null   float64
 5   len_ratio_chars      1601 non-null   float64
 6   len_ratio_words      1601 non-null   float64
 7   abs_len_words        1601 non-null   int64  
 8   flesch_reading_ease  1601 non-null   float64
 9   avg_syl_per_word     1601 non-null   float64
 10  sentence_count       1601 non-null   int64  
dtypes: float64(6), int64(2), object(3)
memory usage: 137.7+ KB


In [6]:
df.to_csv(OUTPUT_FILE, index=False)

print(f"✅ Success! Dataset saved to {OUTPUT_FILE}")

✅ Success! Dataset saved to /clwork/kexin/tsar_st/proxy_training_data.csv
