In [None]:
    # Data preparation: load the raw parquet file, draw a random sample,
    # filter it, build the log-transformed target, add engineered features,
    # and build the title-embedding matrix.

    # Load and sample data FIRST (filtering happens on the sample, below).
    print("Loading raw data...")
    df = pd.read_parquet(config.DATA_PATH)

    # DataFrame.sample draws without replacement by default and raises
    # ValueError when n exceeds the number of rows, so cap the request at
    # the population size instead of crashing on a small dataset.
    n_samples = min(config.NUMBER_OF_SAMPLES, len(df))
    if n_samples < config.NUMBER_OF_SAMPLES:
        print(
            f"⚠️ Requested {config.NUMBER_OF_SAMPLES:,} samples but only "
            f"{len(df):,} rows are available; using all rows"
        )
    print(f"📊 Sampling {n_samples:,} samples...")
    df_sample = df.sample(n=n_samples, random_state=config.RANDOM_STATE).copy()
    print(f"Sampled: {len(df_sample)} samples")

    # Apply filtering AFTER sampling: keep rows whose score lies inside the
    # configured range and whose title, author ('by') and time are present.
    print("Filtering data...")
    df_filtered = df_sample[
        (df_sample['score'] >= config.MINIMUM_SCORE) &
        (df_sample['score'] <= config.MAXIMUM_SCORE) &
        (df_sample['title'].notna()) &
        (df_sample['by'].notna()) &
        (df_sample['time'].notna())
    ].copy()
    print(f"After filtering: {len(df_filtered)} samples")

    # Fail early with a clear message when nothing survives the filter.
    # Otherwise the consistency check below takes max() of an empty Series
    # (NaN) and trips its assert with a misleading "inconsistency" message.
    if df_filtered.empty:
        raise ValueError(
            "No samples left after filtering; check MINIMUM_SCORE/"
            "MAXIMUM_SCORE and the sampled data"
        )

    # Create target variable ONCE: log1p of the raw score.
    print("Creating target variable...")
    df_filtered['score_log'] = np.log1p(df_filtered['score'])

    # Verify consistency: the stored target must match a fresh recomputation.
    test_consistency = np.abs(df_filtered['score_log'] - np.log1p(df_filtered['score'])).max()
    print(f"Target consistency check: {test_consistency:.10f}")
    assert test_consistency < 1e-10, "Target variable inconsistency detected!"

    # Create advanced features (helper defined elsewhere in the file).
    df_filtered = create_advanced_features(df_filtered)

    # NO DATA AUGMENTATION - this step was removed because it was causing corruption.
    print("ℹ️ Skipping data augmentation to ensure data consistency")

    # Load embeddings (skip if TITLE_EMB_DIM = 0)
    if config.TITLE_EMB_DIM > 0:
        print("Loading embeddings...")
        word_to_idx, embeddings = load_glove_embeddings()

        if word_to_idx is None:
            # Use zero embeddings if cache not available
            print("Using zero embeddings for testing...")
            X_title_embeddings = np.zeros((len(df_filtered), config.TITLE_EMB_DIM), dtype=np.float32)
        else:
            # One embedding vector per title, stacked row-wise in frame order.
            print("Creating title embeddings...")
            X_title_embeddings = np.array([
                title_to_embedding(title, word_to_idx, embeddings)
                for title in df_filtered['title']
            ], dtype=np.float32)
    else:
        # Zero-width matrix keeps the row count aligned with df_filtered.
        print("Skipping title embeddings (TITLE_EMB_DIM = 0)")
        X_title_embeddings = np.zeros((len(df_filtered), 0), dtype=np.float32)