# Load queries and documents and embed them

In [2]:
import pandas as pd
import numpy as np
import fastparquet

# Upper bound on the number of rows drawn from the raw dataset.
NUMBER_OF_SAMPLES = 10000

# Load the raw parquet file; sampling is done BEFORE any filtering.
print("Loading raw data...")
df = pd.read_parquet("../data/ms_marco_train.parquet", engine='fastparquet')

# DataFrame.sample raises ValueError when n exceeds the number of rows and
# replace=False (the default), so cap the request at what is available.
# When the file holds at least NUMBER_OF_SAMPLES rows this is a no-op.
sample_size = min(NUMBER_OF_SAMPLES, len(df))

print(f"📊 Sampling {sample_size:,} samples...")
# Fixed random_state keeps the drawn subset reproducible across runs.
df_sample = df.sample(n=sample_size, random_state=42).copy()
print(f"Sampled: {len(df_sample)} samples")

# Preview of the raw (unsampled) frame, not of df_sample.
print(df.head(10))

# Apply filtering AFTER sampling: keep only rows where every required
# column is present (non-null).
print("Filtering data...")
required_columns = ['query', 'query_id', 'query_type', 'passages']
has_all_required = df_sample[required_columns].notna().all(axis=1)
df_filtered = df_sample[has_all_required].copy()

print(f"After filtering: {len(df_filtered)} samples")
    

Loading raw data...
📊 Sampling 10,000 samples...
Sampled: 10000 samples
                                             answers  \
0  [The immediate impact of the success of the ma...   
1  [Restorative justice that fosters dialogue bet...   
2  [The reasons why Stalin wanted to control East...   
3  [Nails rust in water because water allows the ...   
4    [Depona Ab is a library in Vilhelmina, Sweden.]   
5                               [No Answer Present.]   
6         [$43,746 for the 2014-2015 academic year.]   
7                     [Before the age of 2–4 years.]   
8  [Americans brush for just under the two minute...   
9                           [Yes, funner is a word.]   

                                               query  query_id   query_type  \
0  )what was the immediate impact of the success ...   1185869  DESCRIPTION   
1  _________ justice is designed to repair the ha...   1185868  DESCRIPTION   
2      why did stalin want control of eastern europe   1185854  DESCRIPTIO

In [None]:

# Build the regression target a single time: score_log = log(1 + score).
print("Creating target variable...")
df_filtered['score_log'] = np.log1p(df_filtered['score'])

# Sanity check: recompute the transform and confirm the stored column
# matches it (largest absolute deviation must be effectively zero).
recomputed_target = np.log1p(df_filtered['score'])
test_consistency = (df_filtered['score_log'] - recomputed_target).abs().max()
print(f"Target consistency check: {test_consistency:.10f}")
assert test_consistency < 1e-10, "Target variable inconsistency detected!"

# Feature engineering is delegated to a helper defined elsewhere.
df_filtered = create_advanced_features(df_filtered)

# Data augmentation is intentionally not performed (it previously caused
# corruption); only a notice is printed.
print("ℹ️ Skipping data augmentation to ensure data consistency")

# Title embeddings: only built when config.TITLE_EMB_DIM is positive.
if config.TITLE_EMB_DIM > 0:
    print("Loading embeddings...")
    word_to_idx, embeddings = load_glove_embeddings()

    if word_to_idx is not None:
        print("Creating title embeddings...")
        # One embedding per row, in the row order of df_filtered['title'].
        per_title_vectors = [
            title_to_embedding(row_title, word_to_idx, embeddings)
            for row_title in df_filtered['title']
        ]
        X_title_embeddings = np.array(per_title_vectors, dtype=np.float32)
    else:
        # Loader returned no vocabulary: fall back to an all-zero matrix
        # of the configured width so downstream shapes still line up.
        print("Using zero embeddings for testing...")
        X_title_embeddings = np.zeros(
            (len(df_filtered), config.TITLE_EMB_DIM),
            dtype=np.float32,
        )
else:
    # Embeddings disabled: keep a zero-width placeholder with one row per sample.
    print("Skipping title embeddings (TITLE_EMB_DIM = 0)")
    X_title_embeddings = np.zeros((len(df_filtered), 0), dtype=np.float32)