# Inspect and preprocess data for data loader

In [2]:
import pyarrow as pa
import pandas as pd


# 1. Inspect data

### Inspect parquet

In [3]:
# Load the MS MARCO training split and preview its schema and first rows.
parquet_path = "/home/bwilliams/mlx/week2/TwoTowerMLRetrieval/data/ms_marco_train.parquet"
df = pd.read_parquet(parquet_path, engine="fastparquet")
print(df.columns)
print(df.head())
# Walk the first row column by column to see the Python type stored in each cell.
row0 = df.iloc[0]
for col in df.columns:
    value = row0[col]
    print(col, type(value), value)

Index(['answers', 'query', 'query_id', 'query_type', 'wellFormedAnswers',
       'passages.is_selected', 'passages.passage_text', 'passages.url'],
      dtype='object')
                                             answers  \
0  [The immediate impact of the success of the ma...   
1  [Restorative justice that fosters dialogue bet...   
2  [The reasons why Stalin wanted to control East...   
3  [Nails rust in water because water allows the ...   
4    [Depona Ab is a library in Vilhelmina, Sweden.]   

                                               query  query_id   query_type  \
0  )what was the immediate impact of the success ...   1185869  DESCRIPTION   
1  _________ justice is designed to repair the ha...   1185868  DESCRIPTION   
2      why did stalin want control of eastern europe   1185854  DESCRIPTION   
3                             why do nails get rusty   1185755  DESCRIPTION   
4                                          depona ab   1184773  DESCRIPTION   

  wellFormedAnswers

So we got:

| Column                  | Type               | Notes                                                |
| ----------------------- | ------------------ | ---------------------------------------------------- |
| `answers`               | list of strings    | (gold answers—not needed for two-tower training)     |
| `query`                 | string             | the raw query text                                   |
| `query_id`              | int                | unique ID per query                                  |
| `query_type`            | string             | e.g. “DESCRIPTION” (probably not used in our model)  |
| `wellFormedAnswers`     | list of strings    | almost always empty                                  |
| `passages.is_selected`  | list of ints (0/1) | length 10: which of the 10 candidates are “positive” |
| `passages.passage_text` | list of strings    | length 10: the candidate passages                    |
| `passages.url`          | list of strings    | length 10: source URLs (not needed for training)     |

Parts we’ll actually use:

- query → tokenise & embed
- passages.passage_text → tokenise & embed
- passages.is_selected → label positives vs negatives
- (optionally) query_id for bookkeeping


### Inspect word to idx pkl file

In [4]:
import pickle
# Load the word -> index vocabulary and report how many entries it holds.
vocab_path = "/home/bwilliams/mlx/week2/TwoTowerMLRetrieval/data/word_to_idx.pkl"
with open(vocab_path, "rb") as vocab_file:
    word_to_idx = pickle.load(vocab_file)
print("Vocab size:", len(word_to_idx))
# Preview the first ten entries in the vocabulary's own iteration order.
for position, word in enumerate(word_to_idx):
    if position == 10:
        break
    print(word, "->", word_to_idx[word])


Vocab size: 400002
the -> 0
, -> 1
. -> 2
of -> 3
to -> 4
and -> 5
in -> 6
a -> 7
" -> 8
's -> 9


### Inspect embedding npy

In [5]:
import numpy as np
# Load the embedding matrix and report its dimensions.
embeddings_file = "/home/bwilliams/mlx/week2/TwoTowerMLRetrieval/data/embeddings.npy"
emb = np.load(embeddings_file)
emb_shape = emb.shape  # e.g. (vocab_size, 200)
print("Embeddings shape:", emb_shape)


Embeddings shape: (400002, 200)


# 2. Make some edits
First we add `<pad>` and `<unk>` tokens to the end of the vocabulary (`word_to_idx`) and the embedding matrix.

The cell below loads the existing `word_to_idx.pkl` and `embeddings.npy`.

It adds `<pad>` and `<unk>` at the end of the vocab.

It creates a zero vector for `<pad>` and the mean of all existing embeddings for `<unk>`.

It appends those two new rows to the embeddings matrix.

It saves the updated vocab and embeddings back to `word_to_idx.pkl` and `embeddings.npy`, overwriting the original files.

In [6]:
import numpy as np
import pickle

# Append <pad> and <unk> to the vocabulary and the embedding matrix, in place.
#
# This cell overwrites the files it reads, so it must be safe to run more than
# once: a re-run that blindly appended would stack two more rows onto the matrix,
# leave the previous <pad>/<unk> rows orphaned, and fold them into the new <unk> mean.

# Paths (overwrite originals)
word_to_idx_path = "/home/bwilliams/mlx/week2/TwoTowerMLRetrieval/data/word_to_idx.pkl"
embeddings_path  = "/home/bwilliams/mlx/week2/TwoTowerMLRetrieval/data/embeddings.npy"

# 1. Load originals
# NOTE: pickle.load can execute arbitrary code; only load a vocab file you trust.
with open(word_to_idx_path, "rb") as f:
  word_to_idx = pickle.load(f)
embeddings = np.load(embeddings_path)

# 2. Detect a previous run: both tokens already map to the final two rows.
num_rows = embeddings.shape[0]
already_appended = (
  num_rows >= 2
  and word_to_idx.get("<pad>") == num_rows - 2
  and word_to_idx.get("<unk>") == num_rows - 1
)

if already_appended:
  # Nothing to do; report the existing state and leave both files untouched.
  pad_idx = word_to_idx["<pad>"]
  unk_idx = word_to_idx["<unk>"]
  updated_embeddings = embeddings
  print(f"<pad> already at {pad_idx}, <unk> already at {unk_idx}; files left unchanged")
else:
  # 3. Determine new indices
  # The new rows land at the current end of the matrix, so the row count (not the
  # vocab size) is what keeps each token's index pointing at its own row.
  pad_idx = num_rows
  unk_idx = pad_idx + 1

  # 4. Add tokens to vocab
  word_to_idx["<pad>"] = pad_idx
  word_to_idx["<unk>"] = unk_idx

  # 5. Build new embeddings: zeros for <pad>, mean of the existing rows for <unk>
  emb_dim = embeddings.shape[1]
  pad_emb = np.zeros((1, emb_dim), dtype=embeddings.dtype)
  unk_emb = embeddings.mean(axis=0, keepdims=True)

  # 6. Append and overwrite
  updated_embeddings = np.vstack([embeddings, pad_emb, unk_emb])
  with open(word_to_idx_path, "wb") as f:
    pickle.dump(word_to_idx, f)
  np.save(embeddings_path, updated_embeddings)

  print(f"Overwritten with <pad> at {pad_idx}, <unk> at {unk_idx}")

print("Updated shape:", updated_embeddings.shape)


Overwritten with <pad> at 400002, <unk> at 400003
Updated shape: (400004, 200)
