## Loading the data


In [2]:
# Bring in the project's data-loading module under a short alias.
import importlib
import src.data_loader as dl

# Force a re-import so that edits made to the src.data_loader module since the
# kernel started are picked up without restarting.
importlib.reload(dl)

# Read the raw Titanic CSV. The relative path is resolved against the kernel's
# working directory, which is expected to be the project root.
df = dl.load_df("data/raw/Titanic-Dataset.csv")

# Show the first rows as a quick sanity check of the load.
preview = df.head()
print(preview)


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [None]:
# Summarize the loaded DataFrame with the data loader's helper and print the
# result. Relies on `dl` and `df` defined by the data-loading cell above.
# NOTE(review): judging by the recorded output, the summary is a text report
# (shape, missing values, dtypes, per-column stats) — confirm against
# src.data_loader.summarize_df.
summary = dl.summarize_df(df)
print(summary)


DATASET SUMMARY
Shape: 891 rows × 12 columns
Memory usage: 0.31 MB

MISSING VALUES:
   Age: 177 (19.9%)
   Cabin: 687 (77.1%)
   Embarked: 2 (0.2%)

DATA TYPES:
   int64: 5 columns
   object: 5 columns
   float64: 2 columns

NUMERIC COLUMNS:
   PassengerId:
      Mean: 446.00
      Std: 257.35
      Min: 1.00
      Max: 891.00
   Survived:
      Mean: 0.38
      Std: 0.49
      Min: 0.00
      Max: 1.00
   Pclass:
      Mean: 2.31
      Std: 0.84
      Min: 1.00
      Max: 3.00
   Age:
      Mean: 29.70
      Std: 14.53
      Min: 0.42
      Max: 80.00
   SibSp:
      Mean: 0.52
      Std: 1.10
      Min: 0.00
      Max: 8.00
   Parch:
      Mean: 0.38
      Std: 0.81
      Min: 0.00
      Max: 6.00
   Fare:
      Mean: 32.20
      Std: 49.69
      Min: 0.00
      Max: 512.33

CATEGORICAL COLUMNS:
   Name: 891 unique values, most common: 'Dooley, Mr. Patrick'
   Sex: 2 unique values, most common: 'male'
   Ticket: 681 unique values, most common: '347082'
   Cabin: 147 unique values, m

## Testing embedding generator


In [5]:
# Smoke-test the embedding generator: one text, a small batch, and a pairwise
# similarity. `importlib` is expected to be in scope from the data-loading cell.
import src.embeddings as embeddings
importlib.reload(embeddings)

generator = embeddings.EmbeddingGenerator()

# --- Single text -----------------------------------------------------------
test_text = "This is a test sentence for embedding generation."
embedding = generator.generate_embedding(test_text)
print(f"Single embedding shape: {embedding.shape}")

# --- Batch of texts --------------------------------------------------------
test_texts = ["First test sentence", "Second test sentence", "Third test sentence"]
batch_embeddings = generator.generate_embeddings_batch(test_texts)
print(f"Batch embeddings shape: {batch_embeddings.shape}")

# --- Similarity between the first two batch entries -------------------------
first_vec = batch_embeddings[0]
second_vec = batch_embeddings[1]
sim = generator.similarity(first_vec, second_vec)
print(f"Similarity between first two embeddings: {sim:.4f}")

INFO:src.embeddings:Loading embedding model: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:src.embeddings:Model loaded successfully. Embedding dimension: 384


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:src.embeddings:Generating embeddings for 3 texts...


Single embedding shape: (384,)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:src.embeddings:Generated 3 embeddings with dimension 384


Batch embeddings shape: (3, 384)
Similarity between first two embeddings: 0.8852


## Testing vector store and similarity search


In [None]:
# Smoke-test the vector store: build an index, add a handful of vectors with
# metadata, run a nearest-neighbour query, and print the store statistics.
import src.vector_store as vs
import numpy as np

dimension = 384  # all-MiniLM-L6-v2 dimension

# Create vector store
store = vs.VectorStore(dimension=dimension, index_type="flat", mapping_type="json")

# Create some test vectors.
# A seeded Generator keeps the vectors (and therefore the printed distances)
# identical across Restart & Run All; the unseeded global RNG did not.
rng = np.random.default_rng(42)
test_vectors = rng.random((5, dimension), dtype=np.float32)
test_metadata = [
    {"text": "First document", "category": "A"},
    {"text": "Second document", "category": "B"},
    {"text": "Third document", "category": "A"},
    {"text": "Fourth document", "category": "C"},
    {"text": "Fifth document", "category": "B"}
]

# Add vectors
vector_ids = store.add_vectors(test_vectors, test_metadata)
print(f"Added vectors with IDs: {vector_ids}")

# Search for similar vectors
query = test_vectors[0]  # Use first vector as query
# The recorded output of this cell includes this line, so print it to keep the
# code and its output in sync.
print(f"Query vector shape: {query.shape}")
# NOTE(review): the query is one of the stored vectors, so an exact L2 index
# would be expected to return it first with distance ~0 — confirm on re-run.
results = store.search(query, k=3)

print("\nSearch results:")
for result in results:
    print(f"ID: {result['id']}, Distance: {result['distance']:.4f}, "
            f"Similarity: {result['similarity']:.4f}, Metadata: {result['metadata']}")

# Get stats
stats = store.get_stats()
print(f"\nStore stats: {stats}")

INFO:src.vector_store:Created FAISS IndexFlatL2
INFO:src.vector_store:VectorStore initialized with 384D vectors, flat index, json mapping
INFO:src.vector_store:Added 5 vectors to index. Total vectors: 5
INFO:src.vector_store:Found 3 similar vectors


Added vectors with IDs: [0, 1, 2, 3, 4]
Query vector shape: (384,)

Search results:
ID: 3, Distance: 62.7210, Similarity: 0.0157, Metadata: {'text': 'Fourth document', 'category': 'C'}
ID: 4, Distance: 63.2829, Similarity: 0.0156, Metadata: {'text': 'Fifth document', 'category': 'B'}
ID: 1, Distance: 63.9098, Similarity: 0.0154, Metadata: {'text': 'Second document', 'category': 'B'}

Store stats: {'vector_count': 5, 'dimension': 384, 'index_type': 'flat', 'mapping_type': 'json', 'is_trained': True}
