### Read .md files

In [4]:
from pathlib import Path 

# Directory holding the markdown files, relative to the notebook's working directory.
data_path = Path("data")

# Path.glob() makes no ordering guarantee (it follows the filesystem's directory
# order), so sort the matches to keep the listing reproducible across machines.
md_files = sorted(data_path.glob("*.md"))
md_files

[WindowsPath('data/An introduction to the vector database LanceDB.md'),
 WindowsPath('data/API trafiklab (1).md'),
 WindowsPath('data/API trafiklab.md'),
 WindowsPath('data/Azure static web app deploy react app.md'),
 WindowsPath('data/Chat with your excel data - xlwings lite (1).md'),
 WindowsPath('data/Chat with your excel data - xlwings lite.md'),
 WindowsPath('data/Course structure for Azure two weeks course.md'),
 WindowsPath('data/Data platform course structure.md'),
 WindowsPath('data/data processing course  structure.md'),
 WindowsPath('data/data storytelling.md'),
 WindowsPath('data/dbt modeling snowflake.md'),
 WindowsPath('data/docker setup windows.md'),
 WindowsPath('data/FastAPI and scikit-learn API connect to streamlit frontend.md'),
 WindowsPath('data/Fastapi CRUD app.md'),
 WindowsPath('data/Hands on regularization.md'),
 WindowsPath('data/How does LLM work_.md'),
 WindowsPath('data/Logistic regression hands on with scikit learn.md'),
 WindowsPath('data/Logistic regress

### Quick checks
- confirm table and columns

In [2]:
import lancedb
from backend.constants import VECTOR_DATABASE_PATH

# Connect to the LanceDB database located at VECTOR_DATABASE_PATH and
# open the "transcripts" table by name.
db = lancedb.connect(uri=VECTOR_DATABASE_PATH)
tbl = db.open_table("transcripts")

# Load the table into a pandas DataFrame and list its columns.
df = tbl.to_pandas()
print(df.columns)

Index(['md_id', 'filepath', 'filename', 'content', 'embedding'], dtype='object')


In [3]:
# Report the table size as a (rows, columns) tuple.
row_count, column_count = df.shape
print((row_count, column_count))

(53, 5)


In [4]:
# Transpose the first record so that each column is printed on its own line.
first_record = df.head(1)
print(first_record.T)

                                                           0
md_id         An introduction to the vector database LanceDB
filepath   C:\Users\Katrin\Documents\github\yt-rag-assist...
filename      An introduction to the vector database LanceDB
content    # An introduction to the vector database Lance...
embedding  [-0.038686633, 0.0036908067, 0.02178414, -0.07...


#### Check the embedding column and inspect a vector:
- does it exist?
- is it a list/array?
- does the length equal the embedding dim?

In [5]:
# Fetch the vector stored in the "embedding" column for index label 0,
# then print its container type followed by its length.
emb = df["embedding"].loc[0]
print(f"{type(emb)} {len(emb)}")

<class 'numpy.ndarray'> 3072


#### Sanity check: vector norms and non-zero check
- norms > 0 (not all zeros)
- they should be on a roughly similar scale.

In [7]:
import numpy as np
def _to_float_array(vector):
    """Return `vector` as a NumPy array with dtype float."""
    return np.array(vector, dtype=float)


# Convert each stored embedding to a float array, then take the norm of each one
# (np.linalg.norm defaults to the Euclidean / L2 norm for 1-D input).
embs = df["embedding"].apply(_to_float_array)
norms = embs.apply(np.linalg.norm)

# Summarise the spread of the norms with three statistics.
norm_min, norm_median, norm_max = norms.min(), norms.median(), norms.max()
print("min, median, max norm:", norm_min, norm_median, norm_max)

min, median, max norm: 0.9999998880146814 1.0000000087852439 1.0000001546808057
