In [None]:
import os 
import glob
import tiktoken
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import DirectoryLoader,TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import OpenAI



In [None]:
# Read settings from the local .env file; override=True lets values from .env
# replace variables that are already set in the process environment.
load_dotenv(override=True)

# Directory used as the Chroma persist_directory further down.
db_name = "vector_db"
# Model name; also used to select the tokenizer when counting tokens.
Model = "gpt-4.1-nano"

openai_api_key = os.environ.get("OPENAI_API_KEY")
if openai_api_key:
    # Confirm the key was found without ever printing the key itself.
    print("It's there fellas")

openai = OpenAI(api_key=openai_api_key)

In [None]:
# Measure the raw size of the knowledge base: read every Markdown file found
# recursively under the working directory and report character and word totals.
knowledge_base_path = "**/*.md"
files = glob.glob(knowledge_base_path, recursive=True)

file_texts = []
for md_path in files:
    with open(md_path, "r", encoding="utf-8") as handle:
        # Each file is followed by a blank line so adjacent files do not run together.
        file_texts.append(handle.read() + "\n\n")
entire_knowledge_base = "".join(file_texts)

character_total = len(entire_knowledge_base)
word_total = len(entire_knowledge_base.split())
print(f"Total characters in knowledge base: {character_total}")
print(f"Total Words in knowledge base: {word_total}")

In [None]:
# Count how many tokens the whole knowledge base occupies for `Model`.
try:
    encoding = tiktoken.encoding_for_model(Model)
except KeyError:
    # encoding_for_model raises KeyError when the installed tiktoken has no
    # mapping for the model name (e.g. a release older than the model).
    # NOTE(review): o200k_base is assumed to be the encoding of the GPT-4.1
    # family - confirm against the tiktoken model table.
    encoding = tiktoken.get_encoding("o200k_base")

tokens = encoding.encode(entire_knowledge_base)
token_count = len(tokens)
print(f"Total tokens for {Model}:{token_count:,}")


### A quick detour: understanding `TextLoader` and `DirectoryLoader` from `langchain_community.document_loaders`


`DirectoryLoader` takes a folder structure and a loader class that says how each file should be read. `TextLoader` is the most common and simplest one, used for reading plain text and `.md` files.

In [None]:
from langchain_community.document_loaders import TextLoader,PyPDFLoader,DirectoryLoader

# Which loader class understands which file extension.
loader_mapping = {
  ".md":TextLoader,
  ".pdf":PyPDFLoader
}

# Only Markdown files are handed to TextLoader. The previous "**/*.*" pattern
# matched every file in the folder, so binary files such as PDFs would also be
# read as UTF-8 text, which TextLoader cannot decode.
loader = DirectoryLoader(
    "./company/",
    glob="**/*.md",
    loader_cls=loader_mapping[".md"],
    show_progress=True,
    loader_kwargs={"encoding": "utf-8"},
)


In [None]:
# Keep the loaded documents in a variable and show only a count plus a short
# preview; echoing the whole list prints the full text of every file into the
# cell output.
company_docs = loader.load()
print(f"Loaded {len(company_docs)} documents")
company_docs[:2]

In [None]:
# A second loader over the same folder, restricted to PDF files and parsed
# with PyPDFLoader instead of TextLoader.
pdf_loader = DirectoryLoader(path="./company/", glob="**/*.pdf", loader_cls=PyPDFLoader)


In [None]:
# Keep the loaded PDF documents in a variable and show only a count plus a
# short preview; echoing the whole list prints the full text of every PDF
# into the cell output.
pdf_docs = pdf_loader.load()
print(f"Loaded {len(pdf_docs)} PDF documents")
pdf_docs[:2]

### A return to reality

In [None]:
# Every immediate sub-directory of the working directory is treated as one
# document category; `documents` collects what is loaded from them.
documents = []
folders = glob.glob("./*/")
folders


In [None]:
from langchain_community.document_loaders import DirectoryLoader,TextLoader

# Start from an empty list inside this cell so that re-running it does not
# append a second copy of every document.
documents = []

for folder in folders:
    # glob("./*/") returns paths that end with a separator (e.g. "./company/"),
    # so splitting on "/" and taking the last piece gives an empty string on
    # POSIX. normpath strips the trailing separator and the leading "./" on
    # every platform, and basename then yields the bare folder name.
    dir_name = os.path.basename(os.path.normpath(folder))
    print(folder)
    loader = DirectoryLoader(
        path=folder,
        glob="**/*.md",
        loader_cls=TextLoader,
        loader_kwargs={"encoding": "utf-8"},
        recursive=True,
    )

    folder_docs = loader.load()

    for doc in folder_docs:
        # Tag each document with its category (the folder it came from).
        doc.metadata["doc_type"] = dir_name
        documents.append(doc)
print(f"Successfully loaded {len(documents)} documents from {len(folders)} categories.")

In [None]:
# Peek at the first loaded document to check its content and its metadata
# (including the "doc_type" tag).
documents[0]

Now a detour into chunking with `RecursiveCharacterTextSplitter`: there are many ways to separate text into chunks so that each chunk keeps its meaning. (Notes written.)

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the documents into chunks of at most 1000 characters; neighbouring
# chunks share up to 200 characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
chunks = text_splitter.split_documents(documents)

chunk_total = len(chunks)
print(f"Divided into {chunk_total} chunks")
first_chunk = chunks[0]
print(f"First chunk:\n\n{first_chunk}")

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# Embedding model used to vectorise the chunks.
# Alternative kept for reference: HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# If a store already exists on disk, delete its collection so the next cell
# does not add to the results of an earlier run.
if os.path.exists(db_name):
    stale_store = Chroma(persist_directory=db_name, embedding_function=embeddings)
    stale_store.delete_collection()


In [None]:
# Embed every chunk and persist the resulting vectors under `db_name`.
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)
stored_count = vectorstore._collection.count()
print(f"Vectorstore created with {stored_count} documents")

In [None]:
# Inspect the stored vectors: how many there are and how long each one is.
collection = vectorstore._collection
count = collection.count()

# A single record is enough to read the embedding length.
one_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = one_record["embeddings"][0]
dimensions = len(sample_embedding)
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

In [None]:
# Length of one stored embedding. The result gets its own name: assigning it
# to `embeddings` would replace the embedding model object with a plain
# vector, and the cells that build the Chroma store could no longer be re-run.
# limit=1 avoids fetching every vector just to look at the first one.
first_embedding = collection.get(limit=1, include=['embeddings'])['embeddings'][0]
len(first_embedding)

In [None]:
import numpy as np

# Pull every stored vector together with its text and metadata.
result = collection.get(include=['embeddings','documents','metadatas'])
vectors = np.array(result['embeddings'])
# Stored under a separate name so the list of loaded Document objects in
# `documents` is not replaced by plain strings.
document_texts = result['documents']
metadatas = result['metadatas']
doc_types = [metadata['doc_type'] for metadata in metadatas]

# One colour per category. The dictionary lookup falls back to gray for a
# category that is not listed, instead of raising ValueError as list.index did.
color_by_doc_type = {
    'products': 'blue',
    'employees': 'green',
    'contracts': 'red',
    'company': 'orange',
}
colors = [color_by_doc_type.get(t, 'gray') for t in doc_types]


In [None]:
# Echo the `vectors` array (the embeddings fetched from the collection) to inspect it.
vectors

In [None]:
from sklearn.manifold import TSNE
import plotly.graph_objects as go

In [None]:
 
import matplotlib.pyplot as plt

# Reduce the embeddings to two t-SNE components (fixed seed for repeatability)
# and draw a static scatter plot coloured by document type.
tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

x_coords, y_coords = reduced_vectors.T

plt.figure(figsize=(6, 5))
plt.scatter(x_coords, y_coords, c=colors, s=10, alpha=0.7)
plt.title("2D Embedding Plot")
plt.show()



In [None]:
 
import matplotlib.pyplot as plt

# Reduce the embeddings to three t-SNE components (fixed seed for
# repeatability) and draw a static 3D scatter plot coloured by document type.
tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

x_coords, y_coords, z_coords = reduced_vectors.T

fig = plt.figure(figsize=(6, 5))
ax = fig.add_subplot(111, projection="3d")
ax.scatter(x_coords, y_coords, z_coords, c=colors, s=10, alpha=0.7)
ax.set_title("3D Embedding Plot")
plt.show()


In [None]:
import plotly.express as px

# Compute a dedicated 2-component t-SNE for this figure. `reduced_vectors` was
# last assigned by the 3-component t-SNE cell, so plotting its first two
# columns would show a slice of the 3D embedding under a "2D" title.
tsne_2d = TSNE(n_components=2, random_state=42)
reduced_vectors_2d = tsne_2d.fit_transform(vectors)

fig = px.scatter(
    x=reduced_vectors_2d[:, 0],
    y=reduced_vectors_2d[:, 1],
    color=doc_types,
    title="2D Embedding Plot",
)

fig.show()

In [None]:
import plotly.express as px

# Interactive 3D view: three t-SNE components, one colour per document type.
tsne = TSNE(random_state=42, n_components=3)
reduced_vectors = tsne.fit_transform(vectors)

x_axis, y_axis, z_axis = reduced_vectors.T
fig = px.scatter_3d(
    x=x_axis,
    y=y_axis,
    z=z_axis,
    color=doc_types,
    title="3D Embedding Plot",
)

fig.show()

In [None]:
import nbformat

# Report which nbformat release is installed in this kernel's environment.
nbformat_version = nbformat.__version__
print(nbformat_version)


In [None]:
from sklearn.decomposition import PCA

import plotly.express as px

# Three principal components for an interactive 3D view. The names say "3d"
# because this projection has three components; the "_2d" names are left to
# the two-component PCA cell.
pca_3d = PCA(n_components=3, random_state=42)
embeddings_3d = pca_3d.fit_transform(vectors)

fig = px.scatter_3d(
    x=embeddings_3d[:, 0],
    y=embeddings_3d[:, 1],
    z=embeddings_3d[:, 2],
    color=doc_types,
    title="3D Plotting",
)

fig.show()

In [None]:
from sklearn.decomposition import PCA

import plotly.express as px

# Two principal components, shown as an interactive scatter coloured by
# document type.
pca_2d = PCA(random_state=42, n_components=2)
embeddings_2d = pca_2d.fit_transform(vectors)

pc_x, pc_y = embeddings_2d.T
fig_1 = px.scatter(x=pc_x, y=pc_y, color=doc_types, title="2D Plotting")

fig_1.show()

In [None]:
from sklearn.manifold import TSNE
import plotly.express as px

# Two-component t-SNE, shown next to the PCA figure for comparison.
tsn_2 = TSNE(n_components=2, random_state=42)
embeddings_tsne = tsn_2.fit_transform(vectors)

fig = px.scatter(
    x=embeddings_tsne[:, 0],
    y=embeddings_tsne[:, 1],
    color=doc_types,
    title="2D plotting TSNE",
)

# Show the figures as separate statements: writing them as one
# comma-separated expression builds a tuple, which the cell then echoes.
# `fig_1` is the 2D PCA figure created in the preceding cell.
fig_1.show()
fig.show()
