In [1]:
%pip install kaleido python-multipart langchain chromadb sentence-transformers

Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-multipart
  Downloading python_multipart-0.0.9-py3-none-any.whl (22 kB)
Collecting langchain
  Downloading langchain-0.1.7-py3-none-any.whl (815 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m815.9/815.9 kB[0m [31m58.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting chromadb
  Downloading chromadb-0.4.22-py3-none-any.whl (509 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m509.0/509.0 kB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence-transformers
  Downloading sentence_transformers-2.3.1-py3-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.8/132.8 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)

In [None]:

import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
from langchain.vectorstores import FAISS, Chroma
from langchain.document_loaders import DirectoryLoader

import chromadb
from google.colab import drive
# Mount the user's Google Drive at /content/drive (requires a Colab runtime).
drive.mount('/content/drive')

# Dataset Preprocessing

The data in its current state cannot be fed to an LLM. It needs to be organised into a prompt format.

In [3]:
# Keep only the columns used downstream, in a fixed order. `usecols` skips the
# unused columns at parse time, so no separate in-place drop is needed.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError; the
# CSV has to be present in the working directory (this relative path is not under
# the mounted Drive). Confirm where the file actually lives.
ARTICLE_COLUMNS = ["publish_date", "title", "url", "articles_text"]
df = pd.read_csv("all-bellingcat-articles.csv", usecols=ARTICLE_COLUMNS)[ARTICLE_COLUMNS]
df.head(2)

FileNotFoundError: [Errno 2] No such file or directory: 'all-bellingcat-articles.csv'

In [None]:
# Print column dtypes and non-null counts for the loaded frame.
df.info()

## Add new columns

1. Article date
2. Article title
3. Article word count
4. Article character count

In [None]:
# Pull the text of the row labelled 590 to use as a worked example below.
article = df.loc[590, "articles_text"]
article

### Word Count


In [None]:
# Count whitespace-separated tokens in the sample article.
word_count = len(article.split())
word_count

In [None]:
# Whitespace-token count per article via the vectorised string accessor; rows with
# missing text yield NaN instead of raising inside a Python-level lambda.
df["word_count"] = df["articles_text"].str.split().str.len()

### Character Count


In [None]:
# Number of characters in the sample article.
len(article)

In [None]:
# Character count per article via the vectorised string accessor; rows with
# missing text yield NaN instead of raising a TypeError from len().
df["character_count"] = df["articles_text"].str.len()

In [None]:
# Display the frame, now including the word_count and character_count columns.
df

## Text Embeddings


💡 **Vector**: an array of numbers.


💡 **Vector Embedding**: a way of representing other data, such as words, in vector form.

💡 **Vector Database**: a relational database is queried based on relations, whereas a vector database is queried by the probabilistic similarity between embeddings. This makes it very fast and more appropriate for AI apps.


- This applies to LLMs because a vector database can be created from custom data.
- This gives the LLM long-term memory: it can retrieve documents from the custom DB.

In [None]:
import os
from google.colab import userdata

# Read the Hugging Face token from the Colab secret store rather than committing
# it to the notebook. Add a secret named HUGGINGFACEHUB_API_TOKEN in the Colab
# "Secrets" panel; any token that was previously pasted here in plain text
# should be treated as leaked and revoked.
HUGGINGFACEHUB_API_TOKEN = userdata.get("HUGGINGFACEHUB_API_TOKEN")

### Init Embedding


In [None]:
def initialize_embeddings(model_name: str = "sentence-transformers/all-mpnet-base-v2"):
    """Create the Hugging Face embedding model used to vectorise the articles.

    Args:
        model_name: Identifier of the sentence-transformers model to load.
            Defaults to the model this notebook has always used.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with ``model_name``.
    """
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    # Report success only after construction has returned, so a failure while
    # building the model is not preceded by a misleading "completed" message.
    print(">>>Embeddings setup completed successfully<<<")
    return embeddings

### Embedding


In [None]:
from dataclasses import dataclass


@dataclass
class ArticleMetaData:
    """Descriptive fields that accompany one article's text."""

    # Publication date, stored as a string.
    publish_date: str
    # Article headline.
    title: str
    # Link to the article.
    url: str


@dataclass
class Document:
    """One article ready for embedding: its text plus a metadata mapping.

    Attributes are documented inline below.
    """

    # Full article text that gets embedded.
    page_content: str
    # Plain mapping of metadata fields. The notebook builds this as a dict
    # literal (publish_date / title / url), so the annotation is `dict` to
    # match what is actually stored here.
    # NOTE(review): presumably the vector store also expects a dict; confirm.
    metadata: dict

In [None]:
def _row_to_document(row):
    """Wrap one DataFrame row as a Document carrying date/title/url metadata."""
    metadata = {
        "publish_date": row["publish_date"],
        "title": row["title"],
        "url": row["url"],
    }
    return Document(row["articles_text"], metadata)


# Row-wise apply: the result is a Series holding one Document per article.
docs = df.apply(_row_to_document, axis=1)
docs[0].metadata

In [None]:
def process_and_embed_docs(docs: list[Document], hf_model):
    """Embed ``docs`` with ``hf_model`` and load them into a Chroma store.

    The documents are handed to ``Chroma.from_documents`` exactly as given;
    no text splitting is performed here, although the progress message
    printed below mentions chunking.

    Args:
        docs: Documents to embed.
        hf_model: Embedding object passed to Chroma as ``embedding``.

    Returns:
        The store returned by ``Chroma.from_documents``.
    """
    vector_store = Chroma.from_documents(embedding=hf_model, documents=docs)
    print(">>>Embedding and chunking process completed successfully<<<")
    return vector_store


def concatenate_documents(document_list, separator=""):
    """Join the ``page_content`` of every document into a single string.

    Args:
        document_list: Iterable of objects exposing a ``page_content`` string.
        separator: Text inserted between consecutive documents. Defaults to
            ``""`` (the original behaviour); pass e.g. ``"\\n\\n"`` to stop the
            last word of one document running into the first word of the next.

    Returns:
        The concatenated text; an empty string when ``document_list`` is empty.
    """
    # A generator avoids materialising an intermediate list just to join it.
    combined_content = separator.join(doc.page_content for doc in document_list)
    print(">>>Few-shot prompting process completed successfully<<<")
    print(">>>Prompt engineering process completed successfully<<<")
    return combined_content

In [None]:
# Build the embedding model, then embed every article into a Chroma store.
hf = initialize_embeddings()

db = process_and_embed_docs(docs, hf)

In [None]:
# Show the repr of the vector store returned by process_and_embed_docs.
db