In [6]:
%pip install replicate chromadb==0.4.15



In [7]:
import json
import os

import chromadb
import pandas as pd
import replicate
from replicate import Client

# Create Dataset in Replicate

Following this tutorial: https://replicate.com/blog/how-to-use-rag-with-chromadb-and-mistral-7b-instruct

And using this CSV file: CSV_FILE = "https://gist.githubusercontent.com/jacKlinc/8b71c404c9f8545d33daf20bb4555fd3/raw/ce55f5eb33c32bac73d768ba5193ea5c4cc6aef8/all-bellingcat-articles.csv"

**Plan**
1. Load CSV file from GitHub
2. Convert to JSON format
3. Upload to Replicate



In [8]:
# Read the Replicate API token from the environment instead of hardcoding it:
# anything written in a notebook cell is visible to everyone who can read the file.
# NOTE(review): the token that was previously committed here should be revoked.
API_TOKEN = os.getenv("REPLICATE_API_TOKEN")
# Name of the Chroma collection that stores the article-title embeddings.
COLLECTION_NAME = "all-bellingcat-articles"

## Load CSV

In [9]:
CSV_FILE = "https://gist.githubusercontent.com/jacKlinc/8b71c404c9f8545d33daf20bb4555fd3/raw/ce55f5eb33c32bac73d768ba5193ea5c4cc6aef8/all-bellingcat-articles.csv"

# Keep only the columns used downstream. Selecting them explicitly makes a
# separate in-place drop of the other columns unnecessary, and .copy() yields an
# independent frame so that adding "id" below is not an assignment on a slice
# (which can trigger pandas' SettingWithCopyWarning).
df = pd.read_csv(CSV_FILE)[["publish_date", "title", "url", "articles_text"]].copy()
# Use the row index as the identifier of each article.
df["id"] = df.index
df.head(2)

Unnamed: 0,publish_date,title,url,articles_text,id
0,2014-07-31,Did Coulson’s News of the World Incite Others ...,https://www.bellingcat.com/news/uk-and-europe/...,"\n\nMore on the Fake Sheikh, the Police, and N...",0
1,2014-07-30,The Context of Caryatid,https://www.bellingcat.com/news/uk-and-europe/...,- Part 1\n\nOver the past two years my “regul...,1


## Convert to JSON

In [10]:
# Serialise the frame as a JSON array of row objects, then parse that string
# back so each article becomes a plain dict keyed by column name.
records_as_json = df.to_json(orient="records")
bellingcat_json = json.loads(records_as_json)
bellingcat_json[0]

{'publish_date': '2014-07-31',
 'title': 'Did Coulson’s News of the World Incite Others to Commit Crimes and Cause Unsafe Convictions?',
 'url': 'https://www.bellingcat.com/news/uk-and-europe/2014/07/31/did-coulsons-news-of-the-world-incite-others-to-commit-crimes-and-cause-unsafe-convictions/',
 'articles_text': '\n\nMore on the Fake Sheikh, the Police, and News of the World by occasional blogger @jpublik.\n\nAndy Coulson‘s News of the World sent a man to jail after luring him to sell them drugs he was terrified of carrying by promising him a job. He was sentenced to four years in prison before his conviction was quashed – after he’d already served his time.\n\nIn a case which has hardly received any publicity, according to high court documents, Albanian Besnik Qema was asked to supply News of the World cocaine and a passport on a promise of job as security for a wealthy Arab family.\n\nThe High Court documents detail how in January 2005, Mazher Mahmood had asked Florim Gashi, a conta

## Create Collection

In [34]:

# Initialize the chromadb directory, and client.
client = chromadb.PersistentClient(path="./chromadb")
collection = client.get_or_create_collection(name=COLLECTION_NAME)
replicate_client = Client(api_token=API_TOKEN)

# Generate embeddings, and index titles in batches of 250.
batch_size = 250

# Walk the records in consecutive windows of `batch_size`. Slicing clamps at the
# end of the list, so the last batch is simply shorter and no explicit end
# position has to be computed.
for batch_start in range(0, len(bellingcat_json), batch_size):
    batch = bellingcat_json[batch_start : batch_start + batch_size]

    # When storing data in Chromadb, we construct a list of titles, ids, and
    # metadata.
    # NOTE: It is important that each of these lists is the same size, and that
    # each list index position corresponds with the others.
    batch_titles = [story["title"] for story in batch]
    batch_ids = [str(story["id"]) for story in batch]
    batch_metadata = [dict(time=story["publish_date"]) for story in batch]

    # Generate embeddings for the whole batch of titles in one model call.
    batch_embeddings = replicate_client.run(
        "nateraw/bge-large-en-v1.5:9cf9f015a9cb9c61d1a2610659cdac4a4ca222f2d3707a68517b18c198a9add1",
        input={"texts": json.dumps(batch_titles)},
    )

    # Upsert all of the embeddings, ids, metadata, and title strings into Chromadb.
    collection.upsert(
        ids=batch_ids,
        metadatas=batch_metadata,
        documents=batch_titles,
        embeddings=batch_embeddings,
    )

## Query Collection

In [37]:
def query_collection(query: str, collection, n_results: int = 10):
    """Return the documents most similar to ``query`` as one newline-joined string.

    Args:
        query: Free-text search string, passed to the collection as the single
            entry of ``query_texts``.
        collection: Object exposing ``query(query_texts=..., n_results=...)``
            whose result has a ``"documents"`` entry holding one list of
            documents per query text.
        n_results: Maximum number of documents to request. Defaults to 10.

    Returns:
        The documents returned for ``query``, joined with ``"\\n"``.
    """
    # Perform the Chromadb query.
    results = collection.query(query_texts=[query], n_results=n_results)

    # Only one query text was sent, so its matches are the first (and only) entry.
    return "\n".join(results["documents"][0])

# This function will be used to convert the query string to embeddings, so we can
# perform a similarity search against the embedding space.
#
# This is configured to use the bge-large-en-v1.5 embeddings model
def generate_embeddings(texts):
    """Run the bge-large-en-v1.5 model on Replicate for ``texts``.

    ``texts`` is JSON-encoded and sent as the model's ``texts`` input; whatever
    the Replicate client returns for the run is passed back unchanged.
    """
    model_ref = "nateraw/bge-large-en-v1.5:9cf9f015a9cb9c61d1a2610659cdac4a4ca222f2d3707a68517b18c198a9add1"
    payload = {"texts": json.dumps(texts)}
    return replicate_client.run(model_ref, input=payload)

# Re-open the collection with generate_embeddings attached as its embedding
# function, so the query text below is embedded by the same model as the titles.
collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=generate_embeddings,
)
query_collection("where is navalni", collection)

"Update: ROK Jeju Island Naval Base\nRussia's Foreign Fighters: Geolocating the Nepalis Training in the Russian Army\nGrain Trail: Tracking Russia's Ghost Ships with Satellite Imagery\nIran ISOICO Shipyard Imagery Update\nTracking the Nigerian Armed Forces' COIN offensive in North-East Nigeria\nWhere’s Babiš? Geolocating the Alleged Father-Son Kidnapping Mystery\nSamos And The Anatomy Of A Maritime Push-Back\nNew Google Earth Satellite Update Confirms Presence of Buk in Eastern Ukraine\nThe Mysterious Disappearance of Jeannette Island (on Google Maps)\nCivilian Shields in Donetsk: Launching Grads near a Residential Area"

## RAG

The Buk launcher is a missile launcher that was used to shoot down Malaysia Airlines flight MH17 over eastern Ukraine in July 2014. There have been many investigations into the launcher, including the discovery of new images and sightings of it in Ukraine and Russia. Some articles have suggested that Russian claims about the launcher are false, and there have been in-depth analyses of videos and other evidence related to the launcher. Additionally, there have been reports of new launch areas for the launcher in the Luhansk region of Ukraine, and size estimates of missiles displayed in recent North Korean military parades. Finally, there have been reports of a training facility in Eastern Ukraine that uses the Buk launcher.


In [None]:
import json
import os

import streamlit as st
import pandas as pd
from replicate import Client
from dotenv import load_dotenv

# import chromadb

from ..types import Page, Article
from ..constants import CSV_FILE, EMBEDDINGS_MODEL, COLLECTION_NAME

# Run python-dotenv's loader first so the lookup below sees its values.
load_dotenv(override=True)


# Replicate credential taken from the environment; None when the variable is unset.
REPLICATE_API_TOKEN = os.environ.get("REPLICATE_API_TOKEN")


class Bellingcat(Page):
    """Streamlit page that loads the Bellingcat articles and queries Replicate models.

    The Chroma client/collection setup below is currently commented out, so the
    class has no ``collection`` of its own. Methods that need one accept it as
    an argument and otherwise fall back to a ``collection`` attribute set on
    the instance.
    """

    # Initialize the chromadb directory, and client.
    # client = chromadb.PersistentClient(path="./chromadb")
    # collection = client.get_or_create_collection(name=COLLECTION_NAME)
    replicate_client = Client(api_token=REPLICATE_API_TOKEN)

    def write(self):
        """Render the page: title, the downloaded article table, and the first JSON record."""
        st.title(self.__class__.__name__)

        st.write("## Download CSV")
        df = self.download_csv(CSV_FILE)
        st.dataframe(df)

        st.write("## Convert to JSON")
        articles = self.convert_to_json(df)
        st.dataframe(articles[0])

    def download_csv(self, csv_file: str) -> pd.DataFrame:
        """Read the article CSV and keep only the columns used by this page.

        Args:
            csv_file: Path or URL handed to ``pandas.read_csv``.

        Returns:
            A frame with ``publish_date``, ``title``, ``url``, ``articles_text``
            and an ``id`` column holding the row index.
        """
        wanted_columns = ["publish_date", "title", "url", "articles_text"]
        # Selecting the wanted columns makes dropping the others redundant, and
        # .copy() avoids assigning "id" on a slice of the raw frame.
        df = pd.read_csv(csv_file)[wanted_columns].copy()
        df["id"] = df.index
        return df

    def convert_to_json(self, df: pd.DataFrame) -> list[Article]:
        """Convert ``df`` to a list of per-row dicts via a JSON round trip."""
        return json.loads(df.to_json(orient="records"))

    def _resolve_collection(self, collection=None):
        """Return ``collection`` if given, else the instance's ``collection`` attribute.

        Raises:
            AttributeError: If neither source provides a collection.
        """
        resolved = collection if collection is not None else getattr(self, "collection", None)
        if resolved is None:
            raise AttributeError(
                "No Chroma collection available: pass `collection=` or set "
                "`self.collection` before calling this method."
            )
        return resolved

    def create_chroma_collection(
        self, articles: list[Article], batch_size: int = 250, collection=None
    ):
        """Embed article titles in batches and upsert them into a Chroma collection.

        Args:
            articles: Records with ``title``, ``id`` and ``publish_date`` keys.
            batch_size: Number of titles embedded per model call. Must be >= 1.
            collection: Collection to write to. Defaults to ``self.collection``.

        Raises:
            AttributeError: If no collection is available. This is checked before
                any embedding request is made, so no model calls are spent on
                results that could not be stored.
            ValueError: If ``batch_size`` is smaller than 1.
        """
        target = self._resolve_collection(collection)
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        for batch_start in range(0, len(articles), batch_size):
            batch = articles[batch_start : batch_start + batch_size]

            # When storing data in Chromadb, we construct a list of titles, ids, and metadata.
            # The three lists must stay index-aligned with each other.
            batch_titles = [story["title"] for story in batch]
            batch_ids = [str(story["id"]) for story in batch]
            batch_metadata = [dict(time=story["publish_date"]) for story in batch]

            # Generate embeddings for the whole batch of titles in one model call.
            batch_embeddings = self.replicate_client.run(
                EMBEDDINGS_MODEL, input={"texts": json.dumps(batch_titles)}
            )

            # Upsert all of the embeddings, ids, metadata, and title strings into Chromadb.
            target.upsert(
                ids=batch_ids,
                metadatas=batch_metadata,
                documents=batch_titles,
                embeddings=batch_embeddings,
            )

    def query_collection(self, query: str, collection, n_results: int = 10):
        """Return the documents most similar to ``query`` as one newline-joined string.

        Args:
            query: Search text, sent as the single entry of ``query_texts``.
            collection: Collection to query.
            n_results: Maximum number of documents to request. Defaults to 10.
        """
        # Perform the Chromadb query.
        results = collection.query(query_texts=[query], n_results=n_results)

        # Only one query text was sent, so its matches are the first entry.
        return "\n".join(results["documents"][0])

    def query_replicate(self, user_prompt: str, collection=None) -> str:
        """Answer ``user_prompt`` with mistral-7b-instruct, grounded on similar article titles.

        Args:
            user_prompt: The user's question.
            collection: Collection to search. Defaults to ``self.collection``.

        Returns:
            The model output concatenated into a single string.

        Raises:
            AttributeError: If no collection is available.
        """
        # Query Chromadb for the 10 most similar titles to the user prompt.
        relevant_articles = self.query_collection(
            user_prompt, self._resolve_collection(collection)
        )

        # LLM Prompt template.
        # NOTE: The [INST] and [/INST] tags are required for mistral-7b-instruct to leverage instruction fine-tuning.
        prompt_template = f"""[INST]
        You are an expert in all things Bellingcat. Your goal is to give me a summary of the top results. You will be given a USER_PROMPT, and a series of RELEVANT_ARTICLES.

        USER_PROMPT: {user_prompt}

        RELEVANT_ARTICLES: {relevant_articles}

        SUGGESTIONS:

        [/INST]
        """
        MISTRAL_URL = "a16z-infra/mistral-7b-instruct-v0.1:83b6a56e7c828e667f21fd596c338fd4f0039b46bcfa18d973e8e70e455fda70"
        # Prompt the mistral-7b-instruct LLM
        mistral_response = self.replicate_client.run(
            MISTRAL_URL,
            input={
                "prompt": prompt_template,
                "temperature": 0.75,
                "max_new_tokens": 2048,
            },
        )

        # The response is iterated piece by piece; join the pieces into one string.
        return "".join(str(piece) for piece in mistral_response)
