<a href="https://colab.research.google.com/github/hellomikelo/hackathon-cohere-qdrant/blob/dev-prototype/bot_development.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Discord bot development 

This notebook goes through the steps to 
1. Create Discord chat text embeddings using Cohere Embed API
2. Set up a vector search engine using Qdrant Cloud

References: 
* Notebook for [Cohere and Qdrant Multilingual Semantic Search Hackathon](https://lablab.ai/event/multilingual-semantic-search-hackathon). 
* Sampled from [Question Answering as a Service with Cohere and Qdrant](https://qdrant.tech/articles/qa-with-cohere-and-qdrant/). 
* Also see [Neural Search Tutorial](https://qdrant.tech/articles/neural-search-tutorial/).
* [Qdrant quickstart](https://qdrant.tech/documentation/quick_start)

# Set up env and run demo

In [38]:
!pip install -q -U discord.py datasets qdrant_client=="0.11.0" cohere python-dotenv

In [39]:
from google.colab import drive
from dotenv import load_dotenv
import os
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [40]:
!cp /content/drive/MyDrive/env/vars.env /content/.env

In [41]:
load_dotenv(override=True)

True

## Build vector search engine

In [42]:
from qdrant_client import QdrantClient
from qdrant_client import models
from qdrant_client.http import models as rest
import cohere
import os
import pandas as pd

In [43]:
CHAT_HISTORY_PATH = '/content/drive/MyDrive/career/projects/hackathons/lablab-cohere-qdrant-hackathon/discord-chat-history.csv'
QDRANT_CLOUD_HOST = "19531f2c-0717-4706-ac90-bd8dd1a6b0cc.us-east-1-0.aws.cloud.qdrant.io"
QDRANT_COLLECTION_NAME = 'discord'

Create function to get channel chat history and save the messages as a CSV. This uses the [Message components](https://discordpy.readthedocs.io/en/stable/api.html#message) .

In [None]:
# %%writefile gethistory.py
import discord
import pandas as pd
import os

@client.event
async def on_message(message):
    if message.author == client.user:
        return
    elif message.content.startswith('_'):
        cmd = message.content.split()[0].replace("_","")
        if len(message.content.split()) > 1:
            parameters = message.content.split()[1:]

        if cmd == 'scan':

            data = pd.DataFrame(columns=['content', 'time', 'author', 'jump_url'])
            
            def is_command(msg): # Checking if the message is a command call
                if len(msg.content) == 0:
                    return False
                elif msg.content.split()[0] == '_scan':
                    return True
                else:
                    return False

            # Set the limit to 10000 msgs
            async for msg in message.channel.history(limit=10000): 
                if msg.author != client.user:                        
                    if not is_command(msg):                          
                        data = data.append({'content': msg.content,
                                            'time': msg.created_at,
                                            'author': msg.author.name,
                                            'jump_url': msg.jump_url,
                                            }, ignore_index=True)
                    # if len(data) == limit:
                    #    break
            
            file_location = "data.csv" # Set the string to where you want the file to be saved to
            data.to_csv(file_location)
            print(f'Chat history saved to {file_location}')

intents = discord.Intents.default()
intents.message_content = True

client = discord.Client(intents=intents)
guild = discord.Guild

client.run(os.getenv('DISCORD_TOKEN'))

Writing gethistory.py


In [70]:
df = pd.read_csv(CHAT_HISTORY_PATH, index_col=0)
df.head()

Unnamed: 0,content,time,author,jump_url
0,There are many cultural differences between th...,2023-03-14 05:11:57.864000+00:00,OpenAI GPT-3,https://discord.com/channels/10848649878870549...
1,<@1058008641959112796> what are some major cul...,2023-03-14 05:11:57.456000+00:00,likemo,https://discord.com/channels/10848649878870549...
2,抱歉，我并不会说中文。如果您有任何关于我的问题或命令的问题，请加入支持服务器：https:/...,2023-03-14 05:11:36.756000+00:00,OpenAI GPT-3,https://discord.com/channels/10848649878870549...
3,,2023-03-14 05:11:36.454000+00:00,OpenAI GPT-3,https://discord.com/channels/10848649878870549...
4,<@1058008641959112796> what are some major cul...,2023-03-14 05:11:35.993000+00:00,likemo,https://discord.com/channels/10848649878870549...


In [106]:
def ingest_chat():
    pass

    
def clean_chat(df):
    """Clean chat history to keep only alphanums and Han Ideographs."""
    _df = df.copy()
    _df['content'] = (_df['content']
                      .str.replace('[^a-zA-Z\u4E00-\u9FFF\s]', '', regex=True)
                      .str.replace('(http\w+|\n)', '', regex=True)
                      .str.lower()
                      .str.strip()
                      .fillna('')
                      )
    return _df

In [107]:
dataset = clean_chat(df)
# df.content[85]

In [169]:
cohere_client = cohere.Client(os.getenv('COHERE_API_KEY'))
# cohere_client

# Embed chat messages
embeddings = cohere_client.embed(
    texts=dataset.content.tolist(),
    # model="large",2
    model='multilingual-22-12',
)

# 
vector_size = len(embeddings.embeddings[0])
vector_size

768

In [170]:
vectors = [list(map(float, vector)) for vector in embeddings.embeddings]
# ids = dataset.jump_url.tolist()
# TODO: Make better IDs
ids = dataset.index.tolist()

In [160]:
qdrant_client = QdrantClient(
    host=QDRANT_CLOUD_HOST, 
    prefer_grpc=False,
    api_key=os.getenv('QDRANT_API_KEY'),
)

In [179]:
# Create Qdrant vector database
# Qdrant allows you to combine vectors of the same purpose into collections. 
# Many independent vector collections can exist on one service at the same time.
qdrant_client.recreate_collection(
    collection_name=QDRANT_COLLECTION_NAME,
    vectors_config=models.VectorParams(
        size=vector_size, 
        distance=rest.Distance.DOT # for multilingual model
        # distance=rest.Distance.COSINE # for large model
    ),
)

In [180]:
# Upsert new embeddings into vector search engine
qdrant_client.upsert(
    collection_name=QDRANT_COLLECTION_NAME, 
    points=rest.Batch(
        ids=ids,
        vectors=vectors,
        payloads=dataset.to_dict(orient='records'),
    )
)

# If uploading for the first time
# qdrant_client.upload_collection(
#     collection_name='startups',
#     vectors=vectors,
#     payload=payload,
#     ids=None,  # Vector ids will be assigned automatically
#     batch_size=256  # How many vectors will be uploaded in a single request?
# )

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

## Test search engine

In [181]:
new_embeddings = cohere_client.embed(
    texts=["discussions on horses", "discussions on asian countries", "interesting dog facts"],
    # model="large",
    model='multilingual-22-12',
)

In [182]:
results = []
k_max = 5

new_vectors = [list(map(float, vector)) for vector in new_embeddings.embeddings]

for embedding in new_vectors:
    response = qdrant_client.search(
        collection_name=QDRANT_COLLECTION_NAME,
        query_vector=embedding,
        limit=k_max,
    )
    results.append([record.payload['content'] for record in response])
results

## Example

In [91]:
from datasets import load_dataset

In [92]:
dataset2 = load_dataset("pubmed_qa", "pqa_labeled")

Downloading builder script:   0%|          | 0.00/11.1k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/12.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/4.58k [00:00<?, ?B/s]

Downloading and preparing dataset pubmed_qa/pqa_labeled to /root/.cache/huggingface/datasets/pubmed_qa/pqa_labeled/1.0.0/dd4c39f031a958c7e782595fa4dd1b1330484e8bbadd4d9212e5046f27e68924...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/709k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/152M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/533M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset pubmed_qa downloaded and prepared to /root/.cache/huggingface/datasets/pubmed_qa/pqa_labeled/1.0.0/dd4c39f031a958c7e782595fa4dd1b1330484e8bbadd4d9212e5046f27e68924. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [110]:
# list(dataset2['train'])
dataset2['train'].to_pandas()

Unnamed: 0,pubid,question,context,long_answer,final_decision
0,21645374,Do mitochondria play a role in remodelling lac...,{'contexts': ['Programmed cell death (PCD) is ...,Results depicted mitochondrial dynamics in viv...,yes
1,16418930,Landolt C and snellen e acuity: differences in...,{'contexts': ['Assessment of visual acuity dep...,"Using the charts described, there was only a s...",no
2,9488747,"Syncope during bathing in infants, a pediatric...",{'contexts': ['Apparent life-threatening event...,"""Aquagenic maladies"" could be a pediatric form...",yes
3,17208539,Are the long-term results of the transanal pul...,{'contexts': ['The transanal endorectal pull-t...,Our long-term study showed significantly bette...,no
4,10808977,Can tailored interventions increase mammograph...,{'contexts': ['Telephone counseling and tailor...,The effects of the intervention were most pron...,yes
...,...,...,...,...,...
995,8921484,Does gestational age misclassification explain...,"{'contexts': ['After 34 weeks gestation, summa...",Gestational age misclassification is an unlike...,no
996,16564683,Is there any interest to perform ultrasonograp...,{'contexts': ['To evaluate the accuracy of ult...,Sonography has no place in the diagnosis of un...,no
997,23147106,Is peak concentration needed in therapeutic dr...,{'contexts': ['We analyzed the pharmacokinetic...,These results suggest little need to use peak ...,no
998,21550158,Can autologous platelet-rich plasma gel enhanc...,{'contexts': ['This investigation assesses the...,"The PRP group recorded reduced pain, swelling,...",yes


In [None]:
import cohere

In [None]:
cohere_client = cohere.Client(os.getenv('COHERE_API_KEY'))
cohere_client

<cohere.client.Client at 0x7f01c16bae20>

In [None]:
embeddings = cohere_client.embed(
    texts=["A test sentence"],
    model="large",
)
vector_size = len(embeddings.embeddings[0])
vector_size

4096

In [None]:
from qdrant_client import QdrantClient
from qdrant_client import models
from qdrant_client.http import models as rest

In [None]:
qdrant_client = QdrantClient(
    host="19531f2c-0717-4706-ac90-bd8dd1a6b0cc.us-east-1-0.aws.cloud.qdrant.io", 
    prefer_grpc=True,
    api_key=os.getenv('QDRANT_API_KEY'),
)

In [177]:
# Create Qdrant vector database
# Qdrant allows you to combine vectors of the same purpose into collections. Many independent vector collections can exist on one service at the same time.
qdrant_client.recreate_collection(
    collection_name="pubmed_qa",
    vectors_config=models.VectorParams(
        size=vector_size, 
        distance=rest.Distance.COSINE # for large model
    ),
)



In [178]:
# Embed strings using Cohere
answer_response = cohere_client.embed(
    texts=dataset["train"]["long_answer"],
    model="large",
)
vectors = [list(map(float, vector)) for vector in answer_response.embeddings]
ids = [entry["pubid"] for entry in dataset["train"]]

KeyError: ignored

In [None]:
# Upsert new embeddings into vector search engine
qdrant_client.upsert(
    collection_name="pubmed_qa", 
    points=rest.Batch(
        ids=ids,
        vectors=vectors,
        payloads=list(dataset["train"]),
    )
)

# If uploading for the first time
# qdrant_client.upload_collection(
#     collection_name='startups',
#     vectors=vectors,
#     payload=payload,
#     ids=None,  # Vector ids will be assigned automatically
#     batch_size=256  # How many vectors will be uploaded in a single request?
# )

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [None]:
# Embed new questions
question_response = cohere_client.embed(
    texts=dataset["train"]["question"],
    model="large",
)

In [None]:
from tqdm import tqdm

In [None]:
k_max = 10
answer_positions = []
for embedding, pubid in tqdm(zip(question_response.embeddings, ids)):
    response = qdrant_client.search(
        collection_name="pubmed_qa",
        query_vector=embedding,
        limit=k_max,
    )

    answer_ids = [record.id for record in response]
    if pubid in answer_ids:
        answer_positions.append(answer_ids.index(pubid))
    else:
        answer_positions.append(-1)


for k in range(1, k_max + 1):
    correct_answers = len(
        list(
            filter(lambda x: 0 <= x < k, answer_positions)
        )
    )
    print(f"accuracy@{k} =", correct_answers / len(dataset["train"]))

1000it [01:38, 10.11it/s]

accuracy@1 = 0.877
accuracy@2 = 0.921
accuracy@3 = 0.941
accuracy@4 = 0.95
accuracy@5 = 0.956
accuracy@6 = 0.96
accuracy@7 = 0.964
accuracy@8 = 0.971
accuracy@9 = 0.975
accuracy@10 = 0.976



