In [1]:
# start by importing some libraries

import sqlite3 # to access the chat log and to save the processed db
import hashlib # we will use this to create a unique ID 

In [3]:
# Connect to the SQLite database containing the raw chat data.
# This example uses a small toy chat log with around 200 interactions.

db_path = 'chat_data.db'
conn = sqlite3.connect(db_path)
try:
    c = conn.cursor()

    # Fetch all records from the `chats` table, oldest first, so that the
    # session splitter below can rely on chronological order.
    c.execute("SELECT rowid, created_time, message, role FROM chats ORDER BY created_time ASC")
    rows = c.fetchall()
finally:
    # Everything needed is now held in `rows`; release the database file
    # even when the query fails.
    conn.close()

# Report exactly one status line (previously an empty table printed both
# "No chat records found." and "found 0 chat records").
if rows:
    print ("found " + str(len(rows)) + " chat records")
else:
    print("No chat records found.")

found 202 chat records


### SESSION SPLITTER
Converting conversational chat sequential data into a suitable format for vectorization and other semantic search or analysis methods can be challenging. One of the primary challenges is dealing with the presence of multiple sessions within the same stream of messages. It's crucial to establish a clear separation between these sessions.

Below, you'll find a function designed to process these messages and split them into sessions. Sessions are defined based on a time interval of N seconds between consecutive messages. In this implementation, sessions are represented as sequences of messages stored within a list. Each element in the list represents one session, and the original message format is preserved, maintaining consistency even after retrieval from the SQLite database.


In [4]:
def group_into_sessions(chat_list, N=7200):
    """Split a chronological list of chat rows into sessions.

    Args:
        chat_list: iterable of 4-field rows ``(rowid, created_time, message, role)``.
            ``created_time`` values are subtracted from each other, so they
            must support ``-`` and comparison with ``N``.
        N: largest gap between two consecutive rows that still keeps them in
            the same session (same unit as ``created_time``).

    Returns:
        A list of sessions; each session is a list holding the original rows
        unchanged. An empty input gives an empty list.
    """
    sessions = []
    last_seen = None

    for record in chat_list:
        # Unpack all four fields so that malformed rows fail loudly here.
        _rowid, stamp, _message, _role = record

        # The very first row always opens a session; afterwards a row stays in
        # the running session only while the gap is at most N.
        continues_session = last_seen is not None and (stamp - last_seen) <= N

        if continues_session:
            sessions[-1].append(record)
        else:
            sessions.append([record])

        last_seen = stamp

    return sessions

# Split the fetched rows into sessions using the default gap threshold (N=7200).
sessions = group_into_sessions(rows)

# Display the first session.
sessions[0]

[(1, 1694016081.9047267, 'hi', 'user'),
 (2,
  1694016082.5320373,
  'Hello! How can I assist you today? Would you like to talk about yourself and share some personal details?',
  'assistant'),
 (3, 1694016086.2061265, 'sure', 'user'),
 (4,
  1694016086.696093,
  "Great! Let's start with some basic information. What is your full name?",
  'assistant'),
 (5, 1694016093.8838253, 'Walter Gonzalez', 'user'),
 (6,
  1694016094.6792994,
  'Thank you, Walter. Can you tell me a bit about your family? Do you have any siblings?',
  'assistant'),
 (7,
  1694016106.904001,
  'one sister I have, yes, Rocio, she lives in Argentina',
  'user'),
 (8,
  1694016107.749361,
  "That's interesting. Do you have a good relationship with your sister? How often do you communicate with her?",
  'assistant'),
 (9, 1694016113.7442048, 'we speak every week', 'user'),
 (10,
  1694016114.5667543,
  "That's nice to hear. Moving on, can you tell me about your occupation? What do you do for a living?",
  'assistant'),


### Chunk creation

**Chunks** represent coherent blocks of text that are treated as single entities for vectorization and analysis.

Through experimentation, I've determined that combining 12 messages (arranged in 4 sets, as explained later) strikes a good balance between the availability of tokens and the semantic richness of a chat log.

I further divide each session into smaller units called **chatBits**, with each chatBit comprising 3 messages. These chatBits are then grouped into sets of 4 to form a **chunk**, resulting in a total of 12 messages (including both bot and user messages) within each chunk.

To ensure the continuity of the conversation, I also introduce an overlap of 3 messages at both ends of each chunk. This overlap ensures that at least one chunk captures the complete flow of the conversation, maintaining context and coherence.

In [10]:
#creation of chunks combining 4 sets of 3 messages, with one set overlap on each end 

def organize_into_chunks(sessions):
    """Cut every session into overlapping chunks of chatbits.

    A *chatbit* is a run of up to 3 consecutive messages. A *chunk* is a list
    of up to 4 consecutive chatbits; consecutive chunks of the same session
    share exactly one chatbit (the last chatbit of one chunk is the first of
    the next). Chunks never span two sessions.

    A trailing chunk with fewer than 3 chatbits is not kept on its own: its
    new chatbits are merged into the previous chunk. Its first chatbit is the
    overlap already present in that previous chunk, so it is skipped when
    merging (it used to be appended a second time, duplicating messages).

    Args:
        sessions: list of sessions, each a list of message rows.

    Returns:
        A flat list of chunks; each chunk is a list of chatbits and each
        chatbit is a list of the original message rows.
    """
    chatbit_size = 3        # messages per chatbit
    chatbits_per_chunk = 4  # chatbits per full chunk
    step = chatbits_per_chunk - 1  # advance by 3 so chunks overlap by one chatbit

    all_chunks = []

    for session in sessions:
        # Step 1: divide the session into chatbits.
        chatbits = [session[i:i + chatbit_size] for i in range(0, len(session), chatbit_size)]

        # Step 2: build the overlapping chunks.
        chunks = []
        for start in range(0, len(chatbits), step):
            chunk = chatbits[start:start + chatbits_per_chunk]
            if len(chunk) < step and chunks:
                # Short tail: chunk[0] is already the last chatbit of the
                # previous chunk, so merge only what comes after it.
                chunks[-1].extend(chunk[1:])
            else:
                chunks.append(chunk)

        # Step 3: collect this session's chunks.
        all_chunks.extend(chunks)

    return all_chunks



# Call the function to organize the sessions into chunks
chunks = organize_into_chunks(sessions)

# let's have a look at the first 2 chunks
print (chunks[0])
print ("--")
print (chunks[1])

[[(1, 1694016081.9047267, 'hi', 'user'), (2, 1694016082.5320373, 'Hello! How can I assist you today? Would you like to talk about yourself and share some personal details?', 'assistant'), (3, 1694016086.2061265, 'sure', 'user')], [(4, 1694016086.696093, "Great! Let's start with some basic information. What is your full name?", 'assistant'), (5, 1694016093.8838253, 'Walter Gonzalez', 'user'), (6, 1694016094.6792994, 'Thank you, Walter. Can you tell me a bit about your family? Do you have any siblings?', 'assistant')], [(7, 1694016106.904001, 'one sister I have, yes, Rocio, she lives in Argentina', 'user'), (8, 1694016107.749361, "That's interesting. Do you have a good relationship with your sister? How often do you communicate with her?", 'assistant'), (9, 1694016113.7442048, 'we speak every week', 'user')], [(10, 1694016114.5667543, "That's nice to hear. Moving on, can you tell me about your occupation? What do you do for a living?", 'assistant'), (11, 1694016169.3995764, 'I am a data 

Once the chunks have been generated, the next step involves creating unified pieces of text. In this process, I include the appropriate names for both the user (Walter) and the bot (Aleph). This step is essential for the vectorization process.

Additionally, I store a timestamp alongside each unified text: the creation time of the chunk's first message. These timestamps serve a dual purpose: first, they allow for easy tracking back to the original chat segment, and second, they establish an ordering criterion for the conversation segments. All of this information is stored in a dictionary per chunk.

In [12]:
def format_chunks(chunks, user_name="User", assistant_name="Assistant"):
    """Render each chunk as one block of ``Speaker: message`` lines.

    Args:
        chunks: list of chunks; each chunk is a list of chatbits and each
            chatbit a list of ``(rowid, created_time, message, role)`` rows.
        user_name: label used for rows whose role equals ``"user"``.
        assistant_name: label used for every other role.

    Returns:
        One dict per chunk with
        ``"text"``: the labelled messages joined by newlines, with leading and
        trailing whitespace stripped, and
        ``"ts"``: the first ``created_time`` in the chunk that is not ``None``
        (``None`` when the chunk has no such row).
    """

    def _render(chunk):
        lines = []
        stamps = []
        for chatbit in chunk:
            for _rowid, created_time, message, role in chatbit:
                stamps.append(created_time)
                if role == "user":
                    speaker = user_name
                else:
                    speaker = assistant_name
                lines.append(f"{speaker}: {message}\n")

        # Earliest usable timestamp in reading order.
        first_stamp = next((stamp for stamp in stamps if stamp is not None), None)

        # strip() drops the final newline (and any other outer whitespace).
        return {"text": "".join(lines).strip(), "ts": first_stamp}

    return [_render(chunk) for chunk in chunks]

# Render every chunk as text, labelling "user" rows as Walter and all other rows as Aleph.
formatted_chunks = format_chunks(chunks, user_name="Walter", assistant_name="Aleph")

# Display the first formatted chunk.
formatted_chunks[0]

{'text': "Walter: hi\nAleph: Hello! How can I assist you today? Would you like to talk about yourself and share some personal details?\nWalter: sure\nAleph: Great! Let's start with some basic information. What is your full name?\nWalter: Walter Gonzalez\nAleph: Thank you, Walter. Can you tell me a bit about your family? Do you have any siblings?\nWalter: one sister I have, yes, Rocio, she lives in Argentina\nAleph: That's interesting. Do you have a good relationship with your sister? How often do you communicate with her?\nWalter: we speak every week\nAleph: That's nice to hear. Moving on, can you tell me about your occupation? What do you do for a living?\nWalter: I am a data scientist\nAleph: That's fascinating! What inspired you to become a data scientist?",
 'ts': 1694016081.9047267}

Although chunks have timestamps, I have created a unique code to prevent any potential ID duplication. To achieve this, I combine the resulting integer from multiplying the timestamp by 1e7 with an MD5 hash of the chunk itself. 
MD5, which stands for "Message Digest Algorithm 5," is a widely used cryptographic hash function. It's designed to take an input (or message) of arbitrary length and produce a fixed-size, 128-bit (16-byte) hash value, often represented as a 32-character hexadecimal number. 

In [13]:
def add_chunk_id(formatted_chunks):
    """Attach a ``'chunk_id'`` entry to every chunk dict, in place.

    The id has the form ``"<int(ts * 1e7)>_<md5 hex digest of text>"``, built
    from the chunk's ``'ts'`` and ``'text'`` entries.

    Args:
        formatted_chunks: iterable of dicts with ``'ts'`` (number) and
            ``'text'`` (str) keys. The dicts are modified in place.

    Returns:
        None.
    """
    for entry in formatted_chunks:
        # Scale the timestamp by 1e7 and truncate to get an integer prefix.
        scaled_ts = int(entry['ts'] * 1e7)

        # Fingerprint of the chunk text (encoded with the default codec).
        digest = hashlib.md5(entry['text'].encode()).hexdigest()

        entry['chunk_id'] = "_".join((str(scaled_ts), digest))

# Add a 'chunk_id' entry to every dict in formatted_chunks (the dicts are modified in place).
add_chunk_id(formatted_chunks)

# Display the first chunk, now including its chunk_id.
formatted_chunks[0]

{'text': "Walter: hi\nAleph: Hello! How can I assist you today? Would you like to talk about yourself and share some personal details?\nWalter: sure\nAleph: Great! Let's start with some basic information. What is your full name?\nWalter: Walter Gonzalez\nAleph: Thank you, Walter. Can you tell me a bit about your family? Do you have any siblings?\nWalter: one sister I have, yes, Rocio, she lives in Argentina\nAleph: That's interesting. Do you have a good relationship with your sister? How often do you communicate with her?\nWalter: we speak every week\nAleph: That's nice to hear. Moving on, can you tell me about your occupation? What do you do for a living?\nWalter: I am a data scientist\nAleph: That's fascinating! What inspired you to become a data scientist?",
 'ts': 1694016081.9047267,
 'chunk_id': '16940160819047268_8c8833ccc24a9a2550745af12e465da8'}

Next, I add a short summary to each chunk. For most semantic applications the summary is sufficient: it has less noise and consumes far fewer tokens. The summaries are generated through the OpenAI API, using a custom system message and a summarizer prompt template.

In [15]:
# init llm azure endpoint

import openai
import json


file_path = 'keys.json'

# Load the Azure OpenAI connection settings from a local JSON file so that
# no credential is hard-coded in the notebook.
with open(file_path, 'r') as json_file:
    data = json.load(json_file)

# .get() leaves a setting as None when its key is absent from the file.
endpoint_name = data.get('ENDPOINT_NAME')
key = data.get('KEY')
deployment_name = data.get('DEPLOYMENT_NAME')

# Show only the non-secret settings. The API key must never be printed:
# cell outputs are stored inside the notebook file and leak when it is shared.
print(endpoint_name, deployment_name)


def init_llm():
    """Point the module-level ``openai`` client at the Azure OpenAI endpoint.

    Reads the notebook globals ``key`` and ``endpoint_name`` and writes them,
    together with the Azure API type and version, onto the ``openai`` module.
    Takes no arguments and returns None.
    """
    openai.api_key = key
    openai.api_base = endpoint_name
    openai.api_type = 'azure'  # Necessary for using the OpenAI library with Azure OpenAI

    # https://learn.microsoft.com/en-US/azure/ai-services/openai/reference
    openai.api_version = '2023-05-15'  # Target version of the API

    

# Apply the API settings to the openai module before any request is made.
init_llm()

# System message for the summarizer. A backslash at the end of a line inside
# the triple-quoted string joins it with the next line, so the whole message
# is a single line of text.
SUMMARIZER_SYS_MSG='''\
Given the conversations between Aleph and Walter, develop a comprehensive summary of Walter's key insights, thoughts, and shared information. \
The aim is to capture a representative 'mind-map' of Walter. \
Accentuate on his main concepts, specific examples, viewpoints, and suggested solutions. \
Remember, this information will be used for semantic searches in our database of summaries, so detail and accuracy are paramount.'''

# User message template; the single `{}` placeholder receives the chunk text
# via str.format(). Explicit \n escapes mark the line breaks that are kept.
# The instructions are spelled correctly ("start and end", "additional
# comments") and a newline separates the conversation from the end marker so
# that the marker is not glued to the last message.
SUMMARIZER_USER_MSG_TEMPLATE='''\
This is the conversation to summarize.\n\
Consider that the conversation may have an arbitrary start and end, and it is a slice of a larger conversation, so you can ignore unanswered questions or comments that make no sense.\n\
Focus on the complete information. Provide a brief summary as text in one sentence, with no additional comments:\
\n
####conversation begins####\n\
{}\n\
####conversation ends####\
'''

def get_summary(text):
    """Ask the chat deployment for a summary of ``text``.

    Sends ``SUMMARIZER_SYS_MSG`` as the system message and
    ``SUMMARIZER_USER_MSG_TEMPLATE`` filled with ``text`` as the user message
    to the deployment named by the global ``deployment_name``.

    Args:
        text: the conversation text to summarize.

    Returns:
        The message content of the first choice in the response.
    """
    user_prompt = SUMMARIZER_USER_MSG_TEMPLATE.format(text)

    request = dict(
        engine=deployment_name,
        messages=[
            {"role": "system", "content": SUMMARIZER_SYS_MSG},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )

    completion = openai.ChatCompletion.create(**request)
    first_choice = completion['choices'][0]
    return first_choice['message']['content']




def summarize_chunks_with_get_summary(formatted_chunks):
    """Add a ``'summary'`` entry to every chunk by calling ``get_summary``.

    For each chunk a short preview (first 120 characters) of the text and of
    the resulting summary is printed as progress output.

    Args:
        formatted_chunks: iterable of dicts with a ``'text'`` key. Each dict
            is modified in place.

    Returns:
        A new list holding the same (now summarized) dict objects, in order.
    """
    collected = []

    for entry in formatted_chunks:
        source_text = entry['text']

        # Progress: preview of the text about to be summarized.
        print ("****")
        print (source_text[:120])

        # One get_summary call per chunk; the result is stored on the chunk
        # itself before the preview is printed.
        entry['summary'] = digest = get_summary(source_text)

        print ("----")
        print (digest[:120])

        collected.append(entry)

    return collected

# Summarize every chunk (get_summary is called once per chunk); the returned list holds the same dict objects as formatted_chunks.
summarized_chunks = summarize_chunks_with_get_summary(formatted_chunks)




https://waltertest.openai.azure.com/ <redacted-api-key> my_deplo
****
Walter: hi
Aleph: Hello! How can I assist you today? Would you like to talk about yourself and share some personal detai
----
Walter Gonzalez is a data scientist who has a good relationship with his sister Rocio, who lives in Argentina, and they 
****
Aleph: That's nice to hear. Moving on, can you tell me about your occupation? What do you do for a living?
Walter: I am 
----
Walter is a data scientist working on AI projects with LLMs, and his goal is to achieve infinite or human-like memory fo
****
Walter: I would like to achieve something like infinite memory or human-like memory for you
Aleph: That's a very ambitio
----
Walter aims to achieve infinite or human-like memory for Aleph, and he also enjoys cooking, particularly BBQ, homemade p
****
Aleph: Sure, no problem. Can you tell me about your personality? How would you describe yourself?
Walter: introverted, m
----
Walter describes himself as introv

In [21]:
# let's have a look at one chunk (index 4) next to its summary

print ("chat chunk:")
sample_chunk = summarized_chunks[4]
print (sample_chunk["text"], "--", "summary:", sep="\n")
print (sample_chunk["summary"])

chat chunk:
Walter: I live in Amsterdam
Aleph: That's great! What do you like most about living in Amsterdam?
Walter: the city is amazing, and dutch people is great
Aleph: That's wonderful to hear. Amsterdam is definitely a beautiful city with a lot of history and culture. Is there anything else you would like to share about your experience living in Amsterdam?
Walter: this year we have a very warm end of summer
Aleph: That's nice to hear. It's always great to enjoy good weather and make the most of the outdoors. Is there anything else I can assist you with today, Walter?
Walter: you keep asking that, but I expect you ask me concrete questions 
Aleph: I apologize if my questions are not specific enough. Is there anything in particular you would like to talk about or any specific topic you would like me to ask you about?
Walter: that does not sound concrete, Aleph
Aleph: You're right, Walter. Let me try to be more specific. Can you tell me about a recent project you worked on as a data 

Now I add embeddings. Each embedding is computed from the chunk's raw text followed by its summary.

In [22]:
def generate_serialized_embeddings(text):
    """Embed ``text`` and return the embedding serialized as a JSON string.

    The request goes to the "embedding-deplo" engine through
    ``openai.Embedding.create``; the embedding of the first returned item is
    passed through ``json.dumps``.
    """
    api_reply = openai.Embedding.create(engine="embedding-deplo", input=text)
    first_item = api_reply["data"][0]
    return json.dumps(first_item["embedding"])

# Iterate through each chunk dictionary in the list
for chunk in summarized_chunks:
    # The embedded text is the raw chunk text followed by its summary, separated by a blank line.
    combined_text = chunk['text'] + "\n\n"+chunk['summary']
    # generate_serialized_embeddings returns a JSON string, so 'embedding' holds text rather than a list.
    embedding = generate_serialized_embeddings(combined_text)
    chunk['embedding'] = embedding



In [26]:
# now the chunks object has everything we need for the memory composite, let's have a look at the 1st one
# notice the embedding vector
summarized_chunks[0]

{'text': "Walter: hi\nAleph: Hello! How can I assist you today? Would you like to talk about yourself and share some personal details?\nWalter: sure\nAleph: Great! Let's start with some basic information. What is your full name?\nWalter: Walter Gonzalez\nAleph: Thank you, Walter. Can you tell me a bit about your family? Do you have any siblings?\nWalter: one sister I have, yes, Rocio, she lives in Argentina\nAleph: That's interesting. Do you have a good relationship with your sister? How often do you communicate with her?\nWalter: we speak every week\nAleph: That's nice to hear. Moving on, can you tell me about your occupation? What do you do for a living?\nWalter: I am a data scientist\nAleph: That's fascinating! What inspired you to become a data scientist?",
 'ts': 1694016081.9047267,
 'chunk_id': '16940160819047268_8c8833ccc24a9a2550745af12e465da8',
 'summary': 'Walter Gonzalez is a data scientist who has a good relationship with his sister Rocio, who lives in Argentina, and they s

This code defines a function called save_to_db that is responsible for saving a list of dictionaries (representing data chunks) into an SQLite database, making it useful for storing and organizing information for later retrieval and analysis.


In [27]:
# save chunks to the composite db

def save_to_db(dicts, db_name="memory_composite.db"):
    """Persist chunk dictionaries into the ``memory`` table of an SQLite file.

    The function can be re-run safely: any row already stored under one of
    the incoming ``chunk_id`` values is removed before the new rows are
    inserted, so running the cell twice does not duplicate chunks.

    Args:
        dicts: iterable of dicts with the keys ``'text'``, ``'ts'``,
            ``'chunk_id'``, ``'summary'`` and ``'embedding'``. The embedding
            may be a JSON-serializable object or a string that is already
            serialized; a string is stored as is rather than being encoded a
            second time.
        db_name: path of the SQLite file (created if it does not exist).

    Raises:
        KeyError: if a dict lacks one of the required keys (nothing is
            written in that case).
        sqlite3.Error: on database failures; pending changes are discarded.
    """
    # Build every row up front so that a malformed dict fails before the
    # database is touched.
    records = []
    for d in dicts:
        embedding = d['embedding']
        if isinstance(embedding, str):
            embedding_json = embedding  # already serialized upstream
        else:
            embedding_json = json.dumps(embedding)
        records.append((d['text'], d['ts'], d['chunk_id'], d['summary'], embedding_json))

    # Connect to the SQLite database (it will be created if it doesn't exist)
    conn = sqlite3.connect(db_name)
    try:
        cursor = conn.cursor()

        # Create the table if it doesn't exist
        cursor.execute("""
        CREATE TABLE IF NOT EXISTS memory (
            text TEXT,
            ts REAL,
            chunk_id TEXT,
            summary TEXT,
            embedding TEXT
        );
        """)
        conn.commit()

        # Replace rows from an earlier run instead of piling up duplicates.
        cursor.executemany("DELETE FROM memory WHERE chunk_id = ?",
                           [(record[2],) for record in records])
        cursor.executemany("INSERT INTO memory (text, ts, chunk_id, summary, embedding) VALUES (?, ?, ?, ?, ?)",
                           records)

        conn.commit()
    finally:
        # Closing without a commit discards a half-written batch.
        conn.close()

# Save the summarized chunks to the default database file (memory_composite.db)
save_to_db(summarized_chunks)

Up to this point we have created a memory composite db with embeddings and texts, both raw conversations and summaries. 
Now I will add an in memory vector store and proceed to show an example of semantic search over the chat corpus.
For this I will use Chroma vector DB.

In [28]:
# first I write a helping function to prepare the data from the memory composite for the vector db 

import sqlite3
import ast

def _decode_embedding(raw):
    """Turn a stored embedding string back into a Python object."""
    import ast
    import json

    def _parse(value):
        try:
            return json.loads(value)
        except ValueError:
            # Not valid JSON (e.g. a Python literal): parse it the way this
            # function historically did.
            return ast.literal_eval(value)

    decoded = _parse(raw)
    # Rows written with a doubly serialized embedding decode to a string
    # first; decode once more in that case.
    if isinstance(decoded, str):
        decoded = _parse(decoded)
    return decoded


def read_from_db(db_name="memory_composite.db"):
    """Load every row of the ``memory`` table as a list of dictionaries.

    Columns are selected by name, so the result does not depend on the
    physical column order of the table.

    Args:
        db_name: path of the SQLite file to read.

    Returns:
        A list of dicts with the keys ``'text'``, ``'ts'``, ``'chunk_id'``,
        ``'summary'`` and ``'embedding'``; the embedding is deserialized
        (singly or doubly encoded values are both accepted).

    Raises:
        sqlite3.Error: if the file has no ``memory`` table or cannot be read.
    """
    keys = ["text", "ts", "chunk_id", "summary", "embedding"]

    conn = sqlite3.connect(db_name)
    try:
        rows = conn.execute("SELECT text, ts, chunk_id, summary, embedding FROM memory").fetchall()
    finally:
        # Release the file even when the query fails.
        conn.close()

    list_of_dicts = [dict(zip(keys, row)) for row in rows]
    for d in list_of_dicts:
        d['embedding'] = _decode_embedding(d['embedding'])

    return list_of_dicts

# Load every stored chunk back from the composite db
list_of_dicts = read_from_db()

# NOTE(review): this displays all rows, including their embeddings; list_of_dicts[0] would keep the output short.
list_of_dicts

[{'text': "Walter: hi\nAleph: Hello! How can I assist you today? Would you like to talk about yourself and share some personal details?\nWalter: sure\nAleph: Great! Let's start with some basic information. What is your full name?\nWalter: Walter Gonzalez\nAleph: Thank you, Walter. Can you tell me a bit about your family? Do you have any siblings?\nWalter: one sister I have, yes, Rocio, she lives in Argentina\nAleph: That's interesting. Do you have a good relationship with your sister? How often do you communicate with her?\nWalter: we speak every week\nAleph: That's nice to hear. Moving on, can you tell me about your occupation? What do you do for a living?\nWalter: I am a data scientist\nAleph: That's fascinating! What inspired you to become a data scientist?",
  'ts': 1694016081.9047267,
  'chunk_id': '16940160819047268_8c8833ccc24a9a2550745af12e465da8',
  'summary': 'Walter Gonzalez is a data scientist who has a good relationship with his sister Rocio, who lives in Argentina, and th

In [31]:
# now I proceed with the creation of the chroma collection
# for reference check https://www.trychroma.com/

import chromadb
from chromadb.utils import embedding_functions

# Chroma client created with PersistentClient's default arguments.
chroma_client = chromadb.PersistentClient()



# Embedding function handed to the collection when it is created below.
# NOTE(review): api_base is a placeholder although the endpoint loaded from
# keys.json is available as `endpoint_name` - presumably that is the intended
# value; confirm before relying on this embedding function.
# NOTE(review): model_name differs from the "embedding-deplo" engine used for
# the stored embeddings - confirm both refer to the same embedding model.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
                api_key=key,
                api_base="xxxxxxxxxx", # placeholder: put your Azure OpenAI endpoint here
                api_type="azure",
                api_version='2023-05-15',
                model_name="text-embedding-ada-002"
            )

def format_for_chromadb(list_of_dicts):
    """Split chunk dictionaries into the four parallel lists passed to Chroma.

    Args:
        list_of_dicts: iterable of dicts with the keys ``'embedding'``,
            ``'text'``, ``'summary'`` and ``'chunk_id'``.

    Returns:
        A tuple ``(embeddings, documents, metadatas, ids)`` of lists in input
        order, where each metadata item is ``{"summary": <summary>}``.
        An empty input gives four empty lists.
    """
    # One tuple per chunk, fields in the order of the returned lists.
    columns_per_chunk = [
        (d['embedding'], d['text'], {"summary": d['summary']}, d['chunk_id'])
        for d in list_of_dicts
    ]

    if not columns_per_chunk:
        return [], [], [], []

    # Transpose rows into columns and turn each column into a list.
    embeddings, documents, metadatas, ids = (list(column) for column in zip(*columns_per_chunk))
    return embeddings, documents, metadatas, ids



embeddings, documents, metadatas, ids = format_for_chromadb(list_of_dicts)

# The client is persistent, so "my_collection" still exists after a kernel
# restart; create_collection would then raise. get_or_create_collection makes
# this cell safe to re-run.
collection = chroma_client.get_or_create_collection(name="my_collection", embedding_function=openai_ef)

# upsert (instead of add) overwrites entries whose id is already stored, so a
# re-run refreshes the collection rather than colliding on existing ids.
collection.upsert(
    embeddings=embeddings,
    documents=documents,
    metadatas=metadatas,
    ids=ids
)


In [32]:
# now let's make a short query - assume we have a topic detection engine that senses that "personal projects" is something you need to expand

query_text = "personal projects"

def generate_serialized_embeddings(text):
    """Embed ``text`` and return the embedding exactly as the API returned it.

    NOTE: despite its name this version does not serialize anything. It
    rebinds the name of the helper defined earlier in the notebook, which
    returned ``json.dumps(...)`` of the same value; after this cell runs, the
    name refers to this unserialized variant.
    """
    api_reply = openai.Embedding.create(engine="embedding-deplo", input=text)
    first_item = api_reply["data"][0]
    return first_item["embedding"]

# Embed the query text ourselves and ask the collection for 2 results.
collection.query(
    query_embeddings=[generate_serialized_embeddings(query_text)],
    n_results=2
)

{'ids': [['16940162191807976_179fd7cccacbf758ae050e4fce75ecff',
   '16940348877817180_506a6a17a9c6cd802f2736f911fc13f0']],
 'distances': [[0.44439717623114705, 0.4467541594999836]],
 'metadatas': [[{'summary': "Walter aims to achieve infinite or human-like memory for Aleph, and he also enjoys cooking, particularly BBQ, homemade pizza, and Middle Eastern dishes, but he doesn't have any favorite recipes or techniques, and he describes himself as introverted."},
   {'summary': 'Walter is a data scientist interested in AI, particularly in generative AI with language models and knowledge mining, and is building a personal chatbot called Aleph that will have virtually infinite memory and will be able to know him personally, but faces challenges of representation and long-term memory, and is exploring using multi-agent systems and graph representations of conversations for dynamic retrieval of information.'}]],
 'embeddings': None,
 'documents': [["Walter: I would like to achieve something li