In [None]:
#install necessary packages
!pip install --upgrade pip
!pip install psycopg2 
!pip install pgvector 

In [None]:
#import libraries
import boto3
import quip
import json
import time
import psycopg2
from pgvector.psycopg2 import register_vector
from bs4 import BeautifulSoup
from textwrap import wrap

In [None]:
#setup function to recurse through Quip child objects to find all sub-folders and threads

#maximum number of attempts for a single Quip API call
maxRetries = 5

def getChildren(quipClient, folderId):
    """Recursively collect the ids of all sub-folders and threads below a Quip folder.

    Args:
        quipClient: object exposing ``get_folder(folderId)``; the returned dict is
            read for a ``'children'`` list whose items carry either a
            ``'folder_id'`` or a ``'thread_id'``.
        folderId: id of the folder to walk.

    Returns:
        A ``(folders, threads)`` tuple of two lists of ids, in first-seen order
        and without duplicates (a thread reachable through several folders is
        reported once).

    Raises:
        Whatever ``get_folder`` raised, when the error does not mention "429"
        or when ``maxRetries`` attempts have been used up.
    """
    #base of the exponential backoff, in seconds
    retryDelay = 2

    folders = []
    threads = []

    root = {}

    #needed to handle API rate limiting
    for retry in range(maxRetries):
        try:
            root = quipClient.get_folder(folderId)
        except Exception as e:
            #tolerate exceptions with no args or non-string args instead of
            #masking them with an IndexError/TypeError
            message = str(e.args[0]) if e.args else ""
            if retry < maxRetries - 1 and "429" in message:
                #sleep for the same duration that is announced in the log line
                delay = pow(retryDelay, retry)
                print(f"Error {message}. Waiting {delay} second(s) for retry.")
                time.sleep(delay)
                continue
            else:
                raise
        break

    #a folder without a 'children' entry is treated as empty
    for child in root.get('children') or []:
        if child.get('folder_id') and child.get('folder_id') not in folders:
            folders.append(child.get('folder_id'))
            subFolders, subThreads = getChildren(quipClient, child.get('folder_id'))
            #merge without duplicates: the same folder/thread can be reachable
            #through more than one parent
            for subFolderId in subFolders:
                if subFolderId not in folders:
                    folders.append(subFolderId)
            for subThreadId in subThreads:
                if subThreadId not in threads:
                    threads.append(subThreadId)
        elif child.get('thread_id') and child.get('thread_id') not in threads:
            threads.append(child.get('thread_id'))

    return folders, threads

In [None]:
#create the AWS clients used by later cells: Secrets Manager (credentials)
#and the Bedrock runtime (embedding model invocation), both in us-east-1
secretsManager = boto3.client('secretsmanager', region_name = 'us-east-1')
bedrock_runtime = boto3.client('bedrock-runtime', region_name = 'us-east-1')

In [None]:
#read the 'quip' secret from Secrets Manager and parse its JSON payload
#(later cells read the 'token' and 'url' keys from it)
secretId = 'quip'
secretsManagerResponse = secretsManager.get_secret_value(
    SecretId = secretId
)['SecretString']
secretsManagerSecret = json.loads(secretsManagerResponse)

In [None]:
#id of the Quip folder whose contents will be indexed
rootFolderId = 'b3OrOvuPRH8E'

#build the Quip client from the stored token/url, then fetch the root folder
#(its title is used later when composing the embedding input)
quipClient = quip.QuipClient(
    access_token = secretsManagerSecret.get('token'),
    base_url = secretsManagerSecret.get('url'),
    request_timeout = 100,
)
rootFolder = quipClient.get_folder(rootFolderId)

In [None]:
#recurse through the root folder's child objects to find all sub-folders and threads;
#getChildren returns two lists of ids: (folders, threads)
folders, threads = getChildren(quipClient, rootFolderId)

In [None]:
#read the 'quipsearcher' secret from Secrets Manager and parse its JSON payload
#(the connection cell reads host, port, database, username and password from it)
dbSecretId = 'quipsearcher'
dbSecretsManagerResponse = secretsManager.get_secret_value(
    SecretId = dbSecretId
)['SecretString']
dbSecretsManagerSecret = json.loads(dbSecretsManagerResponse)

In [None]:
#open the PostgreSQL connection; each psycopg2 connect argument is filled
#from the matching key of the database secret
dbClient = psycopg2.connect(**{
    connectArg: dbSecretsManagerSecret.get(secretKey)
    for connectArg, secretKey in {
        'host': 'host',
        'port': 'port',
        'database': 'database',
        'user': 'username',
        'password': 'password',
    }.items()
})

#autocommit: every statement is committed as soon as it is executed
dbClient.set_session(autocommit = True)
dbCursor = dbClient.cursor()

In [None]:
#setup database vector support: make sure the pgvector extension exists in the database
dbCursor.execute("CREATE EXTENSION IF NOT EXISTS vector;")
#register the vector type with this psycopg2 connection
#NOTE(review): presumably this must run after CREATE EXTENSION so the type can be resolved -- confirm
register_vector(dbClient)

In [None]:
#create table and index
#the table is named after the root folder id; dropping it first means every run
#of this notebook rebuilds the table from scratch (existing rows are discarded)
dbCursor.execute(f"DROP TABLE IF EXISTS t_{rootFolderId};")

#one row per (thread, section): identifying thread info, the section text and its embedding
#NOTE(review): vector(1536) presumably matches the output size of the embedding model used below -- confirm
dbCursor.execute(f"""CREATE TABLE IF NOT EXISTS t_{rootFolderId}(
    thread text,
    section int,
    length int,
    title text,
    link text,
    text text,
    embeddings vector(1536),
    PRIMARY KEY(thread, section)
);""")

#GIN index over the 'simple' tsvector of the section text (for full-text search on the text column)
dbCursor.execute(f"""CREATE INDEX ON t_{rootFolderId}
    USING GIN (to_tsvector('simple', text))
;""")

In [None]:
#store embeddings and identifying thread info in table
#maximum number of characters per section sent to the embedding model
maxSectionLen = 1000
#base of the exponential backoff (seconds); must be defined here because the
#variable of the same name inside getChildren is local to that function
retryDelay = 2

for threadId in threads:

    thread = {}

    #needed to handle API rate limiting
    for retry in range(maxRetries):
        try:
            thread = quipClient.get_thread(threadId)
        except Exception as e:
            #tolerate exceptions with no args or non-string args instead of
            #masking them with an IndexError/TypeError
            message = str(e.args[0]) if e.args else ""
            if retry < maxRetries - 1 and "429" in message:
                #sleep for the same duration that is announced in the log line
                delay = pow(retryDelay, retry)
                print(f"Error {message}. Waiting {delay} second(s) for retry.")
                time.sleep(delay)
                continue
            else:
                raise
        break

    #strip the markup and split the plain text into chunks of at most maxSectionLen characters;
    #a thread without an 'html' entry is treated as having no text
    sections = wrap(BeautifulSoup(thread.get('html') or '', 'html.parser').text, maxSectionLen)
    if not sections:
        continue

    threadTitle = thread.get('thread').get('title')
    threadLink = thread.get('thread').get('link')

    for sectionId, section in enumerate(sections):

        #the folder and thread titles are prepended to give the embedding some context
        sectionJson = json.dumps({
            "inputText": f"{rootFolder.get('folder').get('title')} - {threadTitle}\n{section}",
        })

        sectionEmbeddingResponse = bedrock_runtime.invoke_model(
            body = sectionJson,
            modelId = 'amazon.titan-embed-text-v1',
            accept = 'application/json',
            contentType = 'application/json'
        )
        sectionEmbeddingResponseBody = json.loads(sectionEmbeddingResponse['body'].read())
        sectionEmbedding = sectionEmbeddingResponseBody.get('embedding')

        #values are passed as bound parameters so quotes in titles/text are stored
        #verbatim and cannot break (or inject into) the statement; only the table
        #name is interpolated, because identifiers cannot be bound as parameters.
        #the embedding is sent in its '[x, y, ...]' text form, as before
        dbCursor.execute(
            f"""INSERT INTO t_{rootFolderId} (thread, section, length, title, link, text, embeddings)
            VALUES (%s, %s, %s, %s, %s, %s, %s);""",
            (threadId, sectionId, len(section), threadTitle, threadLink, section, str(sectionEmbedding))
        )

In [None]:
#close the database connection: release the cursor first, then the connection it belongs to
dbCursor.close()
dbClient.close()

In [None]:
#display the list of thread ids that were collected from the root folder
threads