In [2]:
import pandas as pd
import psycopg2
from dotenv import load_dotenv
import os
import boto3

In [2]:
# Load variables from a .env file into the process environment; later cells
# read the database, AWS and Pinecone credentials through os.environ.
load_dotenv()

True

In [3]:
# Catalog query: one row per column of every ordinary table outside the system
# schemas, with the table-level and column-level COMMENT text attached.
#
# Result columns: table_name, table_comment, column_name, column_comment,
# data_type (comments are NULL when no COMMENT has been set).
#
# Notes on the joins:
# * pg_class is matched on BOTH relation name and schema (via pg_namespace).
#   Matching on the name alone cross-joins tables that share a name in
#   different schemas, duplicating rows and attaching the wrong comments.
# * obj_description() is called with the catalog name; the one-argument form
#   is deprecated because OIDs are not unique across system catalogs.
# * pg_description is restricted to pg_class entries for the same reason, and
#   objsubid is the column number for column comments.
QUERY = """
SELECT
  c.table_name,
  obj_description(cls.oid, 'pg_class') AS table_comment,
  c.column_name,
  pgd.description AS column_comment,
  c.data_type
FROM
  information_schema.columns c
JOIN
  pg_catalog.pg_namespace ns ON ns.nspname = c.table_schema
JOIN
  pg_catalog.pg_class cls ON cls.relname = c.table_name
    AND cls.relnamespace = ns.oid
    AND cls.relkind = 'r'
LEFT JOIN
  pg_catalog.pg_description pgd ON pgd.objoid = cls.oid
    AND pgd.classoid = 'pg_catalog.pg_class'::regclass
    AND pgd.objsubid = c.ordinal_position
WHERE
  c.table_schema NOT IN ('pg_catalog', 'information_schema')
ORDER BY
  c.table_schema,
  c.table_name,
  c.ordinal_position;
"""

In [4]:
# Open the PostgreSQL connection. Every setting comes from the environment,
# so a missing variable raises KeyError before any connection is attempted.
conn = psycopg2.connect(
    host=os.environ["DB_HOST"],
    port=os.environ["DB_PORT"],
    dbname=os.environ["DB_NAME"],
    user=os.environ["DB_USER"],
    password=os.environ["DB_PASS"],
)

In [5]:
# Run the catalog query and load one row per (table, column) into a DataFrame.
# NOTE(review): the stray echo of this line in the cell output looks like the
# tail of a pandas warning about passing a raw DBAPI connection - confirm.
df = pd.read_sql_query(QUERY, conn)

  df = pd.read_sql_query(QUERY, conn)


In [6]:
# Preview the first rows to sanity-check the columns returned by QUERY.
df.head()

Unnamed: 0,table_name,table_comment,column_name,column_comment,data_type
0,absenceeventcategorydescriptor,This descriptor describes the type of absence,absenceeventcategorydescriptorid,"A unique identifier used as Primary Key, not d...",integer
1,academichonorcategorydescriptor,A designation of the type of academic distinct...,academichonorcategorydescriptorid,"A unique identifier used as Primary Key, not d...",integer
2,academicsubjectdescriptor,This descriptor holds the description of the c...,academicsubjectdescriptorid,"A unique identifier used as Primary Key, not d...",integer
3,academicweek,This entity represents the academic weeks for ...,schoolid,The identifier assigned to a school. It must b...,bigint
4,academicweek,This entity represents the academic weeks for ...,weekidentifier,The school label for the week.,character varying


In [7]:
# Number of (table, column) rows; each row becomes one embedded document below.
len(df)

4637

In [37]:
# Normalise the text columns used to build the documents and the metadata:
# missing values (e.g. tables or columns without a COMMENT) become empty
# strings and everything is coerced to str.
for text_column in ("table_name", "table_comment", "column_name", "column_comment"):
    df[text_column] = df[text_column].fillna("").astype(str)

In [38]:
# Build the text that gets embedded for each row: four "label: value" lines
# (table name, table description, column name, column description).
table_part = (
    "table_name: " + df['table_name'] + "\n" +
    "table_description: " + df['table_comment']
)
column_part = (
    "column_name: " + df['column_name'] + "\n" +
    "column_description: " + df['column_comment']
)
df['before_vector'] = table_part + "\n" + column_part

In [39]:
# Spot-check one rendered document; print() is used so the embedded newlines
# are shown as line breaks.
print(df.iloc[8]["before_vector"])

table_name: academicweek
table_description: This entity represents the academic weeks for a school year, optionally captured to support analyses.
column_name: discriminator
column_description: 


In [17]:
import cohere

# Settings passed to every co.embed() call further down.
model_id = "cohere.embed-english-v3"
input_type = "search_document"
truncate = "NONE"

# Cohere client that goes through AWS Bedrock; the AWS credentials are read
# from the environment (a missing variable raises KeyError).
co = cohere.BedrockClient(
    aws_region="us-east-1",
    aws_access_key=os.environ["AWS_ACCESS_KEY_ID"],
    aws_secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
    aws_session_token=os.environ["AWS_SESSION_TOKEN"],
)

In [41]:
texts = df["before_vector"].tolist()  # plain Python list, in DataFrame row order, to be sliced into embed batches

In [28]:
import time

In [45]:
# Embed all documents in fixed-size batches, collecting the vectors in input
# order. The short pause after each request spaces out the API calls.
batch_size = 20
all_embeddings = []

for start in range(0, len(texts), batch_size):
    response = co.embed(
        model=model_id,
        input_type=input_type,
        texts=texts[start:start + batch_size],
        truncate=truncate,
    )
    all_embeddings += response.embeddings
    time.sleep(0.5)


In [46]:
# Attach one embedding per row. This relies on all_embeddings having exactly
# len(df) entries in row order (presumably embed() returns vectors in input
# order - confirm).
df["vector"] = all_embeddings

In [47]:
# Vector ids are the DataFrame index rendered as strings.
# NOTE(review): assuming the default positional index from read_sql_query,
# these ids change meaning whenever the query result changes, and they do not
# encode the schema, table or column they belong to - confirm this is acceptable.
df["id"] = df.index.astype(str)

In [48]:
# Columns stored alongside each vector as its metadata payload.
metadata_columns = [
    "table_name",
    "table_comment",
    "column_name",
    "column_comment",
    "data_type",
]
# One dict per row, keyed by column name, in row order.
metadata_records = df[metadata_columns].to_dict(orient="records")
df["metadata"] = metadata_records

In [49]:
# Spot-check the metadata dict built for one row.
print(df.iloc[8]["metadata"])

{'table_name': 'academicweek', 'table_comment': 'This entity represents the academic weeks for a school year, optionally captured to support analyses.', 'column_name': 'discriminator', 'column_comment': '', 'data_type': 'character varying'}


In [50]:
# Assemble the upsert payload: one {"id", "values", "metadata"} record per row.
# The three columns are walked in parallel, so the list keeps DataFrame order.
vectors = [
    {"id": vector_id, "values": embedding, "metadata": metadata}
    for vector_id, embedding, metadata in zip(df["id"], df["vector"], df["metadata"])
]

In [54]:
# Sanity check: length of one embedding vector (presumably this has to match
# the dimension configured on the Pinecone index - confirm).
len(vectors[1]["values"])

1024

In [3]:
from pinecone import Pinecone

# Pinecone client (API key read from the environment) and a handle to the
# index that receives the vectors built above.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index("eduphoric-map")

  from .autonotebook import tqdm as notebook_tqdm


In [57]:
import itertools

def chunks(iterable, batch_size=100):
    """Yield successive lists of at most ``batch_size`` items from ``iterable``.

    Works with any iterable, including one-shot iterators. Every yielded list
    is non-empty; the last one may be shorter than ``batch_size``. Nothing is
    yielded for an empty iterable.
    """
    source = iter(iterable)
    while True:
        batch = list(itertools.islice(source, batch_size))
        if not batch:
            return
        yield batch

# Send the vectors to Pinecone 100 at a time instead of in one request.
for vectors_chunk in chunks(vectors, batch_size=100):
    index.upsert(vectors=vectors_chunk)