In [1]:
import cohere
import pandas as pd
from pinecone import Pinecone
import os

In [2]:
# Path of the input CSV that is handed to return_df below.
# NOTE(review): "GGrand" (double G) looks like a typo — confirm it matches the file name on disk.
input_path = 'vector_version/GGrand Prairie ISD_Dual Credit Dallas College Grades- Initial Mapping.csv'

In [3]:
from attempt3 import return_df

# Load the input file through the project helper; the result is used as a
# pandas DataFrame by the cells that follow.
test = return_df(input_path)

In [4]:
# Show the first 20 rows of the loaded frame.
# NOTE(review): the recorded output shows the same Description text for every
# row, starting with "0 ", which looks like a whole Series was stringified
# inside return_df — verify the helper.
print(test.head(20))

                 Field                                        Description  \
0            PERSON_ID  0           Unique identifier for the student ...   
1              HS_NAME  0           Unique identifier for the student ...   
2                  ISD  0           Unique identifier for the student ...   
3            LAST_NAME  0           Unique identifier for the student ...   
4           FIRST_NAME  0           Unique identifier for the student ...   
5        ADDRESS_LINE1  0           Unique identifier for the student ...   
6        ADDRESS_LINE2  0           Unique identifier for the student ...   
7                 CITY  0           Unique identifier for the student ...   
8             STATE_CD  0           Unique identifier for the student ...   
9                  ZIP  0           Unique identifier for the student ...   
10            BIRTHDAY  0           Unique identifier for the student ...   
11                TERM  0           Unique identifier for the student ...   

In [5]:
# Rows of `test` that have at least one missing value in any column.
# .copy() gives ccmr its own data: later cells assign new columns into ccmr,
# and doing that on a boolean-mask slice of `test` triggers pandas'
# SettingWithCopyWarning and leaves it unclear which frame is being modified.
ccmr = test[test.isna().any(axis=1)].copy()
print(ccmr)

Empty DataFrame
Columns: [Field, Description, edfi_entity, edfi_attribute]
Index: []


In [6]:
# ccmr = pd.read_csv('vector_version/GGrand Prairie ISD_Dual Credit Dallas College Grades- Initial Mapping.csv')

In [7]:
# Preview the first rows of ccmr.
ccmr.head()

Unnamed: 0,Field,Description,edfi_entity,edfi_attribute


In [8]:
# Cohere client that goes through AWS Bedrock in us-east-1.
# Credentials are read from environment variables; os.environ[...] raises
# KeyError right here if any of them is unset.
co = cohere.BedrockClient(
    aws_region="us-east-1",
    aws_access_key=os.environ["AWS_ACCESS_KEY_ID"],
    aws_secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
    aws_session_token=os.environ["AWS_SESSION_TOKEN"],
)

In [9]:
def get_cohere_embedding(texts, cohere_client=None, model_id="cohere.embed-english-v3", input_type="search_query", truncate="NONE"):
    """
    Generate embeddings for a list of texts using a Cohere client.

    Args:
        texts: Sequence of strings to embed.
        cohere_client: Object exposing ``embed(model=, input_type=, texts=, truncate=)``.
            When omitted, the notebook-level ``co`` client is looked up at call
            time, so re-creating ``co`` (for example after refreshing the AWS
            session token) takes effect without re-defining this function.
        model_id: Model identifier forwarded as ``model``.
        input_type: Forwarded unchanged to the client.
        truncate: Forwarded unchanged to the client.

    Returns:
        ``response.embeddings`` from the client, or an empty list when ``texts``
        is empty (no request is sent in that case).
    """
    # Nothing to embed: skip the round trip instead of sending an empty request.
    if hasattr(texts, "__len__") and len(texts) == 0:
        return []

    # Resolve the default lazily; binding `co` in the signature would freeze
    # whichever client object existed when this cell was executed.
    if cohere_client is None:
        cohere_client = co

    response = cohere_client.embed(
        model=model_id,
        input_type=input_type,
        texts=texts,
        truncate=truncate,
    )
    return response.embeddings

In [10]:
def pinecone_search(index, embedding, top_k=3, include_metadata=True, include_values=False, namespace=None):
    """
    Run a similarity query against ``index`` for one embedding.

    Every argument is forwarded to ``index.query`` as a keyword
    (``embedding`` is passed as ``vector``); whatever ``index.query``
    returns is handed back unchanged.
    """
    query_kwargs = {
        "vector": embedding,
        "top_k": top_k,
        "include_metadata": include_metadata,
        "include_values": include_values,
        "namespace": namespace,
    }
    return index.query(**query_kwargs)

In [11]:
# Pinecone connection; the API key comes from the environment (KeyError if unset).
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# Handle to the index that the search cells below query.
index = pc.Index("eduphoric-map")

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Make both text columns plain strings: missing values become "" and every
# remaining value is cast with str, so the concatenation in the next cell
# never meets a NaN.
ccmr[["Field", "Description"]] = ccmr[["Field", "Description"]].fillna("").astype(str)

In [13]:
# Text that gets embedded for each row: the field name and its description,
# each on its own labelled line.
ccmr['before_vector'] = "Field: " + ccmr['Field'] + "\nDescription: " + ccmr['Description']

In [14]:
# ccmr.iloc[5]['before_vector']

In [15]:
# Plain Python list of the per-row texts, in row order; the embedding loop slices it into batches.
texts = ccmr["before_vector"].tolist()

In [16]:
# Number of texts sent per embedding request in the loop below.
batch_size = 20
# Accumulates the embeddings of all batches, in the same order as `texts`.
all_embeddings = []

In [17]:
# Embed `texts` in slices of `batch_size` and store one vector per row of ccmr.
#
# The accumulator is reset here so this cell is idempotent: if it only lived in
# an earlier cell, running this cell a second time would append a second copy
# of every embedding and the column assignment below would fail on the length
# mismatch.
all_embeddings = []

for i in range(0, len(texts), batch_size):
    batch = texts[i:i+batch_size]
    embeddings = get_cohere_embedding(batch, co)
    all_embeddings.extend(embeddings)

# Order is preserved batch by batch, so position k matches row k of ccmr.
ccmr["vector"] = all_embeddings

In [18]:
# ccmr.iloc[8]

In [19]:
def process_pinecone_matches(pinecone_results) -> str:
    """
    Render Pinecone search results as a plain-text block.

    ``pinecone_results`` is an iterable of mappings with a ``"field"`` entry and
    a ``"pinecone_matches"`` iterable; each match must provide ``"score"`` and
    ``"metadata"``. For every match three lines are produced (field + score,
    table name + description, column name + description + data type) followed
    by a blank line. Metadata keys that are absent are rendered as "N/A".
    An input without matches yields the empty string.
    """
    chunks = []
    for entry in pinecone_results:
        source_field = entry["field"]
        for hit in entry["pinecone_matches"]:
            similarity = hit["score"]
            meta = hit["metadata"]
            table, table_desc, column, column_desc, dtype = (
                meta.get(key, "N/A")
                for key in (
                    "table_name",
                    "table_description",
                    "column_name",
                    "column_description",
                    "data_type",
                )
            )
            chunks.append(
                f"Field: {source_field}, Match Score: {similarity}\n"
                f"Table: {table}, Description: {table_desc}\n"
                f"Column: {column}, Description: {column_desc}, Data Type: {dtype}\n\n"
            )
    return "".join(chunks)

In [20]:
from utils import df_to_markdown_table, parse_llm_response_method, results_to_dataframe

In [21]:
import boto3

# Bedrock runtime client used for the converse() calls below.
# The region is pinned because the inference-profile ARN used in this notebook
# lives in us-east-1 (as does the Cohere Bedrock client); without it boto3
# falls back to the ambient default region, or fails if none is configured.
runtime = boto3.client("bedrock-runtime", region_name="us-east-1")

In [22]:
# Map ccmr rows to Ed-Fi entities/attributes with an LLM, a few rows per request.
# For each batch: look up the nearest Pinecone candidates per field, render them
# as text, build the prompt, call Bedrock, and collect the parsed answer.
batch_size = 3

# Reset on every run of this cell so re-running does not duplicate results.
results = []

for i in range(0, len(ccmr), batch_size):
    batch = ccmr.iloc[i:i+batch_size]
    fields = batch["Field"].tolist()
    vectors = batch["vector"].tolist()

    # Top-5 candidates for each field in this batch.
    pinecone_results = []
    for field, vector in zip(fields, vectors):
        query_result = pinecone_search(index, vector, top_k=5, include_metadata=True)
        pinecone_results.append({
            "field": field,
            "pinecone_matches": query_result["matches"]
        })

    findings = process_pinecone_matches(pinecone_results)

    # Keep the embedding vector and the text it was built from out of the
    # prompt table: they are helper columns, and the raw vector would add a
    # long list of floats per row to the prompt.
    prompt_table = df_to_markdown_table(
        batch.drop(columns=["before_vector", "vector"], errors="ignore")
    )

    # The example output below is valid JSON and carries every key the
    # instructions ask for, so the model is not shown a malformed template.
    prompt = f"""
    You are an edfi data standard v5.2 expert. You will be given a table that has a field name, its description, sample, notes. 
    Your job is to map every field to an edfi attribute and edfi entity with its reasoning.
    You will be given some possible matches from the edfi data standard.
    if you are not confident about the match, leave the field as "n/a"

    Create a output list of dicts with keys "field", "edfi_entity", "edfi_attribute", "reasoning".
    The reasoning should be a short explanation of why you chose the edfi entity and attribute.

    Example output format:
    [
        {{"field": "ACT ID", "edfi_entity": "n/a", "edfi_attribute": "n/a", "reasoning": "None of the possible matches clearly corresponds to this identifier."}},
        {{"field": "Last Name", "edfi_entity": "Student", "edfi_attribute": "LastSurname", "reasoning": "The field holds the student's family name."}},
        {{"field": "First Name", "edfi_entity": "Student", "edfi_attribute": "FirstName", "reasoning": "The field holds the student's given name."}}
    ]

    Here is the table:
    {prompt_table}

    Here are the possible matches:
    {findings}
    """

    response = runtime.converse(
        modelId="arn:aws:bedrock:us-east-1:654654390449:application-inference-profile/xjq2nc32nzby",
        messages=[
            {"role": "user", "content": [{"text": prompt}]}
        ]
    )
    # First content item of the reply carries the model's text answer.
    parsed_response = parse_llm_response_method(response["output"]["message"]["content"][0]["text"])
    results.extend(parsed_response)

In [23]:
# Turn the collected LLM answers into a DataFrame via the project helper,
# passing ccmr alongside them.
out = results_to_dataframe(results, ccmr)

In [24]:
# Remove rows with NaN from df
df_no_nan = test.dropna()

# Combine with the cleaned ccmr
final_df = pd.concat([df_no_nan, ccmr], ignore_index=True)

In [25]:
# Write the combined frame to disk; index=False leaves the row index out of the file.
final_df.to_csv('vector_version/ensemble.csv', index=False)