In [1]:
import logging
import ast
import sqlite3
import pandas as pd
import numpy as np

def load_sqlite_db_to_dataframe(db_path:str) -> pd.DataFrame:
    logging.basicConfig(filename='logs.log',
                        level=logging.INFO,
                        format='%(asctime)s:%(levelname)s:%(message)s')
    try:
        conn = sqlite3.connect(db_path)
        logging.info(f'Established SQLite connection with: {db_path}')
        query = "SELECT * FROM computer_organization_and_design_table"
        df = pd.read_sql_query(query, conn)
        df['embeddings'] = df['embeddings'].apply(ast.literal_eval)
        df['embeddings'] = np.stack(df['embeddings'].to_numpy()).tolist()
        logging.info('Database loaded and embeddings converted')
    except Exception as e:
        logging.error(f"Error loading database: {e}")
        raise e
    finally:
        conn.close()
        logging.info('Database connection closed')
    return df

In [7]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_similarities(df_embeddings, single_embedding):
    # Ensure df_embeddings is a 2D numpy array
    embeddings_array = np.array(df_embeddings.tolist())

    single_embedding_reshaped = np.reshape(single_embedding, (1, -1))
    print(single_embedding_reshaped.shape)

    # Calculate cosine similarity and directly convert to pandas Series.
    similarities = cosine_similarity(embeddings_array, single_embedding_reshaped).flatten()
    return pd.Series(similarities)

In [8]:
from openai import OpenAI

client = OpenAI()

def embed_text(text):
    response = client.embeddings.create(
        input = text,
        model = "text-embedding-3-large"
    )
    print("Embedding created.")
    return response.data[0].embedding

In [9]:
import pandas as pd

def search_by_embeddings(dataframe, user_question, n=5):
    embeded_user_q = embed_text(user_question)
    print("embeded_user_q created.")
    
    similarity_df = calculate_cosine_similarities(dataframe["embeddings"], embeded_user_q)
    print("similarity_df created.")
    
    dataframe['similarities'] = similarity_df
    print("dataframe['similarities'] created.")
    print(dataframe.head(5))
    
    res = dataframe.sort_values('similarities', ascending=False).head(n)
    print(res.head(5))
    return res


In [10]:
import pandas as pd

df = load_sqlite_db_to_dataframe('/Users/jdo/EnBed/EnBed/db/EnBed.sqlite')  # Provide a clear path as an argument
search_query = "How do I design instruction sets for pipelining in MIPS? In this same context, what is the first type of 'hazard'?"
results = search_by_embeddings(df, search_query)
print(results)
print("Done")

Embedding created.
embeded_user_q created.
(1, 3072)
similarity_df created.
dataframe['similarities'] created.
   id                                  guid  \
0   1  a9084a5b-9900-48bb-bc17-69dc6a749d90   
1   2  214be393-5d82-440e-93e0-bda330cd38b7   
2   3  309cedf6-56ae-4f27-bad1-fa8ca679d333   
3   4  b1b7bee4-58e5-47f8-a499-38e9badfc362   
4   5  13ee0133-cd22-4ca8-a554-a1b5f1686290   

                                    author  \
0  David A. Patterson and John L. Hennessy   
1  David A. Patterson and John L. Hennessy   
2  David A. Patterson and John L. Hennessy   
3  David A. Patterson and John L. Hennessy   
4  David A. Patterson and John L. Hennessy   

                                               title  target_page page_range  \
0  Computer Organization and Design: The Hardware...            1      0,1,2   
1  Computer Organization and Design: The Hardware...            2      1,2,3   
2  Computer Organization and Design: The Hardware...            3      2,3,4   
3  Comput

In [13]:
# Assuming 'results' is the DataFrame with the relevant rows already selected
texts = results['text']
print(texts)

for i, text in enumerate(texts, 1):
    authority_info = f"Text {i}: {text}\n"
    print(authority_info)


298    <<<Page 277>>>\n\n4.5 An Overview of Pipelinin...
297    <<<Page 276>>>\nChapter 4 The Processor\nMoreo...
301    <<<Page 280>>>\nChapter 4 The Processor\nThe d...
305    <<<Page 284>>>\nChapter 4 The Processor\nA mor...
299    <<<Page 278>>>\nChapter 4 The Processor\nAs we...
Name: text, dtype: object
Text 1: <<<Page 277>>>

4.5 An Overview of Pipelining 
Pipelining improves performance by increasing instruction throughput, as opposed to decreasing the execution time of an individual instruction, but instruction throughput is the important metric because real programs execute billions of instructions.

Designing Instruction Sets for Pipelining
Even with this simple explanation of pipelining, we can get insight into the design of the MIPS instruction set, which was designed for pipelined execution. First, all MIPS instructions are the same length. This restriction makes it much easier to fetch instructions in the first pipeline stage and to decode them in the second stage. In an

In [14]:
import os
import openai


client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
def question_master(question, augmented_knowledge):
    system_prompt_static = """You are a helpful assistant that answers questions in computer science and software engineering. 
    You always think step-by-step and think for a moment before answering.
    You have the following tasks:
    1. Process and understand the information contained in the 'augmented_knowledge' variable.
        The 'augmented_knowledge' variable is enclosed between the XML tags "<augmented_knowledge>" and "</augmented_knowledge>", 
        and contains the most relevant information from the course textbook.
    2. Consider information derived from the 'augmented_knowledge' variable as completely true and fully accurate.
    3. Read the user's question.
    4. Take the information derived from the 'augmented_knwoledge' and use it to answer the user's question.
    5. Answer the user's question to the best of your ability, unless you couldn't find the answer in 'augmented_knowledge' 
        in which case say "I couldn't find it, would you like me to look it up?".
    Deliverables: 
    1. The most important information that you found in the 'augmented_knowledge' variable.
    2. The answer to the user's question, which may include generating code or a detailed explanation. Many times, it will include answering a multiple choice question.
    """;
    system_prompt_dynamic = "Here is the most relevant information drawn from the course textbook: \n\n" + augmented_knowledge
    system_prompt = system_prompt_static + system_prompt_dynamic
    response = client.chat.completions.create(
        # model="gpt-4",
        model="gpt-4-1106-preview",
        # model="gpt-3.5-turbo-0125",
        temperature=0,
        messages=[
            {
                "role": "system",
                "content": system_prompt
            },
            {
                "role": "user",
                "content": question
            }
        ]
    )
    string_response = str(response.choices[0].message.content)
    return string_response


In [15]:
user_question = f'{search_query}'

In [17]:
answer = question_master(user_question, authority_info)
print(answer)

### Most Important Information from 'augmented_knowledge':

1. **Branch Prediction**: A method of resolving a branch hazard that assumes a given outcome for the branch and proceeds from that assumption rather than waiting to ascertain the actual outcome. Dynamic branch predictors use history to predict future behavior and can achieve over 90% accuracy.

2. **Pipelining**: A technique that exploits parallelism among the instructions in a sequential instruction stream, fundamentally invisible to the programmer. It introduces challenges such as hazards that must be managed to maintain performance.

3. **Delayed Branch**: A solution to control hazards used by the MIPS architecture, where the next sequential instruction is executed, and the branch takes place after a one-instruction delay. This is hidden from the programmer by the assembler rearranging instructions.

4. **Hazards in Pipelining**: There are three types of hazards that can occur in pipelining: structural, data, and control ha

In [ ]:
print(answer)