<a href="https://colab.research.google.com/github/ipeirotis/sql_autograding/blob/main/few_shot_trial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q openai
!pip install -q google-cloud-secret-manager
!pip3 install -U -q PyMySQL sqlalchemy sql_magic

In [None]:
from google.colab import auth

# Login using the account that has access to the Google project
# in order to access the resources for the project
auth.authenticate_user()

In [None]:
from google.cloud import secretmanager


def access_secret_version(project_id, secret_id, version_id):
    """
    Fetch one version of a Secret Manager secret and return its text.

    Args:
        project_id (str): Google Cloud project ID that owns the secret.
        secret_id (str): ID of the secret to access.
        version_id (str): ID of the version to access.
    Returns:
        str: The secret version's payload decoded as UTF-8.

    Note:
        No error handling is done here: whatever the Secret Manager client
        raises (for example when the version cannot be accessed) propagates
        to the caller.
    """
    resource_name = (
        f"projects/{project_id}/secrets/{secret_id}/versions/{version_id}"
    )
    payload = (
        secretmanager.SecretManagerServiceClient()
        .access_secret_version(request={"name": resource_name})
        .payload
    )
    return payload.data.decode("UTF-8")


openai_key = access_secret_version("sql-autograding", "openai-gpt4-32k", "3")

In [None]:
import openai

openai.api_key = openai_key

In [None]:
from google.colab import files
import io
import pandas as pd

In [None]:
import pandas as pd
import gcsfs

# Use the same project ID as the Secret Manager calls in this notebook
# ("sql-autograding", with a hyphen). Only the bucket name below contains an
# underscore.
fs = gcsfs.GCSFileSystem(project="sql-autograding")
with fs.open("gs://sql_autograding/cleaned_response.csv") as f:
    data = pd.read_csv(f)

In [None]:
# Drop the "Unnamed: 0" column read from the CSV. errors="ignore" keeps the
# cell re-runnable: without it a second execution raises KeyError because
# the column is already gone.
data = data.drop(columns="Unnamed: 0", errors="ignore")
data.head()

## Table Schemas

In [None]:
from sqlalchemy import create_engine
from sqlalchemy import text

In [None]:
# Database names. The schema-fetching cell further down calls db_schema()
# once for each of these names.
# NOTE(review): db_list itself is not referenced anywhere else in the visible
# notebook — confirm whether it is still needed.
db_list = [
    "flights",
    "imdb",
    "music",
    "restaurants",
    "facebook",
    "northwind",
    "collisions",
]


def db_schema(db_name):
    """
    Connect to a database and return the column names of each of its tables.

    The function opens a SQLAlchemy connection to ``db_name`` on
    ``db.ipeirotis.org`` as the ``student`` user (password read from Secret
    Manager), lists the tables with ``show tables`` and, for every table,
    runs ``describe`` and keeps the values of the ``Field`` column.

    Args:
        db_name (str): The name of the database to connect to and retrieve schemas from.

    Returns:
        list: A list of single-entry dictionaries, one per table, each mapping
              the table name to the list of its field names.

    Example:
        >>> db_schema('flights')  # doctest: +SKIP
        [{'m_airports': ['airport', 'state', 'state_name']},
         {'m_ticket_prices': ['origin', 'dest', 'carrier', 'fare',
                              'fare_per_mile', 'passengers', 'distance']},
         ...]
    """
    # Imported locally so the cell does not depend on another cell's imports.
    from urllib.parse import quote_plus

    student_password = access_secret_version(
        "sql-autograding", "db_student_password", "1"
    )

    conn_string = (
        "mysql+pymysql://{user}:{password}@{host}/{db}?charset=utf8mb4".format(
            host="db.ipeirotis.org",
            user="student",
            # Percent-encode the secret so characters such as "@", ":" or "/"
            # cannot be mistaken for URL delimiters.
            password=quote_plus(student_password),
            db=db_name,
        )
    )
    engine = create_engine(conn_string)

    try:
        with engine.begin() as conn:
            tables = pd.read_sql_query(sql=text("show tables"), con=conn)
            # "show tables" returns the table names in its last column.
            tables = tables.iloc[:, -1].tolist()

            schema_list = []
            for t in tables:
                # Backtick-quote the identifier (doubling embedded backticks)
                # so names that are reserved words or contain unusual
                # characters still work.
                quoted = "`{}`".format(str(t).replace("`", "``"))
                d = pd.read_sql_query(sql=text(f"describe {quoted}"), con=conn)
                schema_list.append({t: d.loc[:, "Field"].tolist()})
    finally:
        # Release pooled connections; a new engine is created on every call.
        engine.dispose()
    return schema_list

In [None]:
# Fetch the schema of every database up front, one db_schema() call each.
# The results are kept in separate names because the schema_mapping cell
# refers to them individually.
flights_schema = db_schema("flights")
imdb_schema = db_schema("imdb")
music_schema = db_schema("music")
restaurants_schema = db_schema("restaurants")
facebook_schema = db_schema("facebook")
northwind_schema = db_schema("northwind")
collisions_schema = db_schema("collisions")

# Quick visual check of one result.
print(facebook_schema)

In [None]:
# Lookup from a value of the "Database" column to the schema(s) fetched for it.
schema_mapping = {
    "flights": flights_schema,
    "imdb": imdb_schema,
    "music": music_schema,
    "restaurants": restaurants_schema,
    "facebook": facebook_schema,
    "northwind": northwind_schema,
    "collisions": collisions_schema,
    # The key is a string, not a list. Presumably the CSV stores questions
    # that span two databases as the text "['northwind', 'flights']" —
    # TODO confirm against the data.
    "['northwind', 'flights']": (northwind_schema, flights_schema),
}

In [None]:
# # print(schema_fun(['northwind', 'flights']))
# import re
# def convert_to_list(string):
#     # Remove the square brackets and extra spaces
#     cleaned_string = re.sub(r'\[|\]', '', string).strip()
#     # Split the string into individual elements
#     elements = [elem.strip() for elem in cleaned_string.split(',')]
#     # Return the converted list
#     return elements

In [None]:
# def schema_fun(db):
#   if isinstance(db, list):
#     # schema_list = []
#     # for name in db:
#     #   schema_list.append(db_schema(name))
#     # return schema_list
#     return [db_schema(name) for name in db]
#   else:
#     return db_schema(db)

In [None]:
# Attach the schema for each row's database. Series.map yields NaN for any
# "Database" value that is not a key of schema_mapping.
data["schema"] = data["Database"].map(schema_mapping)
data.head()

In [None]:
from bs4 import BeautifulSoup

# Create a function to clean up a single text string
def clean_html_content(text):
    """
    Strip HTML markup from a single cell value.

    Args:
        text: The value to clean. Missing values (as judged by ``pd.isna``)
            are returned unchanged; non-string values are converted with
            ``str`` before parsing.

    Returns:
        The text content extracted by BeautifulSoup's ``html.parser``, or the
        (stringified) input when parsing fails.
    """
    if pd.isna(text):
        return text
    if not isinstance(text, str):
        text = str(text)
    try:
        soup = BeautifulSoup(text, "html.parser")
        return soup.get_text()
    except Exception:
        # Best effort: keep the unparsed text rather than fail the whole
        # column. "except Exception" (not a bare "except") lets
        # KeyboardInterrupt and SystemExit propagate.
        return text


# Clean every object-dtype (text) column in place, leaving the "Database"
# and "schema" columns untouched.
untouched_columns = ("Database", "schema")
for column_name in data.columns:
    is_text_column = (
        column_name not in untouched_columns
        and data[column_name].dtype == object
    )
    if is_text_column:
        data[column_name] = data[column_name].apply(clean_html_content)

In [None]:
# data = data.replace('&nbsp;', ' ', regex=True)
# data = data.replace('&#160;', ' ', regex=True)

## Group questions

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# train_df, test_df = train_test_split(data, test_size=0.2, stratify=data['QuestionId'], random_state=1234)

In [None]:
df = data[data["QuestionId"] == "5,119,721"]
df.head()

In [None]:
qids = df["QuestionId"].unique()
print(qids)

## Mega Prompt

===============================

a. We are working with the XXX database, which has the following tables:
artist(id, first_name, last_name)
album(id, artist_id, name)
track....

b. The request to the student is "Fetch all the tracks for user X"

c. The model answer is ..... (note that we may have multiple correct
answers, the model answer is just an example)

d. The student answer was .....

e. Previously, students have submitted these answers and got back
these responses and grades:

e1. submission: ..... , grade: ..... , feedback: .....

e2. submission: ..... , grade: ..... , feedback: .....

e3. submission: ..... , grade: ..... , feedback: .....

Please provide a grade and feedback for the student

===============================

In [None]:
from typing import List, Optional


def generate_submissions_string(
    train_df: pd.DataFrame, submission_size: Optional[int] = None
) -> str:
    """
    Generate a string that concatenates student submissions up to the given size.

    Each entry has the form
    ``e<k>. submission: ..., grade: <Score>/<full_score>, feedback: ...``
    followed by a blank line, where ``k`` is the 0-based position of the row.

    Args:
        train_df (pd.DataFrame): The DataFrame containing the training data.
            Must provide the columns "InputUserAnswer", "Score", "full_score"
            and "feedback" (unless it is empty).
        submission_size (int, optional): The number of submissions to include in the string. If None, include all submissions.
            Values of zero or less include none.

    Returns:
        str: A string containing the concatenated submissions.
    """
    if submission_size is None:
        selected = train_df
    else:
        # Cut by row position, not by index label, so the limit also works
        # when the frame's index was not reset. Clamp at zero because a
        # negative slice end would count from the end of the frame.
        selected = train_df.iloc[: max(submission_size, 0)]

    if selected.empty:
        return ""

    rows = zip(
        selected["InputUserAnswer"],
        selected["Score"],
        selected["full_score"],
        selected["feedback"],
    )
    # Real newlines separate the entries; the previous "\\n\\n" put the four
    # literal characters backslash-n-backslash-n into the prompt.
    entries = [
        f"e{position}. submission: {submission}, grade: {grade}/{full_grade}, feedback: {feedback} \n\n"
        for position, (submission, grade, full_grade, feedback) in enumerate(rows)
    ]
    return "".join(entries)


def generate_mega_prompt_for_id(
    df: pd.DataFrame, id: str, submission_size: Optional[int] = None
) -> Optional[str]:
    """
    Generate a mega prompt for a specific QuestionId.

    The rows for the question are split into one held-out row (the "student
    answer" to be graded) and the remaining rows, which provide the database
    name, schema, question text, model answer and the graded example
    submissions.

    Args:
        df (pd.DataFrame): The DataFrame containing the data.
        id (str): The QuestionId to generate the mega prompt for.
        submission_size (int, optional): The number of submissions to include in the mega prompt. If None, include all submissions.

    Returns:
        str: The mega prompt, or None when the question has one row or fewer
        (there is then nothing left to use as examples).
    """
    temp = df[df["QuestionId"] == id]

    # If there's only one row or less, return None or handle it in a special way
    if len(temp) <= 1:
        return None  # TODO: Check what to return in this scenario

    # making test size 1
    train_df, test_df = train_test_split(temp, test_size=1, random_state=1234)
    # drop=True: the old index is not needed, and inserting it as a column
    # would fail if the data already had a column named "index".
    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    submissions = generate_submissions_string(train_df, submission_size)

    # Question-level fields are the same on every row, so read them from the first.
    db = train_df.loc[0, "Database"]
    tables = train_df.loc[0, "schema"]
    question = train_df.loc[0, "Question"]
    key = train_df.loc[0, "AnswerKey"]

    test = test_df.loc[0, "InputUserAnswer"]

    # A real newline now precedes the examples; the previous "\\n" put a
    # literal backslash-n into the prompt.
    mega_p = f"""
      a. We are working with the {db} database, which has the following tables: {tables}

      b. The request to the student is "{question}"

      c. The model answer is {key} (note that we may have multiple correct answers, the model answer is just an example)

      d. The student answer was {test}

      e. Previously, students have submitted these answers and got back these responses and grades: \n{submissions}

      Please provide a grade and feedback for the student
      """
    return mega_p


def generate_mega_prompts(
    df: pd.DataFrame, submission_size: Optional[int] = None
) -> pd.DataFrame:
    """
    Generate a DataFrame of mega prompts for each unique QuestionId in the given DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame containing the data.
        submission_size (int, optional): The number of submissions to include in each mega prompt. If None, include all submissions.

    Returns:
        pd.DataFrame: One row per unique QuestionId with the columns
        "QuestionId" and "MegaPrompt". "MegaPrompt" holds whatever
        generate_mega_prompt_for_id returned for that question. The two
        columns are present even when ``df`` has no rows.
    """
    rows = [
        {
            "QuestionId": qid,
            "MegaPrompt": generate_mega_prompt_for_id(df, qid, submission_size),
        }
        for qid in df["QuestionId"].unique()
    ]
    # Naming the columns keeps the result's shape stable for empty input, so
    # later column lookups do not fail on a column-less frame.
    return pd.DataFrame(rows, columns=["QuestionId", "MegaPrompt"])

In [None]:
# for all questions each with all submissions => new df
mega_df_all = generate_mega_prompts(data)
mega_df_all.head()

In [None]:
print(mega_df_all.loc[0, "MegaPrompt"])

In [None]:
# select a specific question and control submission size
df = data[data["QuestionId"] == "5,168,443"]

df0 = generate_mega_prompts(df, 0)
df0.head()

In [None]:
df5 = generate_mega_prompts(df, 5)
df10 = generate_mega_prompts(df, 10)
df20 = generate_mega_prompts(df, 20)
df_all = generate_mega_prompts(df, len(df))

## GPT4

In [None]:
def GPT4_generation(prompt):
    """
    Send one prompt to the "gpt-4-32k" chat model and return its reply.

    Args:
        prompt (str): The user message to send.

    Returns:
        str: The content of the first returned choice, or None when
        ``prompt`` is not a string (for example a missing MegaPrompt), in
        which case no API call is made.
    """
    # Prompts can be None/NaN for questions without enough submissions;
    # skip them instead of sending an invalid request.
    if not isinstance(prompt, str):
        return None

    # NOTE(review): openai.ChatCompletion is the pre-1.0 interface of the
    # openai package, and the install cell does not pin a version — confirm
    # the installed release still provides it.
    response = openai.ChatCompletion.create(
        model="gpt-4-32k",
        messages=[{"role": "user", "content": prompt}],
        n=1,
        stream=False,
        temperature=0.0,
        max_tokens=600,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0
        # stop = ["Q:"]
    )
    return response["choices"][0]["message"]["content"]

In [None]:
print(GPT4_generation(df0.loc[0, "MegaPrompt"]))

In [None]:
print(GPT4_generation(df5.loc[0, "MegaPrompt"]))

In [None]:
print(GPT4_generation(df10.loc[0, "MegaPrompt"]))

In [None]:
print(GPT4_generation(df20.loc[0, "MegaPrompt"]))

In [None]:
print(GPT4_generation(df_all.loc[0, "MegaPrompt"]))

In [None]:
def test_shots(qid, n):
    """
    Print the model's reply for one question using ``n`` example submissions.

    Args:
        qid: QuestionId value used to select rows from the global ``data``.
        n: Number of example submissions passed to generate_mega_prompts.
    """
    question_rows = data[data["QuestionId"] == qid]
    prompts = generate_mega_prompts(question_rows, n)
    print(f"{n} submissions:")
    first_prompt = prompts.loc[0, "MegaPrompt"]
    print(GPT4_generation(first_prompt))

In [None]:
for i in range(0, 21, 5):
    test_shots("5,168,443", i)

In [None]:
mega_df_all.shape
mega_df_all.head(20)

In [None]:
mega_df_all.shape

In [None]:
from tqdm import tqdm

In [None]:
# Build the whole column and assign it once. Writing to the row objects
# yielded by iterrows() only changes temporary copies, so the previous loop
# discarded every API result.
# Passing the Series to tqdm lets it show a total on the progress bar.
mega_df_all["feedback"] = [
    GPT4_generation(prompt) for prompt in tqdm(mega_df_all["MegaPrompt"])
]

mega_df_all.head()

In [None]:
# Call GPT4_generation once for each row's MegaPrompt and store the replies
# in the "feedback" column.
mega_df_all["feedback"] = mega_df_all["MegaPrompt"].apply(GPT4_generation)
mega_df_all.head()