<a href="https://colab.research.google.com/github/ipeirotis/sql_autograding/blob/main/preprocessing_ft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q openai
!pip install -q google-cloud-secret-manager
!pip install -U PyMySQL sqlalchemy

In [None]:
import gcsfs
import pandas as pd
import io
import openai
import os

from google.cloud import secretmanager

from google.colab import auth

import requests

from sqlalchemy import create_engine

from sklearn.model_selection import train_test_split

In [None]:
# Authenticate this Colab session with a Google account that has access to the
# Google Cloud project, so that the project's resources can be reached from
# the cells below.
auth.authenticate_user()

In [None]:

def access_secret_version(project_id, secret_id, version_id):
    """
    Fetch one version of a Secret Manager secret and return it as text.

    Args:
        project_id (str): Google Cloud project that owns the secret.
        secret_id (str): Name of the secret.
        version_id (str): Version of the secret to read.
    Returns:
        str: The UTF-8 decoded payload, or None when the lookup fails for
        any reason (the error is printed instead of being raised).
    """
    try:
        secret_client = secretmanager.SecretManagerServiceClient()
        resource_path = "projects/{}/secrets/{}/versions/{}".format(
            project_id, secret_id, version_id
        )
        secret = secret_client.access_secret_version(request={"name": resource_path})
        raw_bytes = secret.payload.data
        return raw_bytes.decode('UTF-8')
    except Exception as error:
        # Best effort by design: report the problem and let the caller see None.
        print(f"Failed to access the secret version: {error}")
        return None

# Read version "3" of the stored OpenAI key and hand it to the openai module.
# The value is None if the secret lookup failed.
openai_key = access_secret_version(
    project_id="sql-autograding",
    secret_id="openai-gpt4-32k",
    version_id="3",
)
openai.api_key = openai_key


In [None]:
# Chat-completions endpoint that the grading request below is POSTed to.
URL = "https://api.openai.com/v1/chat/completions"

In [None]:
# Smoke test: send one hard-coded question, answer key and student answer to
# GPT-4 and ask it to grade the answer.
grading = {
    "model": "gpt-4-32k",
    # A plain string is enough here: the prompt contains no placeholders.
    "messages": [{"role": "user", "content": "question: There are two relationships between Customer and Plan. Explain how they differ. \n Key: The Responsible For relationship is an overall 1:M relationship between Customer and Plan. A Customer can be responsible for 0, 1, or many Plans yet any one Plan will be linked to only 1 Customer for responsibility purposes. The Belongs relationship is an overall M:M relationship that permits the linking of multiple customers to a single plan, as in the case of family members being part of a particular plan or different plans. Student answer: Each customer can have 0 to many plans. Each plan must have one responsible party, but may belong to more than one customer. Grade on student answer based on the question and answer key."}],
    "max_tokens": 256,
    "temperature": 0
}

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {openai_key}"}

# Without a timeout, requests waits indefinitely when the server never
# answers; bound the wait so this cell cannot hang the notebook.
response = requests.post(URL, headers=headers, json=grading, stream=False, timeout=120)
if response.status_code != 200:
    print(f"Request to OpenAI failed with status {response.status_code}, response: {response.content}")


In [None]:
# Show the raw body of the HTTP response returned by the request above.
print(response.content)
# NOTE(review): `response` is a requests.Response, so the attribute access in
# the line below would not work as written; the reply text presumably has to be
# read from the parsed JSON body instead — confirm against the API reference.
# print(response.choices.message.content)

In [None]:

# Load the quiz responses from Cloud Storage into a DataFrame.
# The project ID is "sql-autograding" (with a hyphen), the same ID used for the
# Secret Manager lookup; Google Cloud project IDs cannot contain underscores.
# The bucket named in the gs:// path is a separate identifier and keeps its
# underscore.
fs = gcsfs.GCSFileSystem(project='sql-autograding')
with fs.open('gs://sql_autograding/quiz_responses.csv') as f:
    data = pd.read_csv(f)


In [None]:

# Preview the first rows of the raw quiz responses.
data.head()

In [None]:
# Drop the answer-time column and keep only the long-answer questions.
# The trailing .copy() makes `df` an independent frame: later cells add columns
# to it, and assigning into a filtered slice triggers pandas'
# SettingWithCopyWarning.
df = data.drop(columns=['QuestionAnswerTime'])
df = df[df['QuestionType']=='Long Answer Question'].copy()
df.head()

In [None]:


# Hold out 20% of the long-answer rows as a test set; the fixed random_state
# keeps the split the same on every run.
# NOTE(review): the split is taken before the 'Database', 'prompt' and
# 'message' columns are added to `df` in later cells, so train_df/test_df will
# not carry them — confirm whether the split should move after those steps.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=1234)
len(train_df), len(test_df)

In [None]:
# Each quizid is associated with a specific database. Music, Flights, Northwind, Facebook
# It will need a bit of manual work to assign each quizid to a database,
# but it will not take long (the questions are the same across quizzes offered for different sessions).

## Create fine-tuning dataset for the model

In [None]:
# assign each quiz with its database in df
  # Module 4 Practice: Flights Database Questions - flights
  # Restaurants Database: Simple Practice Queries - restaurants

def map_db(quiz):
  """Return the database (or databases) that a quiz name refers to.

  The quiz name is checked against an ordered list of rules and the first
  rule with any marker contained in the name wins. The order matters:
  'Module 4 Practice: Flights' has to be tried before the generic 'Module 4'.

  Returns a database name, a list of names when the quiz spans two
  databases, or None when no rule matches.
  """
  # Built on every call so that list results are fresh objects each time.
  rules = (
      (('Assignment 6', 'Module 4 Practice: Flights'), 'flights'),
      # NOTE: this mapping was marked as uncertain ("?") by the original author.
      (('Assignment 5',), 'imdb'),
      (('Assignment 4', 'Assignment 3', 'Assignment 2'), 'music'),
      (('Module 4', 'Module 3', 'Restaurants Database'), 'restaurants'),
      (('Module 2',), ['facebook', 'restaurants']),
      (('Module 1',), 'northwind'),
      (('General Data Analytics Practice',), 'collisions'),
      (('Final Exam',), ['northwind', 'flights']),
  )
  for markers, database in rules:
    if any(marker in quiz for marker in markers):
      return database
  return None

# Label every row with the database(s) of its quiz. map_db returns a string, a
# list of strings, or None, so the new column holds values of mixed types.
df['Database'] = df['QuizName'].apply(map_db)

In [None]:
# Inspect the first 20 rows, including the new 'Database' column.
df.head(20)

### Connect to database

In [None]:

# Build the SQLAlchemy engine for the MySQL server at db.ipeirotis.org.
# No `encoding` argument is passed to format(): the template has no such field,
# so it would be silently ignored. The character set is selected by
# `?charset=utf8` in the URL itself.
# NOTE(review): the user name and password are hard-coded in the notebook;
# confirm they are meant to be public, or load them from Secret Manager the
# way the OpenAI key is loaded.
conn_string = 'mysql+pymysql://{user}:{password}@{host}/?charset=utf8'.format(
    host = 'db.ipeirotis.org',
    user = 'student',
    password = 'dwdstudent2015')
engine = create_engine(conn_string)

### Prompt and Message

Prompt:
In database XXX, we ask the question: “what is the average fare of the flights departing from each airport”?
The student answered “SELECT …. FROM ….”
The correct answer was “SELECT …. FROM ….”

Message:
The TA assigned the grade “8/10” points
The feedback from the TA was “....”


In [None]:
def _database_label(database):
    """Return the text that names a row's database(s) inside the prompt.

    `database` is a value from the 'Database' column: a single name, a list
    of names, or a missing value when the quiz could not be mapped.
    """
    if isinstance(database, str):
        # Joining a plain string would space out its characters
        # ('flights' -> 'f l i g h t s'), so return it unchanged.
        return database
    if isinstance(database, (list, tuple)):
        return ' '.join(database)
    # None/NaN: the quiz name matched no known database.
    return 'unknown'


# Prompt: the database, the question, the student's answer and the answer key.
df['prompt'] = (
    "In database " + df['Database'].apply(_database_label)
    + ", we ask the question:\n\n " + df['Question']
    + "\n\n The student answered:\n " + df['InputUserAnswer']
    + "\n\n The correct answer was: " + df['AnswerKey']
)

# Message: the grade and feedback given by the TA, filled in per row (a plain
# string with {s}/{feedback} placeholders would be copied verbatim to every row).
# The column names follow the commented-out draft loop found later in this
# notebook — TODO confirm them against the CSV header.
df['message'] = (
    "The TA assigned the grade " + df['Score'].astype(str)
    + " points. The feedback from the TA was "
    + df['CommentleftonUserResponse'].fillna('').astype(str)
)

In [None]:
# Display the frame to check the new 'prompt' and 'message' columns.
df

In [None]:
# for i, row in df.iterrows():
#   db = row.Database
#   q = row.Question
#   ans = row.InputUserAnswer
#   key = row.AnswerKey
#   prompt = f'In database {db}, we ask the question: {q} The studednt answered {ans} The correct answer was {key}'

#   s = row.Score
#   feedback = row.CommentleftonUserResponse
#   # how to get the full score
#   message = f'The TA assigned the grade {s} points. The feedback from the TA was {feedback}'

## Submit the dataset

In [None]:
# Expose the key through the OPENAI_API_KEY environment variable for the
# `openai` command-line calls in the cells below.
# access_secret_version returns None when the lookup fails, and os.environ only
# accepts strings, so stop with an explicit message instead of the bare
# TypeError the assignment would otherwise raise.
if not openai_key:
    raise RuntimeError(
        "OpenAI API key is missing: the Secret Manager lookup returned no value."
    )
os.environ['OPENAI_API_KEY'] = openai_key


In [None]:
# Start a fine-tune of the `ada` base model on test.jsonl, with the suffix
# "grader_model" attached to the resulting model.
# NOTE(review): no cell visible in this notebook writes test.jsonl — confirm
# where the file comes from.
!openai api fine_tunes.create -t test.jsonl -m ada --suffix "grader_model"

In [None]:
# Run OpenAI's data-preparation tool over the train and test files.
# NOTE(review): these paths have no directory prefix, while the fine-tune
# command in the next cell reads from grader/ — confirm which location is
# right, and which cell is supposed to write the .jsonl files.
!openai tools fine_tunes.prepare_data -f grader_train.jsonl
!openai tools fine_tunes.prepare_data -f grader_test.jsonl

In [None]:
# Launch a fine-tune with separate training (-t) and validation (-v) files and
# a batch size of 16.
# NOTE(review): no base model (-m) is given here, unlike the earlier
# fine_tunes.create call — confirm the intended model.
!openai api fine_tunes.create -t "grader/grader_train.jsonl" -v "grader/grader_test.jsonl" --batch_size 16

## Use the fine-tuning model

## Grade a question

## Evaluation

In [None]:
# We compare the grade assigned by GPT to the grade assigned by the TA.
# We will probably need to examine disagreements critically, as the difference may also be due to the TA being incorrect.
