In [None]:
from utils import authenticate
# Cloud region passed to vertexai.init() as the `location`.
REGION = "us-central1"

# Project-local helper; its two return values are used below as the
# credentials object and the project ID for the Google Cloud clients.
credentials, PROJECT_ID = authenticate()

In [None]:
import vertexai

# Configure the Vertex AI SDK with the project, region and credentials
# set up in the first cell.
vertexai.init(
    project=PROJECT_ID,
    location=REGION,
    credentials=credentials,
)

BigQuery is the data warehouse for this use case; we use SQL to retrieve data from it into the notebook.

In [None]:
from google.cloud import bigquery

# BigQuery client bound to the same project and credentials as Vertex AI.
bq_client = bigquery.Client(
    credentials=credentials,
    project=PROJECT_ID,
)

Using Stack Overflow's public question-and-answer dataset.

In [None]:
# SQL that lists the name of every table in the public Stack Overflow
# dataset by reading the dataset's INFORMATION_SCHEMA.TABLES view.
QUERY_TABLES = """
SELECT
  table_name
FROM
  `bigquery-public-data.stackoverflow.INFORMATION_SCHEMA.TABLES`
"""

In [None]:
# Submit the table-listing query.
query_job = bq_client.query(query=QUERY_TABLES)

# Iterating over the job yields its result rows; print every value of
# every row on its own line.
for cell in (value for table_row in query_job for value in table_row.values()):
    print(cell)

In [None]:
# SQL that fetches three rows (all columns) from the questions table, just
# to inspect what the data looks like before writing the real query.
INSPECT_QUERY = """
SELECT
    *
FROM
    `bigquery-public-data.stackoverflow.posts_questions`
LIMIT 3
"""

Use SQL to select the data in BigQuery, then load the result into a pandas DataFrame.

In [None]:
import pandas as pd

# Submit the inspection query; the rows are fetched in the next cell.
query_job = bq_client.query(query=INSPECT_QUERY)

In [None]:
# Wait for the job's result, then convert it to a pandas DataFrame by way
# of an Arrow table.
stack_overflow_df = (
    query_job.result()
    .to_arrow()
    .to_pandas()
)
stack_overflow_df.head()

Since we are using a large dataset, we save time by concatenating the columns with SQL in the data warehouse itself, instead of first loading the data to local disk and concatenating it there.

In [None]:
# SQL that builds the tuning dataset inside BigQuery:
#   * joins each question to its accepted answer
#     (q.accepted_answer_id = a.id);
#   * input_text  = question title immediately followed by the question body
#     (CONCAT adds no separator between the two);
#   * output_text = body of the accepted answer;
#   * keeps questions whose tags string contains "python" anywhere, and
#     answers created on or after 2020-01-01;
#   * returns at most 10000 rows.
# NOTE(review): REGEXP_CONTAINS is a partial match, so tags that merely
# contain "python" as a substring also qualify -- confirm that is intended.
# NOTE(review): there is no ORDER BY, so which 10000 rows come back is not
# pinned down and may differ between runs -- confirm this is acceptable.
QUERY = """
SELECT
    CONCAT(q.title, q.body) as input_text,
    a.body AS output_text
FROM
    `bigquery-public-data.stackoverflow.posts_questions` q
JOIN
    `bigquery-public-data.stackoverflow.posts_answers` a
ON
    q.accepted_answer_id = a.id
WHERE
    q.accepted_answer_id IS NOT NULL AND
    REGEXP_CONTAINS(q.tags, "python") AND
    a.creation_date >= "2020-01-01"
LIMIT
    10000
"""

In [None]:
# Submit the question/answer join to BigQuery.
query_job = bq_client.query(query=QUERY)

# Waiting for the result and converting it may take a few seconds.
stack_overflow_df = (
    query_job.result()
    .to_arrow()
    .to_pandas()
)

stack_overflow_df.head(2)

Add an instruction for the LLM: this string is combined with each question to form the prompt sent to the model.

In [None]:
# Instruction text placed in front of every question to form the prompt.
# This is a plain string rather than an f-string because it contains no
# placeholders. The trailing backslashes are line continuations: the first
# removes the newline after the opening quotes, the second joins the two
# sentences onto a single line.
INSTRUCTION_TEMPLATE = """\
Please answer the following Stackoverflow question on Python. \
Answer it like you are a developer answering Stackoverflow questions.

Stackoverflow question:
"""

In [None]:
# Prompt column: the instruction text, a single space, then the question
# (title + body) for every row.
stack_overflow_df['input_text_instruct'] = (
    INSTRUCTION_TEMPLATE + ' ' + stack_overflow_df['input_text']
)

Split the DataFrame into two parts: a training set and an evaluation set.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation (test_size=0.2), leaving 80% for
# training. random_state is fixed so the split is the same on every run.
train, evaluation = train_test_split(
    stack_overflow_df, test_size=0.2, random_state=42
)

Using the date and time to name the artifact version

In [None]:
import datetime

# Timestamp string in hour:day:month:year order, taken from the local clock.
date = format(datetime.datetime.now(), "%H:%d:%m:%Y")

Export the training data in JSONL format (one JSON object per line) for tuning.

In [None]:
# Columns exported for tuning: the instruction-prefixed question and the
# accepted answer.
cols = ['input_text_instruct', 'output_text']

# Serialise the training rows as JSONL: one JSON object per line.
tune_jsonl = train[cols].to_json(orient="records", lines=True)

# Keep the name in a single literal. A backslash continuation *inside* a
# string literal carries the next line's leading indentation into the
# string, which would embed a run of spaces in the middle of the filename.
# NOTE(review): `date` contains ":" separators, which are not valid in
# Windows filenames -- confirm the notebook only runs on POSIX systems.
training_data_filename = f"tune_data_stack_overflow_python_qa-{date}.jsonl"

# Explicit encoding so the file content does not depend on the platform
# default.
with open(training_data_filename, "w", encoding="utf-8") as f:
    f.write(tune_jsonl)