In [1]:
#import libraries
import pandas as pd
import snowflake.connector
import numpy as np
import openai
import os
import json

# Read openai api key from file apikey.txt
api_key_file = 'apikey.txt'
if os.path.isfile(api_key_file):
    with open(api_key_file) as f:
        openai.api_key = f.readline()
else:
    print(f"Error: {api_key_file} not found.")

OPENAI_API_KEY = openai.api_key
content = ""

#login to snowflake db
con = snowflake.connector.connect(user='vishal.kumar@scale.com',
                                 account='pxa65918',
                                 authenticator='externalbrowser',
                                 warehouse='BOOTCAMP_WH',
                                 database='SCALE_PLAYPEN',
                                 role='SNOWFLAKE_USERS')
cs = con.cursor()

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...


In [2]:
#Get table names from the schema
sql = f'''
SHOW TABLES IN SCALE_PLAYPEN.GOLDFINCH_REPLICA;
'''
cs.execute(sql)
rows = cs.fetchall()
column_names = [desc[0] for desc in cs.description]
tdf = pd.DataFrame(rows, columns=column_names)

#copy table names to a list
table_names = []
for i in tdf['name']:
    table_names.append(i)


In [3]:
def get_random_table_name():
    #generate a random table name from the list
    table_name = np.random.choice(table_names)
    return table_name

def get_all_columns(table_name):
    sql = f'''
    SHOW COLUMNS IN SCALE_PLAYPEN.GOLDFINCH_REPLICA.{table_name};
    '''
    cs.execute(sql)
    rows = cs.fetchall()
    column_names = [desc[0] for desc in cs.description]
    cdf = pd.DataFrame(rows, columns=column_names)
    #copy column names to a list
    column_names = []
    for i in cdf['column_name']:
        column_names.append(i)
    return column_names

def question_gpt():
    completion = openai.ChatCompletion.create(
    model="gpt-4-1106-preview",
    messages=[{"role":"user", "content":prompt}]
    )
    #parse the response to get the only the "content" part and save it in response variable
    response = completion["choices"][0]["message"]["content"]
    return response


In [6]:
prompt_start = "You are a top executive in Finance, and tasked with the objective to ask excellent business questions. You are provided with following information,"
prompt_table = "Table Name: " + table_name + "\n"
prompt_columns = "Column Names: " + str(cdf) + "\n"
prompt_break = "Use a JOIN, LEFT JOIN, RIGHT JOIN or INNER JOIN with \n"
difficulty_component = "HAVING" 
prompt_difficulty = "Ask a question such that the SQL query required to answer the question includes a " + difficulty_component + " statement. \n"
prompt_end = "Do not mention any column names or database/sql lingo in the question. Do not mention any additional details, no follow up questions or two part questions. Ask question in one sentence."

In [7]:
#get user input for number of questions to generate and save it in a variable called num_questions
num_questions = int(input("How many questions do you want to generate? "))
num_questions = num_questions/4
#convert num_questions to an integer
num_questions = int(num_questions)
#create empty df called qdf
qdf = pd.DataFrame(columns=['question', 'difficulty_component', 'difficulty', 'table_name', 'column_name'])
#start a for loop for easy questions
for easy in range(1,num_questions):
    difficulty_component = []
    difficulty_component = ['HAVING', 'ORDER BY', 'LIMIT', 'JOIN', 'LIKE', 'OR', 'WHERE', 'GROUP BY']
    #choose random difficulty
    difficulty_component = np.random.choice(difficulty_component)
    difficulty='Easy'
    table_name = get_random_table_name()
    cdf = get_all_columns(table_name)
    prompt = prompt_start + prompt_table + prompt_columns + prompt_difficulty + prompt_end
    question = question_gpt()
    qdf_counter = len(qdf)
    #add question, difficulty_component, "easy", table_name, column_name to qdf (without using append function)
    qdf.loc[qdf_counter] = [question, difficulty_component, difficulty, table_name, column_names]
    print (question)

#start a for loop for medium questions
for medium in range(1,num_questions):
    difficulty_component = []
    difficulty_component = ['HAVING', 'ORDER BY', 'LIMIT', 'JOIN', 'LIKE', 'OR', 'WHERE', 'GROUP BY']
    #choose random difficulty
    difficulty_component = np.random.choice(difficulty_component) + " and " + np.random.choice(difficulty_component)
    difficulty='Medium'
    table_name = get_random_table_name()
    cdf = get_all_columns(table_name)
    prompt = prompt_start + prompt_table + prompt_columns + prompt_break + prompt_table + prompt_columns + prompt_difficulty + prompt_end
    question = question_gpt()
    qdf_counter = len(qdf)
    #add question, difficulty_component, "easy", table_name, column_name to qdf (without using append function)
    qdf.loc[qdf_counter] = [question, difficulty_component, difficulty, table_name, column_names]
    print (question)

#start a for loop for hard questions
for hard in range(1,num_questions):
    difficulty_component = []
    difficulty_component = ['HAVING', 'ORDER BY', 'LIMIT', 'JOIN', 'LIKE', 'OR', 'WHERE', 'GROUP BY', 'SUM', 'COUNT', 'MAX', 'MIN', 'AVG']
    #choose random difficulty
    difficulty_component = np.random.choice(difficulty_component) + " and " + np.random.choice(difficulty_component)
    difficulty='Hard'
    table_name = get_random_table_name()
    cdf = get_all_columns(table_name)
    prompt = prompt_start + prompt_table + prompt_columns + prompt_break + prompt_table + prompt_columns + prompt_difficulty + prompt_end
    question = question_gpt()
    qdf_counter = len(qdf)
    #add question, difficulty_component, "easy", table_name, column_name to qdf (without using append function)
    qdf.loc[qdf_counter] = [question, difficulty_component, difficulty, table_name, column_names]
    print (question)

#start a for loop for extremely hard questions
for ex_hard in range(1,num_questions):
    difficulty_component = []
    difficulty_component = ['HAVING', 'ORDER BY', 'LIMIT', 'JOIN', 'LIKE', 'OR', 'WHERE', 'GROUP BY', 'SUM', 'COUNT', 'MAX', 'MIN', 'AVG']
    #choose random difficulty
    difficulty_component = np.random.choice(difficulty_component) + " and " + np.random.choice(difficulty_component) + " and " + np.random.choice(difficulty_component)
    difficulty='Extremely Hard'
    table_name = get_random_table_name()
    cdf = get_all_columns(table_name)
    prompt = prompt_start + prompt_table + prompt_columns + prompt_break + prompt_table + prompt_columns + prompt_break + prompt_table + prompt_columns + prompt_difficulty + prompt_end
    question = question_gpt()
    qdf_counter = len(qdf)
    #add question, difficulty_component, "easy", table_name, column_name to qdf (without using append function)
    qdf.loc[qdf_counter] = [question, difficulty_component, difficulty, table_name, column_names]
    print (question)

#save qdf to a csv file
qdf.to_csv('Goldfinch_SQL_questions.csv', index=False)

How many sales territories have more than a certain number of producers associated with them as of the most recent reporting period?
Could you provide an updated list of producers who have more than three unique email contacts associated with them as of the most recent date?
How many sales representatives have processed an above-average number of client accounts in their territory as of the most recent update?
Can you identify which producers have a higher than average number of distinct email addresses associated with their accounts as of the latest update?
Can we identify the producers who have a higher than average number of specific email types associated with their records within a certain period?
Which sales territories have more than a certain number of producers whose status has been processed up to a certain date?
How many sales territories have a greater than average number of producers associated primarily with each sales representative's email address?
How many sales territ

In [3]:
pip install python-pptx

Collecting python-pptx
  Downloading python_pptx-0.6.23-py3-none-any.whl.metadata (18 kB)
Downloading python_pptx-0.6.23-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-pptx
Successfully installed python-pptx-0.6.23
[0mNote: you may need to restart the kernel to use updated packages.
