In [1]:
import os
import re

from openai import OpenAI

In [2]:
# Build the API client, passing the key explicitly at construction time.
# Assigning `client.api_key` *after* `OpenAI()` came too late to matter: the
# constructor already needs a key when it runs. Reading the variable here also
# makes a missing OPENAI_API_KEY fail immediately with a KeyError naming it.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [3]:
# Natural-language question sent to the model as the user message.
query = (
    "What are the top 3 stations with the most ridership "
    "in Queens during 2024?"
)

In [4]:
# System prompt for the text-to-SQL task: schema description, three worked
# examples, then the instruction to return only SQL. It is written as adjacent
# string literals (one per prompt line) so it stays a valid, reviewable Python
# expression; the concatenated value is the prompt text itself.
SYSTEM_PROMPT = (
    "I have a PostgreSQL database for a subway system with the following schema:\n\n"
    "1. stations: Contains details about subway stations.\n"
    "    - station_complex_id (TEXT): A unique identifier for each station, primary key.\n"
    "    - station_complex (TEXT): The name of the station complex.\n"
    "    - borough (TEXT): The city borough where the station is located.\n"
    "    - latitude (NUMERIC(17,15)): Latitude coordinate of the station.\n"
    "    - longitude (NUMERIC(17,15)): Longitude coordinate of the station.\n\n"
    "2. routes: Contains unique subway routes.\n"
    "    - route_name (TEXT): The name of the subway route/line, primary key.\n\n"
    "3. station_routes: Represents the many-to-many relationship between stations and routes, linking which routes pass through each station.\n"
    "    - station_complex_id (TEXT): References stations.station_complex_id.\n"
    "    - route_name (TEXT): References routes.route_name.\n"
    "    - This table has a composite primary key (station_complex_id, route_name).\n\n"
    "4. ridership: Contains data on the number of riders at different times.\n"
    "    - entry_id (INT PRIMARY KEY GENERATED ALWAYS AS IDENTITY): Unique identifier for each entry, primary key.\n"
    "    - transit_timestamp (TIMESTAMP WITHOUT TIME ZONE): The date and time of the ridership count.\n"
    "    - station_complex_id (TEXT): Links to the stations table.\n"
    "    - metrocard_fair_fare, metrocard_full_fare, metrocard_other, metrocard_seniors_disability, metrocard_students, metrocard_unlimited_30_day, metrocard_unlimited_7_day, omny_full_fare, omny_other, omny_seniors_disability, total_ridership (INT): Various columns representing different fare types and counts.\n\n"
    "Given this schema, convert the following natural language queries to valid PostgreSQL queries:\n\n"
    "Example 1:\n"
    "Natural Language: \"What was the station in brooklyn with the least ridership on march 3rd 2024?\"\n"
    "Expected SQL Query: SELECT s.station_complex, SUM(r.total_ridership) AS total_ridership\n"
    "FROM stations s\n"
    "JOIN ridership r ON s.station_complex_id = r.station_complex_id\n"
    "WHERE s.borough = 'Brooklyn' \n"
    "      AND r.transit_timestamp >= '2024-03-03 00:00:00' \n"
    "      AND r.transit_timestamp < '2024-03-04 00:00:00'\n"
    "GROUP BY s.station_complex\n"
    "ORDER BY total_ridership ASC\n"
    "LIMIT 1;\n\n"
    "Example 2:\n"
    "Natural Language: \"What are the train lines that stop at 14 St-Union Sq?\"\n"
    "Expected SQL Query: SELECT sr.route_name\n"
    "FROM station_routes sr\n"
    "JOIN stations s ON sr.station_complex_id = s.station_complex_id\n"
    "WHERE s.station_complex ILIKE '%14 St-Union Sq%';\n\n"
    "Example 3:\n"
    "Natural Language: \"Find the day in 2024 with the most ridership across all stations\"\n"
    "Expected SQL Query: SELECT DATE(transit_timestamp) AS day, SUM(total_ridership) AS total_ridership\n"
    "FROM ridership\n"
    "WHERE DATE(transit_timestamp) BETWEEN '2024-01-01' AND '2024-12-31'\n"
    "GROUP BY DATE(transit_timestamp)\n"
    "ORDER BY total_ridership DESC\n"
    "LIMIT 1;\n\n\n"
    "Your task is to translate natural language queries given by the user into valid PostgresQL queries based on the provided schema and examples given. Note that users may call subway routes \"lines\" or \"subway lines\" interchangeably. Return ONLY the SQL query.\n"
)

# Ask the model to translate `query` into SQL.
response = client.chat.completions.create(
    model="gpt-4-turbo",
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": query},
    ],
    # temperature=0 instead of 1: generated SQL should be as repeatable as
    # the API allows, so re-running the notebook gives comparable output.
    temperature=0,
    # NOTE(review): 256 tokens may truncate longer queries — confirm this cap
    # is sufficient for the questions you expect.
    max_tokens=256,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
)

In [15]:
def extract_sql(raw):
    """Return the SQL text from a model reply, without Markdown code fences.

    Args:
        raw: The reply text, or None when the reply carries no content.

    Returns:
        The contents of the first ``` fenced block if one is present
        (with or without a language tag such as sql / SQL / postgresql, and
        even if the closing fence is missing), otherwise the reply itself.
        Surrounding whitespace is removed. An empty string is returned for
        None or empty input.
    """
    if not raw:
        return ""
    # Optional language tag is only consumed when it sits alone on the
    # opening-fence line, so an inline "```SELECT ...```" keeps its SELECT.
    fenced = re.search(
        r"```(?:[A-Za-z]+[ \t]*\n)?(.*?)(?:```|\Z)", raw, flags=re.DOTALL
    )
    body = fenced.group(1) if fenced else raw
    return body.strip()


# Bare last expression so the notebook displays the cleaned query.
extract_sql(response.choices[0].message.content)

"SELECT s.station_complex, SUM(r.total_ridership) AS total_ridership\nFROM stations s\nJOIN ridership r ON s.station_complex_id = r.station_complex_id\nWHERE s.borough = 'Queens' \n      AND DATE(r.transit_timestamp) BETWEEN '2024-01-01' AND '2024-12-31'\nGROUP BY s.station_complex\nORDER BY total_ridership DESC\nLIMIT 3;"