In [2]:
import pandas as pd 
import os
import openai
import json

# Show every column when a DataFrame is rendered (no truncation with "...").
pd.options.display.max_columns = None

# Suppress scientific notation: floats are rendered with three fixed decimals.
pd.options.display.float_format = '{:.3f}'.format

### Load parquet into df

In [3]:
# Location of the cleaned trade data, relative to the notebook's working directory.
processed_data_path = '../data/processed/trade_data_cleaned.parquet'

print('Loading data from {}'.format(processed_data_path))
df = pd.read_parquet(processed_data_path)

# Quick sanity report: column names, overall shape, and the first few rows.
print("Columns in the DataFrame:", list(df.columns), sep="\n")
print('Data loaded with shape: {}'.format(df.shape))
display(df.head())

Loading data from ../data/processed/trade_data_cleaned.parquet
Columns in the DataFrame:
['freqcode', 'refperiodid', 'refmonth', 'period', 'reportercode', 'reporteriso', 'reporterdesc', 'flowcode', 'flowdesc', 'partnercode', 'partneriso', 'partner2desc', 'netwgt', 'isgrosswgtestimated', 'cifvalue', 'fobvalue', 'primaryvalue']
Data loaded with shape: (383526, 17)


Unnamed: 0,freqcode,refperiodid,refmonth,period,reportercode,reporteriso,reporterdesc,flowcode,flowdesc,partnercode,partneriso,partner2desc,netwgt,isgrosswgtestimated,cifvalue,fobvalue,primaryvalue
0,20190101,2019,2019,4,AFG,Afghanistan,M,Import,0,W00,World,H4,True,8568013876.87,0.0,8568013876.87,4
1,20190101,2019,2019,4,AFG,Afghanistan,M,Import,16,ASM,American Samoa,H4,True,614220.56,0.0,614220.56,4
2,20190101,2019,2019,4,AFG,Afghanistan,M,Import,20,AND,Andorra,H4,True,122809.39,0.0,122809.39,4
3,20190101,2019,2019,4,AFG,Afghanistan,M,Import,31,AZE,Azerbaijan,H4,True,48473684.35,0.0,48473684.35,4
4,20190101,2019,2019,4,AFG,Afghanistan,M,Import,32,ARG,Argentina,H4,False,257396.17,0.0,257396.17,0


### Define system prompt

In [None]:
# System prompt that constrains the model to answer with a JSON "plan"
# (filter / groupby / agg / stats / sort / top_n / chart) instead of prose.
#
# The "Columns you may reference" list mirrors the columns printed by
# `df.columns.tolist()` for the trade DataFrame loaded in this notebook, so the
# model is not invited to plan against fields that do not exist in the data.
SYSTEM_PROMPT = """
You are a trade data analyst.
Return ONLY valid JSON with these keys:
  filter   – boolean condition in plain English
  groupby  – list of columns
  agg      – {column: aggfunc}
  stats    – {stat_name: [columns]}       # e.g. "corr": ["primaryvalue","netwgt"], or "describe": ["primaryvalue"]
  sort     – optional {by: col, ascending: bool}
  top_n    – optional int
  chart    – "bar" | "line" | null

  Columns you may reference:
  freqcode, refperiodid, refmonth, period, reportercode, reporteriso,
  reporterdesc, flowcode, flowdesc, partnercode, partneriso, partner2desc,
  netwgt, isgrosswgtestimated, cifvalue, fobvalue, primaryvalue.
"""


In [None]:
from openai import OpenAI

# With no arguments the client takes its API key from the environment
# (OPENAI_API_KEY), so no credential is hard-coded in the notebook.
client = OpenAI()


def parse_plan(raw_text):
    """Decode the model's reply into a Python object.

    The system prompt asks for bare JSON, but chat models frequently wrap the
    payload in a Markdown code fence (```json ... ```). If such a fence is
    present it is removed before decoding, so those replies still parse.
    Replies that were already bare JSON are decoded exactly as before.

    Args:
        raw_text: Message content returned by the chat completion.

    Returns:
        The decoded JSON value (a dict when the model follows the prompt).

    Raises:
        json.JSONDecodeError: If the text is not valid JSON after unwrapping.
    """
    cleaned = raw_text.strip()
    if cleaned.startswith("```"):
        # Drop the opening fence line (``` or ```json) ...
        _, _, cleaned = cleaned.partition("\n")
        # ... and everything from the closing fence onward, if there is one.
        cleaned = cleaned.rsplit("```", 1)[0]
    return json.loads(cleaned)


question = "What is the total value of shipments by shipper in 2023?"

# The system message fixes the output schema; the user message is the question.
resp = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": question}
    ],
    temperature=0.0,
    )

plan_text = resp.choices[0].message.content
print(f"Plan text: {plan_text}")

# parse to dict
plan = parse_plan(plan_text)
print(f"Plan dict: {plan}")

In [None]:
question = "Show me the correlation between value and netwgt for 2022."

# Ask the model for a JSON plan: the system message carries the schema,
# the user message carries the question.
resp = client.chat.completions.create(
    model="gpt-4",
    temperature=0.0,
    messages=[
        dict(role="system", content=SYSTEM_PROMPT),
        dict(role="user", content=question),
    ],
)

# The reply text is the content of the first choice's message.
plan_text = resp.choices[0].message.content
print("Plan text: {}".format(plan_text))

# Decode the JSON reply into a Python object.
plan = json.loads(plan_text)
print("Plan dict: {}".format(plan))

### Test utils

In [1]:
from app.utils import answer

# End-to-end check of the app helper: pass one question and keep the result.
# NOTE(review): the saved output for this cell ends in a TypeError mentioning a
# duplicated 'period' dtype — presumably raised inside app.utils; confirm there.
question = "Which country has the highest total value of shipments in 2023?"
out = answer(question=question)


LLM response: {
  "filter": "period == 2023",
  "groupby": ["reporterdesc"],
  "agg": {"value": "sum"},
  "sort": {"by": "value", "ascending": false},
  "top_n": 1,
  "chart": null
}


TypeError: dtype 'period    int64
period    int64
dtype: object' not understood

In [None]:
# Display the table stored under the 'df' key of the result.
out["df"]

Unnamed: 0,reporterdesc,value
