In [1]:
import pandas as pd
import numpy as np
import os

# remove column restrictions
pd.options.display.max_columns = None

# default float instead of scientific notation
pd.options.display.float_format = '{:,.2f}'.format

In [2]:
data_path = '../data/raw'
file_path = os.path.join(data_path, 'TradeData_8_2_2025_13_44_38.csv')

In [3]:
# read csv
df = pd.read_csv(file_path, encoding="latin-1", engine="python")

### Check completeness

In [4]:
display(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, C to C
Data columns (total 47 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   typeCode                  100000 non-null  object 
 1   freqCode                  100000 non-null  int64  
 2   refPeriodId               100000 non-null  int64  
 3   refYear                   100000 non-null  int64  
 4   refMonth                  100000 non-null  int64  
 5   period                    100000 non-null  int64  
 6   reporterCode              100000 non-null  object 
 7   reporterISO               100000 non-null  object 
 8   reporterDesc              100000 non-null  object 
 9   flowCode                  100000 non-null  object 
 10  flowDesc                  100000 non-null  int64  
 11  partnerCode               100000 non-null  object 
 12  partnerISO                100000 non-null  object 
 13  partnerDesc               100000 non-null  int64  
 14

None

In [5]:
# get missing values
missing_values = df.isnull().sum()
missing_values = missing_values[missing_values > 0]

print("Missing Values in Each Column:")
print(missing_values)
print("Missing values percentage: ")
print((missing_values / df.shape[0]) * 100)

Missing Values in Each Column:
qtyUnitCode            100000
altQtyUnitCode         100000
isAltQtyEstimated        1009
isGrossWgtEstimated     24839
cifvalue                23881
isAggregate            100000
dtype: int64
Missing values percentage: 
qtyUnitCode           100.00
altQtyUnitCode        100.00
isAltQtyEstimated       1.01
isGrossWgtEstimated    24.84
cifvalue               23.88
isAggregate           100.00
dtype: float64


In [6]:
# remove rows with at least 50% missing values
threshold = 0.5 * df.shape[1]
df_cleaned = df.dropna(thresh=threshold)

### Numeric summary

In [7]:
display(df.describe().T)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
freqCode,100000.0,20194747.6,4987.52,20190101.0,20190101.0,20190101.0,20200101.0,20200101.0
refPeriodId,100000.0,2019.46,0.5,2019.0,2019.0,2019.0,2020.0,2020.0
refYear,100000.0,52.0,0.0,52.0,52.0,52.0,52.0,52.0
refMonth,100000.0,2019.46,0.5,2019.0,2019.0,2019.0,2020.0,2020.0
period,100000.0,407.03,239.12,4.0,203.0,410.0,608.0,894.0
flowDesc,100000.0,429.79,255.0,0.0,208.0,428.0,652.0,899.0
partnerDesc,100000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cmdDesc,100000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
customsDesc,100000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mosCode,100000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Clean column names

In [8]:
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('-', '_').str.replace(r"[ \-\/]+", "_", regex=True)     .str.replace(r"[^0-9a-z_]", "", regex=True)

### Select relevant columns

In [9]:
df.head()

Unnamed: 0,typecode,freqcode,refperiodid,refyear,refmonth,period,reportercode,reporteriso,reporterdesc,flowcode,flowdesc,partnercode,partneriso,partnerdesc,partner2code,partner2iso,partner2desc,classificationcode,classificationsearchcode,isoriginalclassification,cmdcode,cmddesc,aggrlevel,isleaf,customscode,customsdesc,moscode,motcode,motdesc,qtyunitcode,qtyunitabbr,qty,isqtyestimated,altqtyunitcode,altqtyunitabbr,altqty,isaltqtyestimated,netwgt,isnetwgtestimated,grosswgt,isgrosswgtestimated,cifvalue,fobvalue,primaryvalue,legacyestimationflag,isreported,isaggregate
C,A,20190101,2019,52,2019,4,AFG,Afghanistan,M,Import,0,W00,World,0,W00,World,H4,HS,True,TOTAL,All Commodities,0,False,C00,TOTAL CPC,0,0,TOTAL MOT,-1,,0,False,-1,,0,False,0.0,True,0,False,8568013876.87,0.0,8568013876.87,4,False,True,
C,A,20190101,2019,52,2019,4,AFG,Afghanistan,M,Import,16,ASM,American Samoa,0,W00,World,H4,HS,True,TOTAL,All Commodities,0,False,C00,TOTAL CPC,0,0,TOTAL MOT,-1,,0,False,-1,,0,False,0.0,True,0,False,614220.56,0.0,614220.56,4,False,True,
C,A,20190101,2019,52,2019,4,AFG,Afghanistan,M,Import,20,AND,Andorra,0,W00,World,H4,HS,True,TOTAL,All Commodities,0,False,C00,TOTAL CPC,0,0,TOTAL MOT,-1,,0,False,-1,,0,False,0.0,True,0,False,122809.39,0.0,122809.39,4,False,True,
C,A,20190101,2019,52,2019,4,AFG,Afghanistan,M,Import,31,AZE,Azerbaijan,0,W00,World,H4,HS,True,TOTAL,All Commodities,0,False,C00,TOTAL CPC,0,0,TOTAL MOT,-1,,0,False,-1,,0,False,0.0,True,0,False,48473684.35,0.0,48473684.35,4,False,True,
C,A,20190101,2019,52,2019,4,AFG,Afghanistan,M,Import,32,ARG,Argentina,0,W00,World,H4,HS,True,TOTAL,All Commodities,0,False,C00,TOTAL CPC,0,0,TOTAL MOT,-1,,0,False,-1,,0,False,0.0,False,0,False,257396.17,0.0,257396.17,0,False,True,


In [10]:
print(df.columns)

Index(['typecode', 'freqcode', 'refperiodid', 'refyear', 'refmonth', 'period',
       'reportercode', 'reporteriso', 'reporterdesc', 'flowcode', 'flowdesc',
       'partnercode', 'partneriso', 'partnerdesc', 'partner2code',
       'partner2iso', 'partner2desc', 'classificationcode',
       'classificationsearchcode', 'isoriginalclassification', 'cmdcode',
       'cmddesc', 'aggrlevel', 'isleaf', 'customscode', 'customsdesc',
       'moscode', 'motcode', 'motdesc', 'qtyunitcode', 'qtyunitabbr', 'qty',
       'isqtyestimated', 'altqtyunitcode', 'altqtyunitabbr', 'altqty',
       'isaltqtyestimated', 'netwgt', 'isnetwgtestimated', 'grosswgt',
       'isgrosswgtestimated', 'cifvalue', 'fobvalue', 'primaryvalue',
       'legacyestimationflag', 'isreported', 'isaggregate'],
      dtype='object')


### Uninformative columns

In [11]:
# check for columns with single unique values (constant columns)
constant_cols = [col for col in df.columns if df[col].nunique() <= 1]
print("Constant Columns:")
print(constant_cols)

Constant Columns:
['typecode', 'refyear', 'partnerdesc', 'partner2code', 'partner2iso', 'classificationcode', 'classificationsearchcode', 'isoriginalclassification', 'cmdcode', 'cmddesc', 'aggrlevel', 'isleaf', 'customscode', 'customsdesc', 'moscode', 'motcode', 'motdesc', 'qtyunitcode', 'qtyunitabbr', 'qty', 'isqtyestimated', 'altqtyunitcode', 'altqtyunitabbr', 'altqty', 'isaltqtyestimated', 'isnetwgtestimated', 'grosswgt', 'legacyestimationflag', 'isreported', 'isaggregate']


In [12]:
# explore if these really have no information
for col in constant_cols:
    print(f"{col}: {df[col].unique()}")

typecode: ['A']
refyear: [52]
partnerdesc: [0]
partner2code: ['W00']
partner2iso: ['World']
classificationcode: ['HS']
classificationsearchcode: [ True]
isoriginalclassification: ['TOTAL']
cmdcode: ['All Commodities']
cmddesc: [0]
aggrlevel: [False]
isleaf: ['C00']
customscode: ['TOTAL CPC']
customsdesc: [0]
moscode: [0]
motcode: ['TOTAL MOT']
motdesc: [-1]
qtyunitcode: [nan]
qtyunitabbr: [0]
qty: [False]
isqtyestimated: [-1]
altqtyunitcode: [nan]
altqtyunitabbr: [0]
altqty: [False]
isaltqtyestimated: [ 0. nan]
isnetwgtestimated: [0]
grosswgt: [False]
legacyestimationflag: [False]
isreported: [ True]
isaggregate: [nan]


In [13]:
# drop constant columns
print("Original DataFrame shape:", df.shape)
df = df.drop(columns=constant_cols)
print("New DataFrame shape:", df.shape)

Original DataFrame shape: (100000, 47)
New DataFrame shape: (100000, 17)


In [14]:
# check columns with very low variance
numeric_cols = df.select_dtypes(include=[np.number]).columns
low_variance_cols = []
for col in numeric_cols:
    if df[col].std() < 0.01:  
        low_variance_cols.append(col)
print("Low Variance Columns:")
print(low_variance_cols)

Low Variance Columns:
[]


In [15]:
# columns with high cardinality
high_cardinality = df.nunique().sort_values(ascending=False)
print("Cardinality assorted columns:")
high_cardinality = high_cardinality[high_cardinality > 10]  # filter for columns with more than 10 unique values
print(high_cardinality)

Cardinality assorted columns:
fobvalue               98922
cifvalue               54243
isgrosswgtestimated    53227
flowdesc                 246
partneriso               246
partnercode              245
period                   172
reportercode             172
reporteriso              172
dtype: int64


In [16]:
# looks okay
df[high_cardinality.index]

Unnamed: 0,fobvalue,cifvalue,isgrosswgtestimated,flowdesc,partneriso,partnercode,period,reportercode,reporteriso
C,8568013876.87,0.00,8568013876.87,0,World,W00,4,AFG,Afghanistan
C,614220.56,0.00,614220.56,16,American Samoa,ASM,4,AFG,Afghanistan
C,122809.39,0.00,122809.39,20,Andorra,AND,4,AFG,Afghanistan
C,48473684.35,0.00,48473684.35,31,Azerbaijan,AZE,4,AFG,Afghanistan
C,257396.17,0.00,257396.17,32,Argentina,ARG,4,AFG,Afghanistan
...,...,...,...,...,...,...,...,...,...
C,30077495.20,30077495.20,0.00,854,Burkina Faso,BFA,752,SWE,Sweden
C,48279492.01,48279492.01,0.00,858,Uruguay,URY,752,SWE,Sweden
C,33850224.55,33850224.55,0.00,860,Uzbekistan,UZB,752,SWE,Sweden
C,1604180.85,1604180.85,0.00,862,Venezuela,VEN,752,SWE,Sweden


### Missing values

In [17]:
# get missing percentage
missing_percentage = (df.isnull().sum() / df.shape[0]) * 100
print("Missing Values Percentage:")
print(missing_percentage[missing_percentage > 0])

Missing Values Percentage:
isgrosswgtestimated   24.84
cifvalue              23.88
dtype: float64


In [18]:
df[missing_percentage[missing_percentage > 0].index].head()

Unnamed: 0,isgrosswgtestimated,cifvalue
C,8568013876.87,0.0
C,614220.56,0.0
C,122809.39,0.0
C,48473684.35,0.0
C,257396.17,0.0


In [19]:
# 54% missing in cif value
df[df['cifvalue'] > 0].shape[0]/ df.shape[0] * 100

54.659

### Identify text or categorical columns

In [20]:
# idetnify text or categorical columns
text_cols = df.select_dtypes(include=['object']).columns
print("Text or Categorical Columns:")
print(text_cols)

Text or Categorical Columns:
Index(['reportercode', 'reporteriso', 'reporterdesc', 'flowcode',
       'partnercode', 'partneriso', 'partner2desc'],
      dtype='object')


In [21]:
# these are just one word or very short text. Not really good for NLP but still good for now.
df[text_cols].head()

Unnamed: 0,reportercode,reporteriso,reporterdesc,flowcode,partnercode,partneriso,partner2desc
C,AFG,Afghanistan,M,Import,W00,World,H4
C,AFG,Afghanistan,M,Import,ASM,American Samoa,H4
C,AFG,Afghanistan,M,Import,AND,Andorra,H4
C,AFG,Afghanistan,M,Import,AZE,Azerbaijan,H4
C,AFG,Afghanistan,M,Import,ARG,Argentina,H4


### Save data

In [22]:
# save to ../data/processed
processed_data_path = '../data/processed'

df.to_csv(os.path.join(processed_data_path, 'trade_data_cleaned.csv'), index=False)

In [23]:
# create a sample data saved in processed_data_path
sample_data = df.sample(n=100, random_state=42)
sample_data.to_csv(os.path.join(processed_data_path, 'trade_data_sample.csv'), index=False)

In [25]:
df = df.drop(
    columns=['freqcode','refmonth','period','reportercode','reporterdesc','partnercode','cifvalue',
             'partner2desc','netwgt','primaryvalue']
    ).rename(columns={
        'reporteriso': 'reporter',
        'partneriso': 'partner',
        'refperiodid': 'year',
        'flowcode': "import_export"
        }
)

In [27]:
def clean_df(df):
    return df.drop(
    columns=['freqcode','refmonth','period','reportercode','reporterdesc','partnercode','cifvalue',
             'partner2desc','netwgt','primaryvalue']
    ).rename(columns={
        'reporteriso': 'reporter',
        'partneriso': 'partner',
        'refperiodid': 'year',
        'flowcode': "import_export"
        }
    )

### Create data dictionary

In [28]:
# prepare data profile

def generate_column_profiles(df):
    profiles = {}
    for col in df.columns:
        profiles[col] = {
            "display_name": col.replace('_', ' ').title(),
            "description": f"Business purpose and meaning of {col}",
            "data_type": str(df[col].dtype),
            "example_values": df[col].dropna().unique().tolist()[:5],
            "missing_percentage": (df[col].isnull().sum() / df.shape[0]) * 100,
            "unique_values": df[col].nunique(),
            "sample_values": df[col].dropna().unique().tolist()[:5],
            "search_relevance": "high" if df[col].nunique() > 10 else "low",
            "rag_importance": "Essential for product queries" if col in ['trade_value_usd', 'cifvalue'] else "Useful for context",
            "relationships": []  # Placeholder for relationships
        }
    return profiles

In [29]:
def create_llm_prompt(column_name, column_profile, dataset_context):
    prompt = f"""
    You are an expert data analyst. Your task is to generate a detailed description for the column '{column_name}' in the dataset.
    
    Column Profile:
    - Display Name: {column_profile['display_name']}
    - Description: {column_profile['description']}
    - Data Type: {column_profile['data_type']}
    - Example Values: {', '.join(map(str, column_profile['example_values']))}
    - Missing Percentage: {column_profile['missing_percentage']}%
    - Unique Values: {column_profile['unique_values']}
    - Sample Values: {', '.join(map(str, column_profile['sample_values']))}
    - Search Relevance: {column_profile['search_relevance']}
    - RAG Importance: {column_profile['rag_importance']}
    
    Please provide:
    1. Business-friendly display name
    2. Clear description of what this column represents
    3. Business purpose and how it's used in trade context
    4. Importance level for search/query applications (High/Medium/Low)
    5. Relationship to other trade data fields

    Format as JSON:
    {{
        "display_name": "Human readable name",
        "description": "What this column represents",
        "business_purpose": "How this is used in trade/logistics",
        "search_importance": "High/Medium/Low",
        "notes": "Additional context or relationships"
    }}
    """
    return prompt

In [30]:
import json
def process_columns_for_dictionary(df, llm_function, batch_size=5):
    
    # generate profile for all columns
    column_profiles = generate_column_profiles(df)

    # process in batches
    columns = list(df.columns)
    data_dictionary = {}
    for i in range(0, len(columns), batch_size):
        batch_cols = columns[i:i+batch_size]
        print(f"Processing columns {i+1} to {i+len(batch_cols)}: {batch_cols}")

        for col in batch_cols:
            try:
                prompt = create_llm_prompt(col, column_profiles[col], "trade data context")

                llm_response = llm_function(prompt)

                # parse response
                column_dict = json.loads(llm_response)
                data_dictionary[col] = column_dict
                print(f"Processed column '{col}': {column_dict}")
            except Exception as e:
                print(f"Error processing column '{col}': {e}")
                data_dictionary[col] = {
                    "display_name": column_profiles[col]['display_name'],
                    "description": "Error processing this column",
                    "business_purpose": "",
                    "search_importance": "Unknown",
                    "notes": str(e)
                }
    return data_dictionary

In [31]:
from openai import OpenAI

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def call_openai_llm(prompt):
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=500,
            temperature=0.5
        )
        # Fix: Use .content instead of ['content']
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error calling OpenAI API: {e}")
        return f"Error: {str(e)}"
 
test_response = call_openai_llm("What is a trade value? Answer in one sentence.")
print("Test response:", test_response)

Test response: A trade value is the estimated worth or price of a particular item or asset in a trade transaction.


In [32]:
def create_data_dictionary(df, batch_size=5):

    print("Generating data dictionary...")

    column_profiles = generate_column_profiles(df)

    columns = list(df.columns)
    data_dictionary = {}

    print("Processing columns in batches...")
    for i in range(0, len(columns), batch_size):
        batch_cols = columns[i:i+batch_size]
        print(f"Processing columns {i+1} to {i+len(batch_cols)}: {batch_cols}")

        for col in batch_cols:
            try:
                prompt = create_llm_prompt(col, column_profiles[col], "trade data context")

                # call openAI
                print(f"Calling OpenAI for column '{col}'...")
                llm_response = call_openai_llm(prompt)

                # try to parse the response
                try:
                    column_dict = json.loads(llm_response)
                    data_dictionary[col] = column_dict
                    print(f"Processed column '{col}': {column_dict}")
                except json.JSONDecodeError as e:
                    print(f"Error parsing JSON for column '{col}': {e}")
                    data_dictionary[col] = {
                        "display_name": column_profiles[col]['display_name'],
                        "description": "Error parsing response",
                        "business_purpose": "",
                        "search_importance": "Unknown",
                        "notes": str(e)
                    }
            except Exception as e:
                print(f"Error processing column '{col}': {e}")
                data_dictionary[col] = {
                    "display_name": column_profiles[col]['display_name'],
                    "description": "Error processing this column",
                    "business_purpose": "",
                    "search_importance": "Unknown",
                    "notes": str(e)
                }

    print("Data dictionary generation complete.")
    return data_dictionary

In [33]:
data_dictionary = create_data_dictionary(df, batch_size= 2)

Generating data dictionary...
Processing columns in batches...
Processing columns 1 to 2: ['year', 'reporter']
Calling OpenAI for column 'year'...
Processed column 'year': {'display_name': 'Year', 'description': 'The calendar year in which a trade transaction or event occurred', 'business_purpose': 'The year column provides a chronological reference point for analyzing trends, seasonality, and performance of trade activities over time. It is used to track annual performance, plan for future years, and compare year-over-year growth.', 'search_importance': 'Low', 'notes': 'The year column can be related to other time-related fields such as month, quarter, or date to provide more granular insights into trade data.'}
Calling OpenAI for column 'reporter'...
Processed column 'reporter': {'display_name': 'Reporter', 'description': 'The country or entity that reports the trade data', 'business_purpose': "The 'reporter' column is crucial in trade and logistics as it indicates the country or ent

In [38]:
pd.DataFrame(data_dictionary)

Unnamed: 0,year,reporter,import_export,flowdesc,partner,isgrosswgtestimated,fobvalue
display_name,Year,Reporter,Import Export Status,Flow Description,Partner,Is Gross Weight Estimated,FOB Value
description,The calendar year in which a trade transaction...,The country or entity that reports the trade data,This column indicates whether the transaction ...,The numerical code representing the type of fl...,The country or entity that is a trading partne...,This column indicates whether the gross weight...,The FOB value represents the total value of go...
business_purpose,The year column provides a chronological refer...,The 'reporter' column is crucial in trade and ...,"In trade and logistics, this field helps in tr...",Flowdesc is used to categorize the type of tra...,The 'partner' column identifies the specific c...,"In trade and logistics, knowing whether the gr...",The FOB value is crucial in trade transactions...
search_importance,Low,High,Low,High,High,High,High
notes,The year column can be related to other time-r...,The 'reporter' column may be related to other ...,This field can be related to other trade data ...,Flowdesc may be related to other trade data fi...,The 'partner' column may be related to other t...,This field can be related to other trade data ...,The FOB value is often used in conjunction wit...


In [39]:
# save as csv
pd.DataFrame.from_dict(data_dictionary, orient='index').to_csv("data_dictionary.csv")

### Get final columns

In [40]:
# final columns
final_columns = df.columns.tolist()

### Combine data

In [41]:
raw_data_path = '../data/raw'

# get all files in raw_data_path
raw_files = [f for f in os.listdir(raw_data_path) if os.path.isfile(os.path.join(raw_data_path, f))]
raw_files

['TradeData_8_2_2025_13_44_38.csv',
 'TradeData_8_2_2025_15_26_31.csv',
 'TradeData_8_2_2025_15_21_12.csv',
 'TradeData_8_2_2025_15_27_27.csv']

In [42]:
df_combined = pd.DataFrame()
for file in raw_files:
    file_path = os.path.join(raw_data_path, file)
    print(f"Processing file: {file_path}")
    temp_df = pd.read_csv(file_path, encoding="latin-1", engine="python")
    temp_df.columns = temp_df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('-', '_').str.replace(r"[ \-\/]+", "_", regex=True)     .str.replace(r"[^0-9a-z_]", "", regex=True)
    temp_df = clean_df(temp_df)  # apply cleaning function
    temp_df = temp_df[final_columns]  # ensure only final columns are kept
    print(f"Columns after processing: {temp_df.columns.tolist()}")
    df_combined = pd.concat([df_combined, temp_df], ignore_index=True)

Processing file: ../data/raw/TradeData_8_2_2025_13_44_38.csv
Columns after processing: ['year', 'reporter', 'import_export', 'flowdesc', 'partner', 'isgrosswgtestimated', 'fobvalue']
Processing file: ../data/raw/TradeData_8_2_2025_15_26_31.csv
Columns after processing: ['year', 'reporter', 'import_export', 'flowdesc', 'partner', 'isgrosswgtestimated', 'fobvalue']
Processing file: ../data/raw/TradeData_8_2_2025_15_21_12.csv
Columns after processing: ['year', 'reporter', 'import_export', 'flowdesc', 'partner', 'isgrosswgtestimated', 'fobvalue']
Processing file: ../data/raw/TradeData_8_2_2025_15_27_27.csv
Columns after processing: ['year', 'reporter', 'import_export', 'flowdesc', 'partner', 'isgrosswgtestimated', 'fobvalue']


In [43]:
processed_data_path = '../data/processed'
df_combined.sample(1000).to_csv(os.path.join(processed_data_path, 'trade_data_combined_sample.csv'), index=False)

In [44]:
# save to ../data/processed

df_combined.to_csv(os.path.join(processed_data_path, 'trade_data_combined.csv'), index=False)

In [45]:
df_combined['year'].value_counts()

year
2017    53985
2019    53534
2021    53153
2023    50312
2022    46847
2020    46466
2018    46015
2024    33214
Name: count, dtype: int64

### Save to parquet

In [46]:
df_combined.to_parquet('../data/processed/trade_data_cleaned.parquet')