In [2]:
# 0) install once (if needed)
# Install SQLAlchemy (open-source SQL toolkit and Object-Relational Mapping (ORM) library for Python)
%pip install sqlalchemy
# Install PostgreSQL driver 
%pip install psycopg2
# Install add-on package for SQLAlchemy
%pip install sqlalchemy_utils

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
import requests
import re  # for regex
import os
from dotenv import load_dotenv
from datetime import datetime
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as mtick
import time
import os
import sqlalchemy as db
from sqlalchemy_utils import create_database
from sqlalchemy import text
import pandas as pd
from sqlalchemy import create_engine, Column, Integer, String, DateTime, Boolean, ForeignKey
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session

load_dotenv()
api_key = os.getenv("API_KEY")
datasetid = "d_3f960c10fed6145404ca7b821f263b87"

In [6]:
def fetch_dataset(datasetid, api_key, limit=10000):
    url = f"https://api-production.data.gov.sg/v2/public/api/datasets/{datasetid}/list-rows"
    headers = {"x-api-key": api_key}
    
    all_rows = []
    offset = 0
    
    while True:
        params = {"limit": limit, "offset": offset}
        response = requests.get(url, headers=headers, params=params)
        
        if response.status_code != 200:
            print(f"Error: {response.status_code}, {response.text}")
            break
        
        json_data = response.json()
        rows = json_data["data"]["rows"]
        
        if not rows:  # stop if no more data
            break
        
        # Extract inner row dicts
        records = [r.get("row", r) for r in rows]
        all_rows.extend(records)
        
        print(f"Fetched {len(records)} rows (offset={offset})")
        
        # Prepare for next batch
        offset += limit
    
    return pd.DataFrame(all_rows)


df = fetch_dataset(datasetid, api_key)
print("Total rows fetched:", len(df))
print("Columns:", df.columns.tolist())
print(df.head())

Fetched 10000 rows (offset=0)
Fetched 10000 rows (offset=10000)
Fetched 10000 rows (offset=20000)
Fetched 10000 rows (offset=30000)
Fetched 10000 rows (offset=40000)
Fetched 10000 rows (offset=50000)
Fetched 10000 rows (offset=60000)
Fetched 10000 rows (offset=70000)
Fetched 10000 rows (offset=80000)
Fetched 10000 rows (offset=90000)
Fetched 10000 rows (offset=100000)
Fetched 10000 rows (offset=110000)
Fetched 10000 rows (offset=120000)
Fetched 10000 rows (offset=130000)
Fetched 10000 rows (offset=140000)
Fetched 10000 rows (offset=150000)
Fetched 10000 rows (offset=160000)
Fetched 10000 rows (offset=170000)
Fetched 10000 rows (offset=180000)
Fetched 10000 rows (offset=190000)
Fetched 10000 rows (offset=200000)
Fetched 10000 rows (offset=210000)
Fetched 10000 rows (offset=220000)
Fetched 10000 rows (offset=230000)
Fetched 10000 rows (offset=240000)
Fetched 10000 rows (offset=250000)
Fetched 10000 rows (offset=260000)
Fetched 10000 rows (offset=270000)
Fetched 10000 rows (offset=280000)

In [7]:
df.to_csv(os.path.join(os.getcwd(), "acraEntities.csv"), index=False) 

In [16]:
df = pd.read_csv(os.path.join(os.getcwd(), "acraEntities.csv"))
df.columns.tolist()

['vault_id',
 'uen',
 'issuance_agency_desc',
 'uen_status_desc',
 'entity_name',
 'entity_type_desc',
 'uen_issue_date',
 'reg_street_name',
 'reg_postal_code']

In [17]:
df.isna().sum()

vault_id                0
uen                     0
issuance_agency_desc    0
uen_status_desc         0
entity_name             1
entity_type_desc        0
uen_issue_date          0
reg_street_name         0
reg_postal_code         0
dtype: int64

In [18]:
print(df['uen'].nunique())
print(df['entity_name'].nunique())

1674891
1655655


In [19]:
# 1) Remove leading/trailing spaces make everything uppercase for consistency
df['entity_name'] = df['entity_name'].str.strip()
# 2) Collapse double/triple spaces 

# Identify rows with double spaces in entity_name column values
mask = df['entity_name'].str.contains(r'\s{2,}', na=False) #2 or more double spaces, na=False ignore NaN values if any
df[mask].head(10)
print("Rows with double spaces:", mask.sum()) #if 0, means it's clean. otherwise, it means we have supplier names with double spaces

# show distinct supplier names that have double spaces 
df.loc[mask, 'entity_name'].unique() 

# ensure supplier names are normalized. so no duplicates due to space issues
df['entity_name'] = df['entity_name'].str.replace(r'\s+', ' ', regex=True) 
print("Rows with double spaces:", mask.sum()) #if 0, means it's clean. 

# 3) Standardize PTE LTD variants. 
# Note: Not ideal to strip PTE LTD as we would risk collisions. E.g. ABC PTE LTD (company) vs ABC LLP (partnership) run by diff owners

# Normalizing common suffixes using regex (pattern matching)
df['entity_name'] = df['entity_name'].str.replace(r'PTE\.?', 'PTE', regex=True)
df['entity_name'] = df['entity_name'].str.replace(r'LTD\.?', 'LTD', regex=True)

# Handling rare 'PTE LIMITED' 
df['entity_name'] = df['entity_name'].str.replace(r'PTE LIMITED', 'PTE LTD', regex=True)

# Remove trailing periods. Some suppliers end with a dot
df['entity_name'] = df['entity_name'].str.replace(r'\.\s*$', '', regex=True)

# Add upper  case for consistency in SQL group buys
df['entity_name'] = df['entity_name'].str.upper()

# If award_amt = 0, entity_name = "Unknown", set entity_name to NULL in PostgreSQL

print("Sample suppliers:\n", df['entity_name'].drop_duplicates().sample(10)) #to check if cleaning works as expected

Rows with double spaces: 4646
Rows with double spaces: 4646
Sample suppliers:
 380783                            SANDSTONE CAPITAL PTE LTD
782757                                  KALLANG0808 PTE LTD
1523706                       MULTI FOOD INDUSTRIES PTE LTD
1131541                                        KIA MUI & CO
1482656                                   TEAM TREE PTE LTD
515488                SEAHORSE INTERNATIONAL AGENCY PTE LTD
448258                                MY LITTLE SWEET TOOTH
842964                            PRYMO CONSULTANCY PTE LTD
329417                                               STYLEQ
1573058    COLOMBO RESTAURANT AUTHENTIC SHRI LANKAN CUISINE
Name: entity_name, dtype: object


In [20]:
def clean_acra_data(df: pd.DataFrame):
    # --- Drop internal columns ---
    df = df.drop(columns=['_id', '__dataset_id'], errors='ignore')

    # --- Normalize blanks to NaN ---
    df = df.replace(r'^\s*$', pd.NA, regex=True)

    # --- Convert dates to datetime with explicit format ---
    date_cols = [
        'uen_issue_date'
    ]
    for col in date_cols:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], format="%Y-%m-%d", errors='coerce')

    # --- Normalize UEN ---
    if 'uen' in df.columns:
        df['uen'] = df['uen'].str.strip().str.upper()

    # --- Clean entity_name ---
    if 'entity_name' in df.columns:
        def normalize_entity_name(name: str) -> str:
            if pd.isna(name):
                return name
            name = name.strip()

            # collapse multiple spaces
            name = re.sub(r'\s+', ' ', name)

            # upper case for standardization
            name = name.upper()

            # normalize common suffixes: remove dots in LTD, PTE. -> PTE
            name = re.sub(r'\bPTE\.?\b', 'PTE', name)
            name = re.sub(r'\bLTD\.?\b', 'LTD', name)

            # remove trailing periods
            name = re.sub(r'\.+$', '', name)

            return name

        df['entity_name'] = df['entity_name'].apply(normalize_entity_name)

    # --- Numeric cleaning ---
    if 'no_of_officers' in df.columns:
        df['no_of_officers'] = pd.to_numeric(df['no_of_officers'], errors='coerce')

    # --- Identify suspicious duplicates ---
    df_suspect_dupes = pd.DataFrame()
    if 'uen' in df.columns and 'entity_name' in df.columns:
        dupes = df[df.duplicated(subset=['uen'], keep=False)]
        if not dupes.empty:
            # group by UEN and keep only groups with more than 1 unique entity_name
            suspect_groups = dupes.groupby('uen').filter(lambda g: g['entity_name'].nunique() > 1)
            df_suspect_dupes = suspect_groups.sort_values('uen')

    # --- Deduplicate based on (uen, entity_name, entity_status_description) ---
    dedupe_cols = ['uen', 'entity_name', 'entity_status_description']
    dedupe_cols = [c for c in dedupe_cols if c in df.columns]
    if dedupe_cols:
        df_cleaned = df.drop_duplicates(subset=dedupe_cols, keep='first')
    else:
        df_cleaned = df.copy()

    return df_cleaned, df_suspect_dupes

df_cleaned, df_suspect_dupes = clean_acra_data(df)

print("Cleaned shape:", df_cleaned.shape)
print("Suspect duplicates shape:", df_suspect_dupes.shape)

# Example: review suspect duplicates
df_suspect_dupes.head(20)

# Check if NaN/NaT remain
nan_columns = df_cleaned.columns[df_cleaned.isna().any()].tolist()
print("Columns still containing NaN/NaT:", nan_columns)
print(df_cleaned[nan_columns].isna().sum())

# Show a few example rows
example_rows = df_cleaned[df_cleaned[nan_columns].isna().any(axis=1)].head(10)
print(example_rows)

Cleaned shape: (1674891, 9)
Suspect duplicates shape: (0, 0)
Columns still containing NaN/NaT: ['entity_name']
entity_name    1
dtype: int64
         vault_id        uen issuance_agency_desc uen_status_desc entity_name  \
1277918   1277919  53347159A                 ACRA    Deregistered         NaN   

                         entity_type_desc uen_issue_date  \
1277918  Sole Proprietorship/ Partnership     2016-09-30   

              reg_street_name reg_postal_code  
1277918  BEDOK SOUTH AVENUE 1          460003  


In [None]:
from sqlalchemy import Column, String, Date, Integer, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

class Entity(Base):
    __tablename__ = "entities"
    
    vault_id = Column(Integer, primary_key=True)
    uen = Column(String, unique=True, nullable=False)
    issuance_agency_desc = Column(String)
    uen_status_desc = Column(String)
    entity_name = Column(String)
    entity_type_desc = Column(String)
    uen_issue_date = Column(Date)
    reg_street_name = Column(String)
    reg_postal_code = Column(String)

# connect
engine = db.create_engine('postgresql+psycopg2://postgres:password@localhost:5432/procurement')

# create table
Base.metadata.create_all(engine)

# insert from dataframe
Session = sessionmaker(bind=engine)
session = Session()

records = df.to_dict(orient="records")
session.bulk_insert_mappings(Entity, records)
session.commit()