In [None]:
## AIIB InfraTech Portal – Data Collection Pipeline

This notebook builds a Python data pipeline to:
- Ingest company-level data from the AIIB InfraTech public API
- Handle pagination and request throttling
- Normalise nested JSON into tabular form
- Apply text sanitisation for Excel compatibility
- Export clean datasets for analysis

Note: The company list endpoint does not expose thematic tags; 
full classification would require enrichment via a detail-level API.

In [2]:
import requests
import pandas as pd

In [1]:
import sys
print(sys.executable)


/Library/Frameworks/Python.framework/Versions/3.13/bin/python3.13


In [3]:
!pip3 install requests pandas openpyxl



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m


In [4]:
import requests
import pandas as pd


In [5]:
import requests
import pandas as pd
import time

URL = "https://www.infratechportal.org/itp/providerInfo/companyList"

# Base payload from your DevTools
BASE_PAYLOAD = {
    "name": "",
    "orderField": "timeDesc",
    "pageNo": 1,
    "pageSize": 10,       # will override to 100 per page
    "status": "",
    "type": "",
    "subSectorList": [],
    "subThemeList": [],
    "tagList": [],
    "techTags": [],
    "isSearch": False,
    "keyWord": "",
}


In [6]:
import requests
import pandas as pd
import time

URL = "https://www.infratechportal.org/itp/providerInfo/companyList"

# Base payload from your DevTools
BASE_PAYLOAD = {
    "name": "",
    "orderField": "timeDesc",
    "pageNo": 1,
    "pageSize": 10,       # will override to 100 per page
    "status": "",
    "type": "",
    "subSectorList": [],
    "subThemeList": [],
    "tagList": [],
    "techTags": [],
    "isSearch": False,
    "keyWord": "",
    "queryType": "1"
}

HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Content-Type": "application/json"
}

all_data = []
page = 1
page_size = 100  # Faster: 100 per page


while True:
    print(f"Fetching page {page}...")

    payload = BASE_PAYLOAD.copy()
    payload["pageNo"] = page
    payload["pageSize"] = page_size

    r = requests.post(URL, json=payload, headers=HEADERS)
    r.raise_for_status()

    j = r.json()
    items = j.get("data", [])

    if not items:
        print("No more data returned.")
        break

    all_data.extend(items)

    total = j.get("total", None)
    if total and len(all_data) >= total:
        print("Reached total count.")
        break

    page += 1
    time.sleep(0.2)   # avoid hitting server too fast


# Convert to DataFrame
df = pd.json_normalize(all_data)
print(f"Downloaded {len(df)} companies.")

import re

ILLEGAL = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F]")

def clean_excel_text(x):
    if isinstance(x, str):
        return ILLEGAL.sub(" ", x).replace("\u00a0", " ").strip()
    return x

# Clean for Excel
df_clean = df.map(clean_excel_text)

# Save cleaned data
df_clean.to_excel("infratech_companies.xlsx", index=False)
df_clean.to_csv("infratech_companies.csv", index=False)

df_clean.head()


Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Fetching page 6...
Fetching page 7...
Fetching page 8...
Fetching page 9...
Fetching page 10...
Fetching page 11...
Fetching page 12...
Reached total count.
Downloaded 1184 companies.


Unnamed: 0,providerNo,id,name,website,type,status,companyLogo,companyLogoName,companyLogoSize,techTags,sector,subSector,crossSectorThemes,crossSectorThemeFs,solutionComponents,caseInfoList,email,brefDesc,isChg
0,1763375889811,1990368858593439746,Chakr Innovation,https://chakr.in/chakr-shield/,,SOLUTION_CASE_PROVIDER_STATUS_PUBLISHED,/itp/common/att/download?id=T0NOa1A4aHZsVHBueV...,,,[combustion machinery],[Energy],,,,"[{'id': '1990297851996286978', 'solutionName':...",[],,"In the rapidly evolving world of cleantech, Ch...",
1,1763349822569,1990259524658982913,Green Gravity,https://greengravity.com/about/,,SOLUTION_CASE_PROVIDER_STATUS_PUBLISHED,/itp/common/att/download?id=Q3lFQ1RrQ0JYcUM0Q1...,,,[],[Energy],,,,"[{'id': '1990259524495405057', 'solutionName':...",[],,Green Gravity is enabling the global renewable...,
2,1763115385899,1989276225993322498,Energy Vault,https://www.energyvault.com/products/g-vault-g...,,SOLUTION_CASE_PROVIDER_STATUS_PUBLISHED,/itp/common/att/download?id=Q1hrVERsRkd4N05kVk...,,,[],[Energy],,,,"[{'id': '1990259524495405057', 'solutionName':...","[{'id': '1989276225968156673', 'caseName': 'Th...",,"At Energy Vault®, we envision a planet where s...",
3,1736840311481,1879070576261640194,Gravitricity,https://gravitricity.com/,,SOLUTION_CASE_PROVIDER_STATUS_PUBLISHED,/itp/common/att/download?id=TThwd1RHaGtTTHpLbU...,,,[],[],,,,[],[],,As the world generates more electricity from r...,
4,1754560869306,1953395982833463298,BGI,https://www.bgi.com,,SOLUTION_CASE_PROVIDER_STATUS_PUBLISHED,/itp/common/att/download?id=aTF5RGFhblZlUDIrRF...,,,[advanced materials],[Other Productive Sectors],,,,"[{'id': '1953393906731401217', 'solutionName':...","[{'id': '1953397517163118593', 'caseName': 'De...",,"Founded in 1999, BGI is one of the world's lea...",


In [7]:
import re

# Remove all illegal Excel control characters
illegal_chars = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F]')

for col in df.columns:
    df[col] = df[col].astype(str).apply(lambda x: illegal_chars.sub("", x))


In [8]:
df.to_excel("infratech_companies.xlsx", index=False)
df.to_csv("infratech_companies.csv", index=False)


In [9]:
import requests
import pandas as pd
import time
import re

URL = "https://www.infratechportal.org/itp/providerInfo/companyList"

BASE_PAYLOAD = {
    "name": "",
    "orderField": "timeDesc",
    "pageNo": 1,
    "pageSize": 10,
    "status": "",
    "type": "",
    "subSectorList": [],
    "subThemeList": [],
    "tagList": [],
    "techTags": [],
    "isSearch": False,
    "keyWord": "",
    "queryType": "1"
}

HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Content-Type": "application/json"
}

all_data = []
page = 1
page_size = 100

while True:
    print(f"Fetching page {page}...")

    payload = BASE_PAYLOAD.copy()
    payload["pageNo"] = page
    payload["pageSize"] = page_size

    r = requests.post(URL, json=payload, headers=HEADERS)
    r.raise_for_status()

    j = r.json()
    items = j.get("data", [])

    if not items:
        print("No more data returned.")
        break

    all_data.extend(items)

    total = j.get("total", None)
    if total and len(all_data) >= total:
        print("Reached total count.")
        break

    page += 1
    time.sleep(0.2)

df = pd.json_normalize(all_data)
print(f"Downloaded {len(df)} companies.")

# CLEAN ILLEGAL CHARACTERS
illegal_chars = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F]')
for col in df.columns:
    df[col] = df[col].astype(str).apply(lambda x: illegal_chars.sub("", x))

# SAVE
df.to_excel("infratech_companies.xlsx", index=False)
df.to_csv("infratech_companies.csv", index=False)

df.head()


Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Fetching page 6...
Fetching page 7...
Fetching page 8...
Fetching page 9...
Fetching page 10...
Fetching page 11...
Fetching page 12...
Reached total count.
Downloaded 1184 companies.


Unnamed: 0,providerNo,id,name,website,type,status,companyLogo,companyLogoName,companyLogoSize,techTags,sector,subSector,crossSectorThemes,crossSectorThemeFs,solutionComponents,caseInfoList,email,brefDesc,isChg
0,1763375889811,1990368858593439746,Chakr Innovation,https://chakr.in/chakr-shield/,,SOLUTION_CASE_PROVIDER_STATUS_PUBLISHED,/itp/common/att/download?id=T0NOa1A4aHZsVHBueV...,,,['combustion machinery'],['Energy'],,,,"[{'id': '1990297851996286978', 'solutionName':...",[],,"In the rapidly evolving world of cleantech, Ch...",
1,1763349822569,1990259524658982913,Green Gravity,https://greengravity.com/about/,,SOLUTION_CASE_PROVIDER_STATUS_PUBLISHED,/itp/common/att/download?id=Q3lFQ1RrQ0JYcUM0Q1...,,,[],['Energy'],,,,"[{'id': '1990259524495405057', 'solutionName':...",[],,Green Gravity is enabling the global renewable...,
2,1763115385899,1989276225993322498,Energy Vault,https://www.energyvault.com/products/g-vault-g...,,SOLUTION_CASE_PROVIDER_STATUS_PUBLISHED,/itp/common/att/download?id=Q1hrVERsRkd4N05kVk...,,,[],['Energy'],,,,"[{'id': '1990259524495405057', 'solutionName':...","[{'id': '1989276225968156673', 'caseName': ""Th...",,"At Energy Vault®, we envision a planet where s...",
3,1736840311481,1879070576261640194,Gravitricity,https://gravitricity.com/,,SOLUTION_CASE_PROVIDER_STATUS_PUBLISHED,/itp/common/att/download?id=TThwd1RHaGtTTHpLbU...,,,[],[],,,,[],[],,As the world generates more electricity from r...,
4,1754560869306,1953395982833463298,BGI,https://www.bgi.com,,SOLUTION_CASE_PROVIDER_STATUS_PUBLISHED,/itp/common/att/download?id=aTF5RGFhblZlUDIrRF...,,,['advanced materials'],['Other Productive Sectors'],,,,"[{'id': '1953393906731401217', 'solutionName':...","[{'id': '1953397517163118593', 'caseName': 'De...",,"Founded in 1999, BGI is one of the world's lea...",


In [None]:
## AIIB InfraTech Portal – Data Collection Pipeline

This notebook builds a Python data pipeline to:
- Ingest company-level data from the AIIB InfraTech public API
- Handle pagination and request throttling
- Normalise nested JSON into tabular form
- Apply text sanitisation for Excel compatibility
- Export clean datasets for analysis

Note: The company list endpoint does not expose thematic tags; 
full classification would require enrichment via a detail-level API.


In [10]:
len(df)

1184

In [11]:
df.columns


Index(['providerNo', 'id', 'name', 'website', 'type', 'status', 'companyLogo',
       'companyLogoName', 'companyLogoSize', 'techTags', 'sector', 'subSector',
       'crossSectorThemes', 'crossSectorThemeFs', 'solutionComponents',
       'caseInfoList', 'email', 'brefDesc', 'isChg'],
      dtype='object')

In [12]:
df.columns.tolist()


['providerNo',
 'id',
 'name',
 'website',
 'type',
 'status',
 'companyLogo',
 'companyLogoName',
 'companyLogoSize',
 'techTags',
 'sector',
 'subSector',
 'crossSectorThemes',
 'crossSectorThemeFs',
 'solutionComponents',
 'caseInfoList',
 'email',
 'brefDesc',
 'isChg']