<a href="https://colab.research.google.com/github/hollimey/542/blob/main/i4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
## information structure: xls - local access
!pip install pandas

import requests
import pandas as pd

# download file from URL
url = "https://www.usitc.gov/sites/default/files/tata/hts/hts_2025_revision_10_xls.xlsx"
# a timeout keeps the cell from hanging forever if the server stops responding
response = requests.get(url, timeout=60)
# fail here on a 4xx/5xx status so an error page is never saved as the workbook
response.raise_for_status()

# save the downloaded content to a local file
with open("hts_2025.xlsx", "wb") as f:
    f.write(response.content)

# read file from local file system into a pandas data frame
df = pd.read_excel("hts_2025.xlsx")

# print sample of the data
print("\n=== First 5 Rows ===")
print(df.head())


## pros of local file access:
# 1. it keeps a physical copy of the file on disk, which makes it easy to inspect manually when debugging.
# 2. the file remains available for future use even after the script ends, making it useful for documentation, backups, etc.
# 3. it is compatible with a lot of libraries since it utilizes a simple file path.

## cons of local file access:
# 1. disk reads and writes add latency, especially for larger files, compared with working entirely in memory.
# 2. since the file is not automatically deleted it can cause clutter and requires manual deletion.
# 3. saving external files locally can increase the risk of security issues.



  warn("Workbook contains no default style, apply openpyxl's default")



=== First 5 Rows ===
      HTS Number  Indent                             Description  \
0           0101       0  Live horses, asses, mules and hinnies:   
1            NaN       1                                 Horses:   
2     0101.21.00       2               Purebred breeding animals   
3  0101.21.00.10       3                                   Males   
4  0101.21.00.20       3                                 Females   

  Unit of Quantity General Rate of Duty Special Rate of Duty  \
0              NaN                  NaN                  NaN   
1              NaN                  NaN                  NaN   
2              NaN                 Free                  NaN   
3          ["No."]                  NaN                  NaN   
4          ["No."]                  NaN                  NaN   

  Column 2 Rate of Duty  Quota Quantity Additional Duties  
0                   NaN             NaN               NaN  
1                   NaN             NaN               NaN  
2   

In [None]:
# information structure: csv - HTTP based API access

import requests
import pandas as pd
from io import BytesIO

def download_cdc_smoking_data():
    """Download the CDC smoking CSV, preview it, and save a local copy.

    Fetches the CSV over HTTPS, loads it into a pandas DataFrame directly
    from the in-memory response bytes, prints the first five rows, and
    writes the frame to "cdc_smoking_data.csv" in the working directory.

    Returns:
        None. Any failure (network, HTTP status, parsing, file write) is
        caught and reported as a printed "Error: ..." line rather than raised.
    """
    # direct download link extracted from the webpage
    CSV_URL = "https://data.cdc.gov/api/views/ezab-8sq5/rows.csv?accessType=DOWNLOAD"

    try:
        # timeout so a stalled connection becomes a reported error instead of hanging the cell
        response = requests.get(CSV_URL, timeout=60)
        response.raise_for_status()  # check for errors

        # load CSV directly into pandas
        df = pd.read_csv(BytesIO(response.content))

        # print sample of the data
        print("\n=== First 5 Rows ===")
        print(df.head())

        # save to local CSV file
        df.to_csv("cdc_smoking_data.csv", index=False)

    except Exception as e:
        # deliberate best-effort: report the problem and let the notebook continue
        print(f"Error: {e}")

# run the download when this cell/script executes as the main module
if __name__ == "__main__":
    download_cdc_smoking_data()


## pros of HTTP based API access
# 1. allows access to up-to-date, or even real-time, data.
# 2. no local storage is required as data can be processed directly from the API.
# 3. data transferred over HTTPS is more secure.

## cons of HTTP based API access
# 1. retrieving data depends on the network connection, which can be delayed or fail if the internet is not stable or reliable enough.
# 2. API may have higher latency due to network round trips, which could be an issue in time-sensitive applications.
# 3. APIs can be unreliable if issues arise with the provider.


=== First 5 Rows ===
   Year LocationAbbr   LocationDesc DataSource                      TopicType  \
0  2009           RI   Rhode Island     SAMMEC  Health Consequences and Costs   
1  2007           SD   South Dakota     SAMMEC  Health Consequences and Costs   
2  2009           NH  New Hampshire     SAMMEC  Health Consequences and Costs   
3  2006           MD       Maryland     SAMMEC  Health Consequences and Costs   
4  2009           LA      Louisiana     SAMMEC  Health Consequences and Costs   

                                  TopicDesc      MeasureDesc  \
0  Smoking-Attributable Expenditures (SAEs)  Type of Expense   
1  Smoking-Attributable Expenditures (SAEs)  Type of Expense   
2  Smoking-Attributable Expenditures (SAEs)  Type of Expense   
3  Smoking-Attributable Expenditures (SAEs)  Type of Expense   
4  Smoking-Attributable Expenditures (SAEs)  Type of Expense   

             Variable Data_Value_Unit      Data_Value_Type  Data_Value  \
0               Other           

In [10]:
# information structure: parquet - AWS S3 access

!pip install awscli
!pip install boto3 pandas pyarrow

!aws s3 ls s3://aodn-cloud-optimised/animal_ctd_satellite_relay_tagging_delayed_qc.parquet/ --no-sign-request

import boto3
import pandas as pd
from botocore import UNSIGNED
from botocore.config import Config

# set up client (UNSIGNED: requests are sent without AWS credentials)
s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))

bucket = 'aodn-cloud-optimised'
prefix = 'animal_ctd_satellite_relay_tagging_delayed_qc.parquet/'

# list files in the specified S3 directory, one page at a time:
# a single list_objects_v2 call returns at most 1000 keys, so keep paging
# until a page actually contains a .parquet key instead of giving up early
parquet_files = []
paginator = s3.get_paginator('list_objects_v2')
for response in paginator.paginate(Bucket=bucket, Prefix=prefix):
    parquet_files = [obj['Key'] for obj in response.get('Contents', []) if obj['Key'].endswith('.parquet')]
    if parquet_files:
        # stop at the first page with a match; only one sample file is needed
        break

if not parquet_files:
    raise Exception("No .parquet files found in the S3 bucket.")

sample_file_key = parquet_files[0]  # Pick the first one

# Download the file locally
s3.download_file(bucket, sample_file_key, 'sample.parquet')

# Read a sample from the Parquet file
df = pd.read_parquet('sample.parquet')
print(df.head())


## pros of AWS S3 access
# 1. it offers scalability since there is no limit on total storage space.
# 2. S3 is highly available, with rarely any real significant downtime.
# 3. offers easy integration as it's supported by a lot of different tools.

## cons of AWS S3 access
# 1. individual files are limited to 5TB in size.
# 2. the program is more complex to write in comparison to the other access methods.

                           PRE timestamp=1072915200/
                           PRE timestamp=1104537600/
                           PRE timestamp=1136073600/
                           PRE timestamp=1167609600/
                           PRE timestamp=1199145600/
                           PRE timestamp=1230768000/
                           PRE timestamp=1262304000/
                           PRE timestamp=1293840000/
                           PRE timestamp=1325376000/
                           PRE timestamp=1356998400/
                           PRE timestamp=1388534400/
                           PRE timestamp=1420070400/
                           PRE timestamp=1451606400/
                           PRE timestamp=1483228800/
                           PRE timestamp=1514764800/
                           PRE timestamp=1546300800/
                           PRE timestamp=1577836800/
                           PRE timestamp=1609459200/
                           PRE timestamp=16409