Utility notebook to validate that downloaded raw Comtrade folders are complete.

In [1]:
from downloader import ComtradeDownloader, API_KEYS

In [2]:
import glob
import os
import time

import comtradeapicall
import numpy as np
import pandas as pd

%load_ext jupyter_black

# Lift the column/row display limits and allow up to 400 characters per cell
# when frames are rendered in the notebook.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.options.display.max_colwidth = 400

In [3]:
# Inputs for the completeness check: for each year in the range, the check
# lists the reporters that have data available but no downloaded file.

# ENTER IN DATA INPUTS
classification_code = "S2"
year_start, year_end = 2021, 2023

# Raw files are stored under <output_dir>/raw/<classification_code>.
output_dir = "/n/hausmann_lab/lab/atlas/data/"
raw_files_path = os.path.join(os.path.join(output_dir, "raw"), classification_code)


def partially_downloaded_list(year_start, year_end):
    """
    Find reporters whose bulk file is available from Comtrade but not on disk.

    For every year in ``[year_start, year_end]`` the bulk-availability
    endpoint is queried for the module-level ``classification_code``, and the
    reporter codes it returns are compared with the ``*.gz`` files found under
    ``raw_files_path/<year>``.

    Args:
        year_start: first year to check (inclusive).
        year_end: last year to check (inclusive).

    Returns:
        dict mapping each year that has available data to the list of
        reporter codes (in the order returned by the API) for which no
        downloaded file was found. Years with no available data are omitted.
    """
    partial_dict = {}
    for year in range(year_start, year_end + 1):
        raw_file_by_year_path = os.path.join(raw_files_path, str(year))
        df = comtradeapicall.getFinalDataBulkAvailability(
            API_KEYS["Ellie"],
            typeCode="C",
            freqCode="A",
            clCode=classification_code,
            period=str(year),
            reporterCode=None,
        )
        # NOTE(review): the API wrapper may return None rather than an empty
        # frame when the request fails — confirm; guarded here either way.
        if df is None or df.empty:
            print(f"No data for {year} in {classification_code}")
            continue
        available_reporters = df["reporterCode"].tolist()

        # Collect reporter codes found on disk. A set means several files for
        # the same reporter (e.g. different release dates) count only once.
        downloaded_reporters = set()
        for file in glob.glob(os.path.join(raw_file_by_year_path, "*.gz")):
            file_name = os.path.basename(file)
            # File names look like COMTRADE-FINAL-CA7242017H0[2018-08-15].gz,
            # so characters 17:20 hold the 3-digit reporter code.
            try:
                downloaded_reporters.add(int(file_name[17:20]))
            except ValueError:
                # Unexpected file name: report it and keep scanning instead
                # of aborting the whole check.
                print(f"Skipping {file_name}: no reporter code at chars 17:20")

        # Files on disk that the API no longer lists for this year.
        for reporter_code in sorted(
            downloaded_reporters.difference(available_reporters)
        ):
            print(f"{reporter_code} not in available reporters for year {year}")

        partial_dict[year] = [
            reporter_code
            for reporter_code in available_reporters
            if reporter_code not in downloaded_reporters
        ]

    return partial_dict


# Build the {year: [reporter codes still missing on disk]} mapping for the
# configured year range; years with no available data get no entry.
partial_dict = partially_downloaded_list(year_start, year_end)
# Bare expression so the notebook displays the mapping.
partial_dict

No data for 2023 in S2


{2021: [], 2022: []}

In [4]:
# Based on remaining files left to download identified by above function
# One year at a time, enter the year and complete the download for that year

classification_code = "S2"
year = 2021

output_dir = "/n/hausmann_lab/lab/atlas/data/"
raw_files_path = os.path.join(output_dir, "raw", classification_code)
year_download_dir = os.path.join(raw_files_path, str(year))

# partial_dict has no entry for years where the availability check found no
# data, so fall back to an empty list instead of raising KeyError.
reporters_to_download = partial_dict.get(year, [])
if not reporters_to_download:
    print(f"Nothing left to download for {year} in {classification_code}")

for reporter_code in reporters_to_download:
    comtradeapicall.bulkDownloadFinalFile(
        API_KEYS["Brendan"],
        year_download_dir,
        typeCode="C",
        freqCode="A",
        clCode=classification_code,
        period=str(year),
        reporterCode=str(reporter_code),
        decompress=False,
    )
    # Pause one second between requests (presumably to respect API rate
    # limits — confirm). Requires `import time` in the imports cell.
    time.sleep(1)

In [5]:
# Disabled helper (kept for reference): removes every *.gz file in a directory, then the directory itself

# import glob
# import os
# import logging

# logging.basicConfig(level=logging.DEBUG)

# def remove_tmp_dir(tmp_path):
#     """

#     """
#     for f in glob.glob(os.path.join(tmp_path, "*.gz")):
#         try:
#             os.remove(f)
#         except OSError as e:
#             logging.info(f"Error: {f} : {e.strerror}")

#     try:
#         os.rmdir(tmp_path)
#     except OSError as e:
#         logging.info(f"Error: {tmp_path} : {e.strerror}")

In [3]:
# check for any corrupted downloaded files given classification code and range of years

# NOTE: this cell rebinds `classification_code` and `raw_files_path`, which
# partially_downloaded_list reads as globals; re-run that function's config
# cell before calling it again.

years = [2017, 2017]  # [first_year, last_year], both inclusive

corrupted_files = []  # List to store names of corrupted files
classification_code = "H0"

output_dir = "/n/hausmann_lab/lab/atlas/data/"

# Drive the loop from `years` so changing the range above actually takes effect.
first_year, last_year = years
for year in range(first_year, last_year + 1):
    raw_files_path = os.path.join(output_dir, "raw", classification_code, str(year))
    print("YEAR ", year)
    for file in glob.glob(os.path.join(raw_files_path, "*.gz")):
        try:
            # Try reading the CSV file with Pandas; the result is discarded,
            # only whether the read succeeds matters.
            pd.read_csv(file)
        except Exception as e:
            # Any failure to read is treated as corruption: record the file
            # and keep scanning the rest.
            print(f"Error reading '{file}': {e}")
            corrupted_files.append(file)

if corrupted_files:
    print("Corrupted files:")
    for corrupted_file in corrupted_files:
        print(corrupted_file)
else:
    print("No corrupted files found.")

YEAR  2017
Error reading '/n/hausmann_lab/lab/atlas/data/raw/H0/2017/COMTRADE-FINAL-CA7242017H0[2018-08-15].gz': Compressed file ended before the end-of-stream marker was reached
Error reading '/n/hausmann_lab/lab/atlas/data/raw/H0/2017/COMTRADE-FINAL-CA7522017H0[2021-11-01].gz': Compressed file ended before the end-of-stream marker was reached
Corrupted files:
/n/hausmann_lab/lab/atlas/data/raw/H0/2017/COMTRADE-FINAL-CA7242017H0[2018-08-15].gz
/n/hausmann_lab/lab/atlas/data/raw/H0/2017/COMTRADE-FINAL-CA7522017H0[2021-11-01].gz
