In [1]:
"""
Run file from the ingestion directory
Uses comtradeapicall library to download Comtrade Data with ComtradeDownloader object
"""

"""
Troubleshooting Tips:
    -If `import comtradeapicall` is failing, make sure the Kernel is set to Python [conda env:root].
     Go to Kernel > Change Kernel
"""


'\nTroubleshooting Tips:\n    -If import apicomtradeapicall is failing make sure Kernel is set to Python [conda env:root]. \n     Go to Kernel > Change Kernel\n'

In [1]:
!pip install comtradeapicall
!pip install requests==2.31.0
import requests

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
import requests
import comtradeapicall
import glob
import os
import shutil # need library 
import sys
import re
import time
from datetime import date, timedelta, datetime
import logging
import pyarrow

In [3]:
# Configure the root logger at INFO level so the logging.info(...) progress
# messages emitted by the downloader are displayed.
logging.basicConfig(level=logging.INFO)

In [4]:
class ComtradeDownloader(object):
    """
    Downloads UN Comtrade data through the ``comtradeapicall`` library.

    Raw bulk files are kept per classification and year under
    ``<output_dir>/raw``; incrementally updated files are first downloaded to
    ``<output_dir>/latest_raw`` and replaced files are moved to
    ``<output_dir>/archived_raw``. For every year a filtered, combined file
    ``comtrade_<classification>_<year>.<ext>`` is written to ``output_dir``.
    """

    # Columns read from each raw bulk file, mapped to the dtype used when
    # parsing them (the keys are passed as ``usecols`` and the whole mapping
    # as ``dtype`` to ``pd.read_csv``).
    columns = {
        "period": "int16",
        "reporterCode": "int16",
        "flowCode": "category",
        "partnerCode": "int16",
        "partner2Code": "int16",
        "classificationCode": "string",
        "cmdCode": "string",
        "customsCode": "string",
        "mosCode": "int16",
        "motCode": "int16",
        "qtyUnitCode": "int8",
        "qty": "float64",
        "isQtyEstimated": "int8",
        "CIFValue": "float64",
        "FOBValue": "float64",
        "primaryValue": "float64",
    }

    def __init__(
        self,
        api_key,
        output_dir,
        classification_code,
        start_year,
        end_year,
        reporter_iso3_codes=[],
        partner_iso3_codes=[],
        commodity_codes=[],
        flow_codes=["M", "X"],
        mot_codes=[0],
        mos_codes=[0],
        customs_codes=["C00"],
        drop_world_partner=False,
        drop_secondary_partners=True,
        delete_tmp_files=True,
        compress_output=True,
        suppress_print=True,
        force_full_download=False
    ):
        """
        Configure a download run, fetch the reporter/partner reference tables
        and create the working directories.

        Note that constructing the object already performs two
        ``comtradeapicall.getReference`` requests and creates directories
        under ``output_dir``.

        Parameters
        ----------
        api_key : Comtrade subscription key passed to the comtradeapicall calls.
        output_dir : root directory for the download report, the yearly
            extracts and the ``latest_raw`` / ``raw`` / ``archived_raw`` trees.
        classification_code : classification passed as ``clCode`` to the bulk
            download; also used as sub-directory and output file name part.
        start_year, end_year : first and last year to process (both inclusive).
        reporter_iso3_codes : stored on the instance; no method currently
            reads it.
        partner_iso3_codes : keep only rows whose ``partnerISO3`` is listed.
        commodity_codes, flow_codes, mot_codes, mos_codes, customs_codes :
            keep only rows whose corresponding code column is listed.
            An empty list disables the respective filter.
        drop_world_partner : drop rows with ``partnerCode == 0``.
        drop_secondary_partners : keep only rows with ``partner2Code == 0``
            and drop the ``partner2Code`` column.
        delete_tmp_files : stored on the instance; no method currently reads it.
        compress_output : write outputs with a ``gz`` extension instead of ``csv``.
        suppress_print : silence stdout while the bulk download runs.
        force_full_download : download the whole year again even when raw
            files for it already exist.
        """
        def _own_list(codes):
            # The list defaults in the signature are created once and shared
            # by every call. Store a private copy so that editing one
            # instance's filters can never leak into the defaults, into other
            # instances, or into the caller's own list.
            return list(codes) if isinstance(codes, (list, tuple)) else codes

        self.api_key = api_key
        self.output_dir = output_dir
        # UTC timestamp of this run; makes the report file name unique per run.
        self.run_time = time.strftime("%Y-%m-%d_%H_%M_%S", time.gmtime())
        self.download_report_path = os.path.join(
            self.output_dir, f"download_report_{self.run_time}.csv"
        )

        self.classification_code = classification_code
        self.years = range(start_year, end_year + 1)
        self.reporter_iso3_codes = _own_list(reporter_iso3_codes)
        self.partner_iso3_codes = _own_list(partner_iso3_codes)
        self.commodity_codes = _own_list(commodity_codes)
        self.flow_codes = _own_list(flow_codes)
        self.mot_codes = _own_list(mot_codes)
        self.mos_codes = _own_list(mos_codes)
        self.customs_codes = _own_list(customs_codes)

        self.drop_world_partner = drop_world_partner
        self.drop_secondary_partners = drop_secondary_partners

        self.delete_tmp_files = delete_tmp_files
        self.suppress_print = suppress_print
        self.force_full_download = force_full_download

        self.file_extension = "gz" if compress_output else "csv"

        ###############################################################

        # Reference tables used to attach ISO3 codes to the numeric
        # reporter/partner codes found in the data files.
        self.reporters = comtradeapicall.getReference("reporter")[
            ["reporterCode", "reporterCodeIsoAlpha3"]
        ].rename(columns={"reporterCodeIsoAlpha3": "reporterISO3"})

        self.partners = comtradeapicall.getReference("partner")[
            ["PartnerCode", "PartnerCodeIsoAlpha3"]
        ].rename(
            columns={
                "PartnerCode": "partnerCode",
                "PartnerCodeIsoAlpha3": "partnerISO3",
            }
        )

        ###############################################################

        # Make directory for most recently updated raw files
        self.latest_path = os.path.join(self.output_dir, "latest_raw", self.classification_code)
        os.makedirs(self.latest_path, exist_ok=True)

        # Make directory for raw files used by extractor script
        self.raw_files_path = os.path.join(self.output_dir, "raw", self.classification_code)
        os.makedirs(self.raw_files_path, exist_ok=True)

        # Make directory for archiving out of date raw files
        self.archived_path = os.path.join(self.output_dir, "archived_raw", self.classification_code)
        os.makedirs(self.archived_path, exist_ok=True)

        
    def download_comtrade_bilateral_totals(self):
        """
        
        """
        dfs = []

        for year in self.years:
            year_data = comtradeapicall.getFinalData(
                self.api_key,
                typeCode="C",
                freqCode="A",
                clCode="HS",
                period=str(year),
                reporterCode=None,
                cmdCode="TOTAL",
                flowCode=None,
                partnerCode=None,
                partner2Code=[0],
                customsCode="C00",
                motCode=0,
            )
            year_data = year_data.rename(
                columns={"cifvalue": "CIFValue", "fobvalue": "FOBValue"}
            )
            year_data = year_data[self.columns.keys()]

            dfs.append(year_data)
            logging.info("Completed downloading Commodity Totals for {}".format(year))
        df = pd.concat(dfs)

        # Merge reporter and partner reference tables for ISO3 codes
        df = df.merge(self.reporters, on="reporterCode", how="left")
        df = df.merge(self.partners, on="partnerCode", how="left")

        # df.to_csv(
        #     f"/n/hausmann_lab/lab/atlas/data/comtrade_HS_totals.{self.file_extension}",
        #     index=False,
        # )
        df.to_csv(
            os.path.join(self.output_dir, f"comtrade_HS_totals.{self.file_extension}"),
            index=False,
        )
        logging.info("Completed downloading Commodity Totals for {}".format(self.years))

        
    def download_comtrade_yearly_bilateral_flows(self):
        """
        
        """
        class HiddenPrints:
            def __enter__(self):
                self._original_stdout = sys.stdout
                sys.stdout = open(os.devnull, "w")

            def __exit__(self, exc_type, exc_val, exc_tb):
                sys.stdout.close()
                sys.stdout = self._original_stdout

        # if self.reporter_iso3_codes:
        #     reporter_codes = comtradeapicall.convertCountryIso3ToCode(
        #         ",".join(self.reporter_iso3_codes)
        #     )
        # else:
        #     reporter_codes = None
        reporter_codes=[]
        for year in self.years:
            last_updated = self.get_date_of_last_download(year)
            if last_updated is not None and not self.force_full_download:
                self.get_reporters_by_data_availability(year, last_updated)
                year_path = os.path.join(self.latest_path, str(year))
                logging.info(f"Downloading reporter {self.classification_code} - {year} " +
                             f"files updated since {last_updated}.")
            else:
                last_updated=None
                year_path = os.path.join(self.raw_files_path, str(year))
                logging.info(f"Downloading all {self.classification_code} - {year}.")
            os.makedirs(year_path, exist_ok=True)
            try:
                if self.suppress_print:
                    with HiddenPrints():
                        comtradeapicall.bulkDownloadFinalFile(
                            self.api_key,
                            year_path,
                            typeCode="C",
                            freqCode="A",
                            clCode=self.classification_code,
                            period=str(year),
                            reporterCode=reporter_codes,
                            decompress=False,
                            publishedDateFrom=last_updated
                            # publishedDateTo='2018-01-01'
                        )
                else:
                    comtradeapicall.bulkDownloadFinalFile(
                        self.api_key,
                        year_path,
                        typeCode="C",
                        freqCode="A",
                        clCode=self.classification_code,
                        period=str(year),
                        reporterCode=reporter_codes,
                        decompress=False,
                        publishedDateFrom=last_updated
                        # publishedDateTo='2018-01-01'

                    )
                logging.info(f"Completed downloading year {year}.")
            except KeyError as e:
                logging.info(f"An error occurred: {str(e)}")
            
            if last_updated is not None and not self.force_full_download:
                relocated_files = self.replace_raw_files_with_updated_reports(year, year_path)
                self.generate_comtrade_commodity_download_report(year_path, relocated_files)
            else:
                self.generate_comtrade_commodity_download_report(year_path, [])
            logging.info(f"Generated download report for {year}.")

            logging.info(f"Filtering and exporting data for year {year}.")
            self.combine_clean_comtrade_commodity_year(year)

            # logging.info(f"Cleaning up year {year}.")
            # if self.delete_tmp_files:
            #     self.remove_tmp_dir(year_path)


    def get_date_of_last_download(self, year):
        """
        Get information about last date raw files for the classification code 
        and year were last downloaded
        """
        raw_file_by_year_path = os.path.join(self.raw_files_path, str(year))
        raw_file_names = []
        if os.path.exists(raw_file_by_year_path):
            raw_file_names = os.listdir(raw_file_by_year_path)
            filtered_files = [file for file in raw_file_names if not file.startswith('.')]
            number_files = len(filtered_files)
            
            if number_files > 0:
                logging.info(f"For {self.classification_code} - {year} {number_files} country reporter files are downloaded")
                latest_updated_date = "0000-00-00"
                for file in glob.glob(os.path.join(raw_file_by_year_path, '*.gz')):
                    # from file title, char 27:37 extract most recently updated date
                    updated_date = file[-14:-4]
                    if updated_date > latest_updated_date:
                        latest_updated_date = updated_date
                date = latest_updated_date.split('-')
                latest_date = datetime(int(date[0]), int(date[1]), int(date[2]))
                latest_date=latest_date + timedelta(1)
                return str(latest_date).split(' ')[0]
        else:
            latest_date = None
            return latest_date
        

    def get_reporters_by_data_availability(self, year, latest_date):
        """
        Return the reporter codes whose bulk file for ``year`` carries a
        ``timestamp`` greater than ``latest_date``.

        The availability table for the configured classification is requested
        for all reporters; its ``timestamp`` column is compared against
        ``str(latest_date)``.
        NOTE(review): this presumably is a string comparison that relies on
        ISO-formatted timestamps - confirm the column's dtype.

        Returns the unique values of the ``reporterCode`` column for the
        matching rows.
        """
        availability = comtradeapicall.getFinalDataBulkAvailability(
            self.api_key,
            typeCode='C',
            freqCode='A',
            clCode=self.classification_code,
            period=str(year),
            reporterCode=None,
        )
        cutoff = str(latest_date)
        published_after_cutoff = availability['timestamp'] > cutoff
        return availability.loc[published_after_cutoff, 'reporterCode'].unique()

    
    def replace_raw_files_with_updated_reports(self, year, year_path):
        """
        
        """
        relocated_files = []
        updated_file_names = os.listdir(year_path)
        if not updated_file_names:
            return relocated_files

        raw_file_by_year_path = os.path.join(self.raw_files_path, str(year))
        raw_file_names = os.listdir(raw_file_by_year_path)
        
        archive_by_year_dir = os.path.join(self.archived_path, str(year))

        file_name_to_index = {file_name[15:20]: index for index, file_name in enumerate(raw_file_names)}
        
        for file_name in updated_file_names:
            logging.debug("file name in updated file: ", file_name)
            latest_file = os.path.join(year_path, file_name)
            try:
                outdated_file_name = raw_file_names[file_name_to_index[file_name[15:20]]]
                logging.debug("file to replace: ", outdated_file_name)
                outdated_file = os.path.join(raw_file_by_year_path, outdated_file_name)
            except:
                # add the data to the raw output file
                outdated_file = None
                logging.debug("not in raw file", file_name)
                
            try:
                shutil.move(latest_file, raw_file_by_year_path)
                logging.debug(f"moved {latest_file} over")
                relocated_files.append(latest_file)
            except shutil.Error as e:
                logging.debug(f"did not move over {latest_file}")

            if not os.path.exists(archive_by_year_dir):
                os.makedirs(archive_by_year_dir)
            
            if outdated_file:
                try:
                    shutil.move(outdated_file, archive_by_year_dir)
                    logging.debug(f"archiving {outdated_file}")
                    relocated_files.append(outdated_file)
                except shutil.Error as e:
                    logging.debug("Error: ", e)
        logging.info(f"Replacing any outdated raw files with latest data")
        return relocated_files
            
            
    def combine_clean_comtrade_commodity_year(self, year):
        dfs = []

        year_path = os.path.join(self.raw_files_path, str(year))

        for f in glob.glob(os.path.join(year_path, "*.gz")):
            df = pd.read_csv(
                f,
                sep="\t",
                compression="gzip",
                usecols=list(self.columns.keys()),
                dtype=self.columns,
            )

            # Filter unneeded data before appending to keep what is stored in memory
            # as low as possible.
            if self.commodity_codes:
                df = df[df.cmdCode.isin(self.commodity_codes)]
            if self.flow_codes:
                df = df[df.flowCode.isin(self.flow_codes)]
            if self.mot_codes:
                df = df[df.motCode.isin(self.mot_codes)]
            if self.mos_codes:
                df = df[df.mosCode.isin(self.mos_codes)]
            if self.customs_codes:
                df = df[df.customsCode.isin(self.customs_codes)]

            if self.drop_world_partner:
                df = df[df.partnerCode != 0]

            if self.drop_secondary_partners:
                df = df[df.partner2Code == 0]
                df = df.drop(columns=["partner2Code"])

            dfs.append(df)
        try:
            df = pd.concat(dfs)


            # Merge reporter and partner reference tables for ISO3 codes
            df = df.merge(self.reporters, on="reporterCode", how="left")
            df = df.merge(self.partners, on="partnerCode", how="left")

            if self.partner_iso3_codes:
                df = df[df.partnerISO3.isin(self.partner_iso3_codes)]

            if not self.drop_secondary_partners:
                df = df.merge(
                    self.partners.rename(
                        columns={
                            "partnerCode": "partner2Code",
                            "partnerISO3": "partner2ISO3",
                        }
                    ),
                    on="partner2Code",
                    how="left",
                )

            logging.info(f"Saving transformed data file for {year}.")
            
        except:
            df = pd.DataFrame()
            logging.info(f"No data was downloaded for {year}")

        # df.to_csv(
        #     os.path.join(
        #         "./n/hausmann_lab/lab/atlas/data/",
        #         f"comtrade_{self.classification_code}_{year}.{self.file_extension}",
        #     ),
        #     index=False,
        # )

        df.to_csv(
            os.path.join(
                self.output_dir,
                f"comtrade_{self.classification_code}_{year}.{self.file_extension}",
            ),
            index=False,
        )

        del df


    def generate_comtrade_commodity_download_report(self, year_path, replaced_files):
        """
        Generates a download log report as a csv
        """
        
        data = []

        try:
            report = pd.read_csv(self.download_report_path)
        except:
            report = None

        download_time = time.gmtime()
        if not replaced_files:
            for f in glob.glob(os.path.join(year_path, "*.gz")):
                m = re.match(
                    "COMTRADE-FINAL-CA(?P<reporterCode>\d{3})(?P<year>\d{4})"
                    "(?P<classificationCode>\w+)"
                    "\[(?P<lastUpdateYear>\d{4})-(?P<lastUpdateMonth>\d{2})-"
                    "(?P<lastUpdateDay>\d{2})\]\.gz",
                    f.split("/")[-1],
                )
                data.append(m.groupdict())
        else:
            for file in replaced_files:
                m = re.match(
                    "COMTRADE-FINAL-CA(?P<reporterCode>\d{3})(?P<year>\d{4})"
                    "(?P<classificationCode>\w+)"
                    "\[(?P<lastUpdateYear>\d{4})-(?P<lastUpdateMonth>\d{2})-"
                    "(?P<lastUpdateDay>\d{2})\]\.gz",
                    file.split("/")[-1],
                )
                data.append(m.groupdict())

        df = pd.DataFrame(data)
        if df.empty:
            logging.info('No updated reports were downloaded')
        else:
            df["lastUpdate"] = pd.to_datetime(
                df[["lastUpdateYear", "lastUpdateMonth", "lastUpdateDay"]].rename(
                    columns={
                        "lastUpdateYear": "year",
                        "lastUpdateMonth": "month",
                        "lastUpdateDay": "day",
                    }
                )
            )
            df = df.drop(columns=["lastUpdateYear", "lastUpdateMonth", "lastUpdateDay"])
            df["downloadTime"] = time.strftime("%Y-%m-%d %H:%M:%S", download_time)

        if report is not None:
            report = pd.concat([report, df])
        else:
            report = df

        report.to_csv(self.download_report_path, index=False)
        del report

        
    def remove_tmp_dir(self, tmp_path):
        """
        
        """
        for f in glob.glob(os.path.join(tmp_path, "*.gz")):
            try:
                os.move(f)
            except OSError as e:
                logging.info(f"Error: {f} : {e.strerror}")

        try:
            os.rmdir(tmp_path)
        except OSError as e:
            logging.info(f"Error: {tmp_path} : {e.strerror}")


In [5]:
def load_api_keys(users=("Brendan", "Ellie", "Seba"), environ=None):
    """
    Collect the Comtrade API keys of ``users`` from environment variables.

    The key of user ``Name`` is read from ``COMTRADE_API_KEY_NAME`` (user name
    upper-cased). Users whose variable is unset or empty are left out, so
    ``API_KEYS["Name"]`` fails with a KeyError instead of sending an empty
    key to the API.

    Parameters
    ----------
    users : user names to look up.
    environ : mapping to read from; defaults to ``os.environ``.

    Returns
    -------
    dict mapping user name to API key.
    """
    source = os.environ if environ is None else environ
    api_keys = {}
    for user in users:
        api_key = source.get(f"COMTRADE_API_KEY_{user.upper()}")
        if api_key:
            api_keys[user] = api_key
    return api_keys


# API keys are secrets and must not be stored in the notebook. Keys that were
# previously committed here should be treated as exposed and rotated.
API_KEYS = load_api_keys()

In [26]:
# UPDATE USER: choose whose Comtrade API key this run uses.
user_api_key = API_KEYS["Ellie"]

# SET DATE RANGE (start and end year are both included)
requested_start_year = 2022
request_end_year = 2022


# Build the downloader with the desired parameters.
# Note: constructing it already requests the reporter/partner reference
# tables and creates the latest_raw / raw / archived_raw directories
# under output_dir.
downloader = ComtradeDownloader(
    api_key=user_api_key,
    output_dir="/n/hausmann_lab/lab/atlas/data/",
    classification_code="S2", # H0: HS92, H4: HS12, S2: SITC Rev 2
    start_year=requested_start_year,
    end_year=request_end_year,
    reporter_iso3_codes=[],
    partner_iso3_codes=[], # empty list: rows are not filtered by partner ISO3
    commodity_codes=[], # empty list: rows are not filtered by commodity code
    flow_codes=[], # empty list: no flow filter; codes: exports (X), imports (M), Cost of Insurance-Freight (CA)
    mot_codes=[0], # keep only rows with motCode 0
    mos_codes=[0], # keep only rows with mosCode 0
    customs_codes=[], # empty list: rows are not filtered by customs code
    drop_world_partner=False,
    drop_secondary_partners=True, # keep only partner2Code == 0 and drop that column
    delete_tmp_files=False,
    compress_output=True, # output files get the "gz" extension
    suppress_print=True, # silence stdout while the bulk download runs
    force_full_download=True # download the whole year even if raw files already exist
)

In [9]:
# Notes

# remove directory of raw folder, that gets entire fresh new download 
# adapt log file/ summary of download
# clear out folders: latest_raw is empty

/n/hausmann_lab/lab/atlas/data/raw/H4/2016/COMTRADE-FINAL-CA1702016H4[2017-05-05].gz
/n/hausmann_lab/lab/atlas/data/raw/H4/2016/COMTRADE-FINAL-CA1402016H4[2017-10-17].gz
/n/hausmann_lab/lab/atlas/data/raw/H4/2016/COMTRADE-FINAL-CA6882016H4[2017-05-04].gz
/n/hausmann_lab/lab/atlas/data/raw/H4/2016/COMTRADE-FINAL-CA2682016H4[2022-10-07].gz
/n/hausmann_lab/lab/atlas/data/raw/H4/2016/COMTRADE-FINAL-CA2042016H4[2022-03-31].gz
/n/hausmann_lab/lab/atlas/data/raw/H4/2016/COMTRADE-FINAL-CA6202016H4[2019-01-03].gz
/n/hausmann_lab/lab/atlas/data/raw/H4/2016/COMTRADE-FINAL-CA2752016H4[2017-10-26].gz
/n/hausmann_lab/lab/atlas/data/raw/H4/2016/COMTRADE-FINAL-CA1882016H4[2017-05-23].gz
/n/hausmann_lab/lab/atlas/data/raw/H4/2016/COMTRADE-FINAL-CA0362016H4[2017-03-16].gz
/n/hausmann_lab/lab/atlas/data/raw/H4/2016/COMTRADE-FINAL-CA6462016H4[2017-05-03].gz
/n/hausmann_lab/lab/atlas/data/raw/H4/2016/COMTRADE-FINAL-CA3642016H4[2019-02-06].gz
/n/hausmann_lab/lab/atlas/data/raw/H4/2016/COMTRADE-FINAL-CA82620

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [27]:
# Alternative entry point (currently disabled): HS bilateral totals.
# classifications = ['H0', 'H1', 'H2', 'H3', 'H4']
# downloader.download_comtrade_bilateral_totals()

# For every configured year: download the raw files, update the download
# report and write the filtered yearly output file.
downloader.download_comtrade_yearly_bilateral_flows()

INFO:root:Downloading all S2 - 2022.
INFO:root:Completed downloading year 2022.
INFO:root:Generated download report for 2022.
INFO:root:Filtering and exporting data for year 2022.


/n/hausmann_lab/lab/atlas/data/raw/S2/2022/COMTRADE-FINAL-CA3002022S2[2023-02-16].gz
/n/hausmann_lab/lab/atlas/data/raw/S2/2022/COMTRADE-FINAL-CA5792022S2[2023-10-17].gz
/n/hausmann_lab/lab/atlas/data/raw/S2/2022/COMTRADE-FINAL-CA7842022S2[2023-11-13].gz
/n/hausmann_lab/lab/atlas/data/raw/S2/2022/COMTRADE-FINAL-CA4102022S2[2023-08-03].gz
/n/hausmann_lab/lab/atlas/data/raw/S2/2022/COMTRADE-FINAL-CA0842022S2[2023-10-10].gz
/n/hausmann_lab/lab/atlas/data/raw/S2/2022/COMTRADE-FINAL-CA8542022S2[2023-10-10].gz
/n/hausmann_lab/lab/atlas/data/raw/S2/2022/COMTRADE-FINAL-CA2422022S2[2023-04-06].gz
/n/hausmann_lab/lab/atlas/data/raw/S2/2022/COMTRADE-FINAL-CA6342022S2[2023-06-21].gz
/n/hausmann_lab/lab/atlas/data/raw/S2/2022/COMTRADE-FINAL-CA2332022S2[2023-10-17].gz
/n/hausmann_lab/lab/atlas/data/raw/S2/2022/COMTRADE-FINAL-CA7242022S2[2023-02-23].gz
/n/hausmann_lab/lab/atlas/data/raw/S2/2022/COMTRADE-FINAL-CA4002022S2[2023-11-21].gz
/n/hausmann_lab/lab/atlas/data/raw/S2/2022/COMTRADE-FINAL-CA45420

INFO:root:Saving transformed data file for 2022.


In [30]:
# Builds a dictionary {year: [reporter codes]} of reporters that have data
# available for a year but no raw file downloaded for it yet.

# ENTER DATA INPUTS (year_start and year_end are both included)
year_start = 2021
year_end = 2023
classification_code = 'S2'

# Raw files are expected under <output_dir>/raw/<classification_code>/<year>/
output_dir="/n/hausmann_lab/lab/atlas/data/"
raw_files_path = os.path.join(output_dir, "raw", classification_code)

def partially_downloaded_list(year_start, year_end):
    """
    Find the reporters whose bulk file is available from Comtrade but has not
    been downloaded yet.

    For every year from ``year_start`` to ``year_end`` (both included) the
    bulk availability table of the module-level ``classification_code`` is
    requested and compared with the ``*.gz`` files found in
    ``<raw_files_path>/<year>``. The reporter code of a raw file is read from
    characters 17-19 of its name, e.g. ``300`` in
    ``COMTRADE-FINAL-CA3002022S2[2023-02-16].gz``.

    Returns
    -------
    dict
        ``{year: [reporter codes still to download]}`` in the order of the
        availability table. Years for which no availability data is returned
        are reported with a message and left out of the dictionary.
    """
    partial_dict = {}
    for year in range(year_start, year_end + 1):
        raw_file_by_year_path = os.path.join(raw_files_path, str(year))
        df = comtradeapicall.getFinalDataBulkAvailability(
                API_KEYS["Ellie"],
                typeCode='C',
                freqCode='A',
                clCode=classification_code,
                period=str(year),
                reporterCode=None)
        # NOTE(review): comtradeapicall appears to return None when a request
        # fails - confirm; treated like "no data" here.
        if df is None or df.empty:
            print(f"No data for {year} in {classification_code}")
            continue
        available_reporters = df['reporterCode'].tolist()

        # A set makes the membership test cheap and counts a reporter once,
        # even when several raw files exist for it.
        downloaded_reporters = {
            int(os.path.basename(file)[17:20])
            for file in glob.glob(os.path.join(raw_file_by_year_path, "*.gz"))
        }

        for reporter_code in sorted(downloaded_reporters.difference(available_reporters)):
            print(f"{reporter_code} not in available reporters for year {year}")

        partial_dict[year] = [
            reporter_code
            for reporter_code in available_reporters
            if reporter_code not in downloaded_reporters
        ]

    return partial_dict

# Run the check for the configured year range; the trailing expression
# displays the resulting {year: [missing reporter codes]} dictionary.
partial_dict = partially_downloaded_list(year_start, year_end)
partial_dict

No data for 2023 in S2


{2021: [], 2022: []}

In [29]:
# Downloads the files that the completeness check above reported as missing.
# Run it one year at a time: enter the year and complete the download for it.

classification_code = "S2"
year = 2021

output_dir="/n/hausmann_lab/lab/atlas/data/"
raw_files_path = os.path.join(output_dir, "raw", classification_code)

# The target directory is the same for every reporter, so build it once and
# make sure it exists before downloading (as ComtradeDownloader does).
year_download_dir = os.path.join(raw_files_path, str(year))
os.makedirs(year_download_dir, exist_ok=True)

# .get(): years without availability data are left out of partial_dict and
# simply have nothing to download.
for reporter_code in partial_dict.get(year, []):
    comtradeapicall.bulkDownloadFinalFile(
        API_KEYS["Brendan"],
        year_download_dir,
        typeCode="C",
        freqCode="A",
        clCode=classification_code,
        period=str(year),
        reporterCode=str(reporter_code),
        decompress=False,
    )
    # Short pause between consecutive requests.
    time.sleep(1)

COMTRADE-FINAL-CA7102021S2[2022-11-16] downloaded
Total of 1 file(s) downloaded
COMTRADE-FINAL-CA7162021S2[2022-10-10] downloaded
Total of 1 file(s) downloaded
COMTRADE-FINAL-CA0722021S2[2023-08-29] downloaded
Total of 1 file(s) downloaded
COMTRADE-FINAL-CA7242021S2[2022-02-27] downloaded
Total of 1 file(s) downloaded
COMTRADE-FINAL-CA7402021S2[2023-05-13] downloaded
Total of 1 file(s) downloaded
COMTRADE-FINAL-CA7482021S2[2022-10-20] downloaded
Total of 1 file(s) downloaded
COMTRADE-FINAL-CA7522021S2[2023-11-04] downloaded
Total of 1 file(s) downloaded
COMTRADE-FINAL-CA7572021S2[2022-11-11] downloaded
Total of 1 file(s) downloaded
COMTRADE-FINAL-CA0762021S2[2023-08-30] downloaded
Total of 1 file(s) downloaded
COMTRADE-FINAL-CA7622021S2[2022-10-11] downloaded
Total of 1 file(s) downloaded
COMTRADE-FINAL-CA7642021S2[2022-10-06] downloaded
Total of 1 file(s) downloaded
COMTRADE-FINAL-CA7682021S2[2023-10-30] downloaded
Total of 1 file(s) downloaded
COMTRADE-FINAL-CA7802021S2[2022-06-08] d

In [None]:
#deletes a directory

# import glob
# import os 
# import logging

# logging.basicConfig(level=logging.DEBUG)

# def remove_tmp_dir(tmp_path):
#     """

#     """
#     for f in glob.glob(os.path.join(tmp_path, "*.gz")):
#         try:
#             os.remove(f)
#         except OSError as e:
#             logging.info(f"Error: {f} : {e.strerror}")

#     try:
#         os.rmdir(tmp_path)
#     except OSError as e:
#         logging.info(f"Error: {tmp_path} : {e.strerror}")

In [None]:
#check for any corrupted downloaded files given classification code and range of years

# import pandas as pd
# import os
# import glob

# years = [2012, 2020]

# corrupted_files = []  # List to store names of corrupted files
# classification_code = "S2"

# output_dir="/n/hausmann_lab/lab/atlas/data/"

# for year in range(1962, 2018):
#     raw_files_path = os.path.join(output_dir, "raw", classification_code, str(year))
#     print("YEAR ", year)
#     for file in glob.glob(os.path.join(raw_files_path, "*.gz")):
#         try:
#             # Try reading the CSV file with Pandas
#             pd.read_csv(file)
#         except Exception as e:
#             # If an exception occurs, consider the file corrupted and add its name to the list
#             print(f"Error reading '{file}': {e}")
#             corrupted_files.append(file)

# if corrupted_files:
#     print("Corrupted files:")
#     for corrupted_file in corrupted_files:
#         print(corrupted_file)
# else:
#     print("No corrupted files found.")