In [3]:
import os
import typing

import numpy as np
import pandas as pd
import scipy.stats
# Import other necessary libraries

In [None]:
class _AtlasCleaning(object):
    """Hold cleaning configuration and provide parquet load/save helpers.

    An instance stores the input/output directories, the year window, the
    product classification and the schema name, plus a working DataFrame in
    ``self.df`` (initially ``None``) that ``save_parquet`` writes out.
    """

    # Classification names & levels
    PRODUCT_CLASSIFICATIONS = ["H0", "HS", "S1", "S2", "ST"]

    HIERARCHY_LEVELS = {
        "hs92": (1, 2, 4, 6),
        "hs12": (1, 2, 4, 6),
        "sitc": (1, 2, 4),
        "services": (1, 2, 4, 6),
    }

    REGIONAL_GROUP_TYPES = ["world", "region", "subregion"]

    INGESTION_OUTPUT_FORMATS = ["parquet", "hdf5"]

    # Years before ``latest_year`` that are kept when loading with
    # schema "sitc" while data coverage is limited.
    SITC_COVERAGE_YEARS = 25

    def __init__(
        self,
        start_year,
        end_year,
        classification,
        schema,
        root_dir,
        output_dir,
        limit_data_coverage=False,
    ):
        """Store the run configuration.

        Args:
            start_year: First year of the window; stored as ``earliest_year``.
            end_year: Last year of the window; stored as ``latest_year``.
            classification: Product classification code; stored as
                ``product_classification``.
            schema: Sub-directory name used by ``save_parquet`` (may be None).
            root_dir: Directory that input tables are read from.
            output_dir: Directory that output tables are written to.
            limit_data_coverage: When true, ``load_parquet`` drops rows whose
                ``year`` falls before the coverage window. Defaults to False.
        """
        # INPUTS
        self.root_dir = root_dir
        self.classification_dir = os.path.join(self.root_dir, "classification")
        self.output_dir = output_dir

        self.df = None
        self.latest_year = end_year
        self.earliest_year = start_year
        self.limit_data_coverage = limit_data_coverage
        self.product_classification = classification
        self.schema = schema

    def load_parquet(
        self,
        table_name: str,
        schema: typing.Optional[str] = None,
    ):
        """Read ``<root_dir>[/<schema>]/<table_name>.parquet`` into a DataFrame.

        When ``limit_data_coverage`` is set and the table has a ``year``
        column, rows are filtered to ``year >= latest_year - 25`` for schema
        "sitc" and to ``year >= earliest_year`` otherwise.

        Args:
            table_name: File name without the ``.parquet`` extension.
            schema: Optional sub-directory of ``root_dir`` to read from.

        Returns:
            The loaded (and possibly year-filtered) DataFrame.
        """
        if schema is None:
            read_dir = self.root_dir
        else:
            read_dir = os.path.join(self.root_dir, schema)

        df = pd.read_parquet(os.path.join(read_dir, f"{table_name}.parquet"))

        if self.limit_data_coverage and "year" in df.columns:
            if schema == "sitc":
                first_year = self.latest_year - self.SITC_COVERAGE_YEARS
            else:
                first_year = self.earliest_year
            df = df[df["year"] >= first_year]

        return df

    def save_parquet(self, table_name: str):
        """Write ``self.df`` to ``<output_dir>[/<self.schema>]/<table_name>.parquet``.

        The target directory is created if it does not exist. The DataFrame
        index is not written.

        Args:
            table_name: File name without the ``.parquet`` extension.

        Raises:
            ValueError: If ``self.df`` has not been set.
        """
        if self.df is None:
            raise ValueError("self.df is None; nothing to save")

        if self.schema is None:
            save_dir = self.output_dir
        else:
            save_dir = os.path.join(self.output_dir, self.schema)

        os.makedirs(save_dir, exist_ok=True)
        save_path = os.path.join(save_dir, f"{table_name}.parquet")

        self.df.to_parquet(save_path, index=False)


In [None]:
# Set path
# NOTE(review): absolute, machine-specific location; consider making it configurable.
path = "/n/hausmann_lab/lab/atlas/bustos_yildirim/atlas_stata_cleaning/src"
data_path = os.path.join(path, "data")
program_path = os.path.join(path, "clean", "comtrade_reads_zip.py")
# The per-archive zip path is intentionally not built here: it depends on a
# classification and a year, and neither exists at configuration time.


# Define years (inclusive bounds of the processing window)
startyear = 2015
finalyear = 2015

In [None]:
def load_data(classification, year):
    """Load the data for one classification and year (not implemented yet).

    Args:
        classification: Product classification code.
        year: Year to load.

    Raises:
        NotImplementedError: Always; the loading logic has not been written.
    """
    # TODO: implement. The stub keeps the cell syntactically valid so the
    # notebook can run top to bottom.
    raise NotImplementedError("load_data has not been implemented yet")


In [None]:

# Function to clean data
# Classification codes grouped by family; each family has its own code width
# and its own "not classified" commodity marker.
_HS_CLASSIFICATIONS = ("H0", "HS")
_SITC_CLASSIFICATIONS = ("S1", "S2", "ST")

# Predecessor country codes folded into their successor.
_SUCCESSOR_ISO = {"DDR": "DEU", "SUN": "RUS"}


def _zero_pad_codes(df, agglevel, width):
    """Left-pad with zeros the codes at ``agglevel`` that are 1 or 2 chars short of ``width``."""
    codes = df["commodity_code"]
    is_short = codes.str.len().isin([width - 2, width - 1])
    needs_padding = is_short & (df["agglevel"] == agglevel)
    return codes.mask(needs_padding, codes.str.rjust(width, "0"))


def _clean_frame(df, classification):
    """Apply the classification-specific cleaning and collapse to bilateral totals."""
    # The source mixes two spellings of these columns; accept either one.
    # NOTE(review): confirm which spelling the prepped .dta files really use.
    df = df.rename(
        columns={"trade_value": "tradevalue", "commoditycode": "commodity_code"}
    )

    if classification in _HS_CLASSIFICATIONS:
        df = df.assign(commodity_code=_zero_pad_codes(df, agglevel=6, width=6))
        is_unclassified = df["commodity_code"].str.slice(0, 4) == "9999"
    elif classification in _SITC_CLASSIFICATIONS:
        df = df.assign(commodity_code=_zero_pad_codes(df, agglevel=4, width=4))
        is_unclassified = df["commodity_code"] == "9310"
    else:
        raise ValueError(f"Unsupported classification: {classification!r}")

    # Trade value reported against partner "ANS" under the unclassified code
    # at aggregation level 4; missing (NaN) on every other row.
    is_ans_unclassified = (
        (df["partner_iso"] == "ANS") & (df["agglevel"] == 4) & is_unclassified
    )
    df = df.assign(reporter_ansnoclas=df["tradevalue"].where(is_ans_unclassified))

    # handles Germany (reunification) and Russia
    # drop rows where DEU and DDR trade with each other, then merge codes
    reporter, partner = df["reporter_iso"], df["partner_iso"]
    is_deu_ddr_pair = ((reporter == "DEU") & (partner == "DDR")) | (
        (reporter == "DDR") & (partner == "DEU")
    )
    df = df[~is_deu_ddr_pair]
    df = df.assign(
        reporter_iso=df["reporter_iso"].replace(_SUCCESSOR_ISO),
        partner_iso=df["partner_iso"].replace(_SUCCESSOR_ISO),
    )

    # collapse (sum) tradevalue reporter_ansnoclas, by(year tradeflow agglevel reporter_iso partner_iso)
    collapsed = (
        df.groupby(["year", "tradeflow", "agglevel", "reporter_iso", "partner_iso"])
        .agg({"tradevalue": "sum", "reporter_ansnoclas": "sum"})
        .reset_index()
    )
    # recast float tradevalue reporter_ansnoclas, force
    return collapsed.astype({"tradevalue": "float", "reporter_ansnoclas": "float"})


def cleandata(classification, year):
    """Load one prepped Stata file and collapse it to bilateral trade totals.

    Reads ``data/processed/prepped_<classification>_<year>.dta`` under the
    module-level ``path``, zero-pads short commodity codes, derives
    ``reporter_ansnoclas``, drops DEU/DDR trade with each other, merges DDR
    into DEU and SUN into RUS, and sums the values per
    (year, tradeflow, agglevel, reporter_iso, partner_iso).

    Args:
        classification: One of "H0", "HS", "S1", "S2", "ST".
        year: Year used to select the input file.

    Returns:
        DataFrame with the five grouping columns plus float columns
        ``tradevalue`` and ``reporter_ansnoclas``.

    Raises:
        ValueError: If ``classification`` is not one of the supported codes.
    """
    print(f"   > {year} and classification = {classification}")

    # Load the temporary data
    prepped_file = os.path.join(
        path, f"data/processed/prepped_{classification}_{year}.dta"
    )
    df = pd.read_stata(prepped_file)

    # Return the cleaned DataFrame
    return _clean_frame(df, classification)


# Run the cleaning step once for every year in the configured window
for year in range(startyear, finalyear + 1):
    print(f"> Doing year = {year}")

    # Years 1976 through 1994 are cleaned under the "S2" classification
    if year in range(1976, 1995):
        classification = "S2"
        cleaned_data = cleandata(classification, year)
        # Write the collapsed totals for this year as a CSV file under `path`
        cleaned_data.to_csv(
            os.path.join(path, f"Totals_raw_{year}.csv"),
            index=False,
        )

    # Continue with other classification and year conditions

# After processing all years and classifications, you might want to merge, manipulate,
# and analyze the cleaned datasets similarly to how it's done in the Stata script.