In [31]:
import pandas as pd
from datetime import date

# Raw inputs, read from the current working directory: the export table, the import
# table, and the lookup table whose "Country_Name" / "ISO" columns are used by update_iso.
df_export = pd.read_csv("TW Export 2008 to 2023 Mar.csv")
df_import = pd.read_csv("TW Import 2008 to 2023 Mar.csv")
df_iso = pd.read_csv("TW_ISO_index.csv")

# Build a "DATE" column (YYYY-MM-01) from a "year/month" column, then drop the original column
def update_date_format(df, col_name):
    """Replace a "year/month" column with a "DATE" column of "YYYY-MM-01" strings.

    Every value of ``col_name`` is converted to text and split on "/". The
    first piece is the year and the second is the month, which is left-padded
    with zeros to two characters. ``"<year>-<month>-01"`` is stored in a
    "DATE" column and ``col_name`` is then dropped.

    The conversion is vectorized: it does not depend on the index labels being
    unique, it creates "DATE" even for an empty frame, and nothing is modified
    if validation fails.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update. It is modified in place.
    col_name : str
        Name of the column holding the "year/month" values.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with "DATE" added and ``col_name`` removed.

    Raises
    ------
    IndexError
        If any value (missing values included) has no "/" separator.
    """
    pieces = df[col_name].astype(str).str.split("/")
    year = pieces.str[0]
    month = pieces.str[1]  # NaN wherever the value contained no "/"

    malformed = month.isna()
    if malformed.any():
        # IndexError is kept for backward compatibility: a plain
        # ``value.split("/")[1]`` fails with this type on the same input.
        examples = df.loc[malformed, col_name].astype(str).unique()[:5].tolist()
        raise IndexError(
            f"{col_name!r} has values without a '/' separator, e.g. {examples}"
        )

    df["DATE"] = year + "-" + month.str.zfill(2) + "-01"
    df.drop(columns=[col_name], inplace=True)
    return df

# Strip the "00000" padding from the commodity codes, cast them to int, and rename the column to "HS_CODE"
# remember to have amount1 unit1 amount 2 unit2 order.    
def update_hs_code(df, col_name):
    """Convert the commodity-code column to integer codes named "HS_CODE".

    Each value of ``col_name`` is converted to text, a trailing "00000" is
    removed if present, and the result is cast to int. The column is then
    renamed to "HS_CODE".

    Only the suffix is stripped: the pattern is anchored to the end of the
    string so a run of five zeros in the middle of a code is left intact.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update. It is modified in place.
    col_name : str
        Name of the column holding the commodity codes.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``col_name`` replaced by "HS_CODE".

    Raises
    ------
    ValueError
        If a value is not an integer literal once the suffix is removed.
    """
    # regex=True is passed explicitly because the default differs across pandas versions
    trimmed = df[col_name].astype(str).str.replace(r"0{5}$", "", regex=True)
    df[col_name] = trimmed.astype(int)
    df.rename(columns={col_name: "HS_CODE"}, inplace=True)
    return df

# Add the ISO code to the dataframe with a left join on the country-name column
def update_iso(df, country_col):
    """Attach ISO codes to ``df`` by matching country names.

    The "Country_Name" and "ISO" columns of the module-level ``df_iso`` frame
    are left-joined onto ``df``, matching ``df[country_col]`` against
    "Country_Name". Being a left join, rows of ``df`` without a match are kept
    and get missing values in the two added columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to enrich; it is not modified.
    country_col : str
        Column of ``df`` holding the country name.

    Returns
    -------
    pandas.DataFrame
        A new frame with "Country_Name" and "ISO" appended.
    """
    iso_lookup = df_iso[['Country_Name', 'ISO']]
    return df.merge(
        iso_lookup,
        how='left',
        left_on=country_col,
        right_on='Country_Name',
    )

# Clean the quantity, weight and value columns: remove "," separators, cast to int, multiply the value by 1000, then rename all three columns
def update_weight_value(df, quantity, weight, value):
    """Turn the quantity, weight and value columns into integers and rename them.

    For each of the three columns the values are converted to text, every ","
    is removed and the result is cast to int. The value column is then
    multiplied by 1000. Finally the columns are renamed to "QUANTITY_PCS",
    "WEIGHT_KG" and "VALUE_USD".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update. It is modified in place.
    quantity, weight, value : str
        Current names of the quantity, weight and value columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the cleaned, renamed columns.
    """
    # Identical clean-up for all three columns: drop "," then cast to int
    for column in (quantity, weight, value):
        as_text = df[column].astype(str)
        df[column] = as_text.str.replace(",", "").astype(int)

    # Only the value column is rescaled
    df[value] = df[value].mul(1000)

    new_names = {
        quantity: "QUANTITY_PCS",
        weight: "WEIGHT_KG",
        value: "VALUE_USD",
    }
    df.rename(columns=new_names, inplace=True)
    return df

# Drop unused columns and put the remaining ones in a fixed order
def remove_columns(df):
    """Keep only the six output columns, in a fixed order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least "DATE", "HS_CODE", "ISO", "QUANTITY_PCS",
        "WEIGHT_KG" and "VALUE_USD"; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A frame holding exactly those columns, in that order.

    Raises
    ------
    KeyError
        If any of the six columns is missing from ``df``.
    """
    output_columns = (
        "DATE",
        "HS_CODE",
        "ISO",
        "QUANTITY_PCS",
        "WEIGHT_KG",
        "VALUE_USD",
    )
    return df[list(output_columns)]

# Clean the export frame: date -> HS code -> ISO lookup -> numeric columns -> final column set
df_export = (
    df_export
    .pipe(update_date_format, "Time")
    .pipe(update_hs_code, 'Commodity Code')
    .pipe(update_iso, "Country(Area)")
    .pipe(update_weight_value, "Quantity", "Weight(KGM)", "Value(USD$ 1,000)")
    .pipe(remove_columns)
)

# Same steps for the import frame; its value header is spelled without the comma
df_import = (
    df_import
    .pipe(update_date_format, "Time")
    .pipe(update_hs_code, 'Commodity Code')
    .pipe(update_iso, "Country(Area)")
    .pipe(update_weight_value, "Quantity", "Weight(KGM)", "Value(USD$ 1000)")
    .pipe(remove_columns)
)

# Write both cleaned frames to CSV; today's date (YYYYMMDD) plus "01" goes into the file name
df_export.to_csv(f"DB_TW_EXPORT_UPDATED_2023_APR_{date.today().strftime('%Y%m%d')}01.csv", index=False)
df_import.to_csv(f"DB_TW_IMPORT_UPDATED_2023_APR_{date.today().strftime('%Y%m%d')}01.csv", index=False)


In [30]:
# Show the import rows dated 2023-04-01
df_import.loc[df_import["DATE"] == '2023-04-01']

Unnamed: 0,Imports/Exports,Commodity Code,Description of Good,Country(Area),Value(USD$ 1000),Weight(KGM),Quantity,Unit of Quantity,DATE
5342,Imports,85235100000,Solid-state non-volatile storage devices,Australia,0,0,1,PCE,2023-04-01
5343,Imports,85235100000,Solid-state non-volatile storage devices,Austria,3,1,2,PCE,2023-04-01
5344,Imports,85235100000,Solid-state non-volatile storage devices,Brazil,39,12,762,PCE,2023-04-01
5345,Imports,85235100000,Solid-state non-volatile storage devices,Canada,42,2,33,PCE,2023-04-01
5346,Imports,85235100000,Solid-state non-volatile storage devices,China,33388,47997,7536040,PCE,2023-04-01
5347,Imports,85235100000,Solid-state non-volatile storage devices,Czech Republic,1,0,4,PCE,2023-04-01
5348,Imports,85235100000,Solid-state non-volatile storage devices,Denmark,0,0,1,PCE,2023-04-01
5349,Imports,85235100000,Solid-state non-volatile storage devices,Estonia,3,0,2,PCE,2023-04-01
5350,Imports,85235100000,Solid-state non-volatile storage devices,Finland,7,0,4,PCE,2023-04-01
5351,Imports,85235100000,Solid-state non-volatile storage devices,France,16,223,3893,PCE,2023-04-01


In [25]:
# Export rows that have no ISO code
missing_iso = df_export[df_export["ISO"].isna()]

# Only the last expression of a cell is rendered, so the count is printed explicitly
print(f"Export rows without an ISO code: {len(missing_iso)}")

missing_iso

Unnamed: 0,Imports/Exports,HS_CODE,Description of Good,Country(Area),"Value(USD$ 1,000)",Weight(KGM),Quantity,Unit of Quantity,DATE,Country_Name,ISO


In [11]:
# Earlier clean-up pass for the import file: rewrites every "Time" value from
# "year/month" to "year-month-1" (the month is not zero-padded) and saves the frame.
# NOTE(review): update_date_format drops the column it is given and is called on
# df_import with "Time" earlier in this notebook, and remove_columns keeps only six
# columns, so on a top-to-bottom run this cell would not find a "Time" column.
# It appears to rely on an older kernel state — confirm whether it is still needed.
for index, value in df_import.iterrows():
    # Split the same cell twice: piece 0 is the year, piece 1 is the month
    df_import.loc[index,"Time"] = str(df_import.loc[index,"Time"]).split("/")[0] +\
                                    "-" +  str(df_import.loc[index,"Time"]).split("/")[1] +\
                                    "-1"

# Writes the whole frame (without the index) to the current working directory
df_import.to_csv("Taiwan_Import_Data_2008_to_2023_2023052201.csv", index=False)