In [98]:

import pandas as pd
from datetime import date

# import source data file and country iso file
df_iso = pd.read_csv("KR_ISO_index.csv") 
df_title = pd.read_csv("Korea HS CODE by Country 2023051401.csv", on_bad_lines='skip', nrows=5)
df_context = pd.read_csv("Korea HS CODE by Country 2023051401.csv", header=4)
# remove first total row after imported by subsetting the first row
df_context = df_context.iloc[1:,:]

# get units. Note: the unit column is fixed, if the source unit column change it would need to 
# change accordingly
def get_units(df):
    currency_unit = df["Unnamed: 8"][2].split(":")[1].split(" ")[0]
    currency_base = int(df["Unnamed: 8"][2].split(":")[1].split(" ")[1].replace(",", ""))
    weight_unit = df["Unnamed: 8"][2].split(":")[1].split(" ")[3].upper()
    return (currency_unit, currency_base, weight_unit)

# change date format and rename the column title
def update_date_format(df, col_name):
    df[col_name] = df[col_name].str.replace(".", "-") + '-01'
    df.rename(columns={col_name : "DATE"}, inplace=True)
    return df

# change HS code title and remove "." in rows
def update_hs_code_title(df, col_name):
    df[col_name] = df[col_name].astype(str).str.replace(".0", "")
    df.rename(columns={col_name: "HS_CODE"}, inplace=True)
    return df

# add ISO code to dataframe using left join on country code column
def update_iso(df,country_col):
# add country for import file iso code from iso csv file 
    df = df.merge(df_iso[['Country_Name', 'ISO']], left_on=country_col, right_on='Country_Name', how='left')
    # df.rename(columns={'ISO Code': 'ISO'}, inplace=True)
    return df

# update weight and value column, removing "," and multiplying with 1000 to value then update column name
def update_weight_value(df, weight, value):
    df[weight] = df[weight].str.replace(",", "").astype(int)
    df[value] = df[value].str.replace(",", "").astype(int)
    df[value] = df[value] * currency_base
    df.rename(columns={weight: f"WEIGHT_{weight_unit}",
                       value: f"VALUE_{currency_unit}"}, inplace = True)
    return df

# split import and export two set of dataframe
### Note: the column names are hard coded
def split_import_export_df(df, import_weight, import_value, export_weight, export_value):
    df_export = df[["DATE", "HS_CODE", "ISO", export_weight, export_value]]
    df_import = df[["DATE", "HS_CODE", "ISO", import_weight, import_value]]
    return (df_export, df_import)
    
# to remove rows with empty values
def remove_empty_values_rows(df):
     df_filtered = df[df['VALUE_USD'] != 0]
     return df_filtered

# get units
currency_unit, currency_base, weight_unit = get_units(df_title)

# update date using function
df_context = update_date_format(df_context, "PRIOD")
#update hs code
df_context = update_hs_code_title(df_context, "H.S Code")
# update iso code
df_context = update_iso(df_context, "Country")

# split import and export two data frames using function
df_export, df_import = split_import_export_df(df_context, 
                                              "Import Weight", 
                                              "Import Value",
                                              "Export Weight",
                                              "Export Value")

# update weight and value for export data frame
df_export = update_weight_value(df_export, "Export Weight", "Export Value")

# update weight and value for import data frame
df_import = update_weight_value(df_import, "Import Weight", "Import Value")

# clean rows with empty values for export data frame
df_export = remove_empty_values_rows(df_export)

# clean rows with empty values for import data frame
df_import = remove_empty_values_rows(df_import)

# check result
df_export.head()

# export csv for both export and import data frame
df_export.to_csv(f"DB_KR_EXPORT_UPDATED_2023_APR_{date.today().strftime('%Y%m%d')}01.csv", index=False)
df_import.to_csv(f"DB_KR_IMPORT_UPDATED_2023_APR_{date.today().strftime('%Y%m%d')}01.csv", index=False)


  df[col_name] = df[col_name].str.replace(".", "-") + '-01'
  df[col_name] = df[col_name].astype(str).str.replace(".0", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[weight] = df[weight].str.replace(",", "").astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[value] = df[value].str.replace(",", "").astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

In [96]:
# df_context["PRIOD"] = df_context["PRIOD"].str.replace(".", "-") + '-01'

df_import.tail(20)

Unnamed: 0,DATE,HS_CODE,ISO,WEIGHT_KG,VALUE_USD
8780,2023-04-01,852351,FRO,0,21000
8781,2023-04-01,852351,FRA,44,421000
8782,2023-04-01,852351,SGP,171,197000
8784,2023-04-01,852351,SWE,1,1000
8789,2023-04-01,852351,PHL,30817,3911000
8790,2023-04-01,852351,CZE,4,14000
8791,2023-04-01,852351,DEU,1027,839000
8792,2023-04-01,852351,CHN,58896,35845000
8794,2023-04-01,852351,CHE,15,7000
8798,2023-04-01,852351,AUT,21,67000


In [82]:
# df_context[df_context["ISO"] == None]
df_context["ISO"].unique()

array(['JPN', 'GBR', 'CHN', 'FIN', 'MYS', 'TWN', 'HKG', 'KHM', 'ITA',
       'CHL', 'DEU', 'USA', 'TUR', 'MEX', 'RUS', 'SGP', 'ARE', 'AUS',
       'KWT', 'ISR', 'CRI', 'SWE', 'AUT', 'IRN', 'BRA', 'YEM', 'NOR',
       'CHE', 'FRA', 'EGY', 'CAN', 'MAR', 'NLD', 'IND', 'COL', 'PAN',
       'BEL', 'DNK', 'ARG', 'BOL', 'VNM', 'VEN', 'CZE', 'THA', 'IDN',
       'PHL', 'SAU', 'IRL', 'LVA', 'JOR', 'SVK', 'HUN', 'ESP', 'ECU',
       'GTM', 'NGA', 'NZL', 'EST', 'PRI', 'PRY', 'UKR', 'PER', 'BLR',
       'QAT', 'BRN', 'POL', 'GRC', 'DOM', 'NPL', 'LTU', 'ROU', 'CMR',
       'ISL', 'SWZ', 'DZA', 'REU', 'SVN', 'SRB', 'IRQ', 'BEN', 'KAZ',
       'MAC', 'URY', 'BFA', 'SDN', 'AFG', 'CUB', 'BGR', 'MNG', 'CYP',
       'ZAF', 'LBY', 'MMR', 'COD', 'LUX', 'TUN', 'PRT', 'KEN', 'BGD',
       'PAK', 'AZE', 'UZB', 'BHR', 'COG', 'SYR', 'LBN', 'GHA', 'LKA',
       'GEO', 'MDV', 'MKD', 'MUS', 'OMN', 'HND', 'MCO', 'TTO', 'SOM',
       'ZMB', 'MRT', 'BIH', 'ALB', 'DJI', 'ERI', 'GIN', 'CIV', 'SEN',
       'TCD', 'UGA',

In [94]:
print(f"df_export - shape {df_export.shape}")
print(f"df_import - shape {df_import.shape}")

df_export - shape (8825, 5)
df_import - shape (8825, 5)


In [97]:
print(f"df_export - shape {df_export.shape}")
print(f"df_import - shape {df_import.shape}")

df_export - shape (6254, 5)
df_import - shape (4582, 5)
