In [1]:
import numpy as np
import pandas as pd
import os
# Notebook display limits: show up to 50 columns and up to 2000 rows before
# pandas truncates a rendered DataFrame/Series.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 2000)

In [2]:
# Create directories
# The output folder is a sibling of the current working directory, i.e. the
# same location the later cells address as '../post_processed_data'.
current_wd = os.getcwd()  # Path of current working directory
post_processed_dir = os.path.join(os.path.dirname(current_wd),
                                  'post_processed_data')
# exist_ok=True tolerates a folder left by an earlier run; any other failure
# (e.g. missing permissions) is raised here instead of being silently ignored
# and resurfacing later as a confusing to_csv error.
os.makedirs(post_processed_dir, exist_ok=True)

# Creating Final DataFrame

In [3]:
# Columns kept from every yearly file; all five files are read with the same
# subset, so it is declared once.
yearly_usecols = ['HSCODE', 'COUNTRYEXPORT', 'CUSTOMSVALUE',
                  'CURRENCY', 'EXCHANGE_RATE',
                  'VAT_PAID', 'DUTIESTAXES',
                  'GOODSDESCRIPTION', 'month', 'year']

# One DataFrame per year (2016-2020), read from ../processed_data/df_<year>.csv
# and bound to the same df_<year> names the following cells expect.
df_2016, df_2017, df_2018, df_2019, df_2020 = (
    pd.read_csv(f"../processed_data/df_{year}.csv", low_memory=False,
                usecols=yearly_usecols)
    for year in range(2016, 2021)
)

### Combine all years

In [4]:
# Stack the yearly frames row-wise into one DataFrame.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent. As before, each frame keeps its own row labels (no
# ignore_index), so index values repeat across years.
df_all = pd.concat([df_2016, df_2017, df_2018, df_2019, df_2020])
print(df_all.shape)
df_all.head()

(13662638, 10)


Unnamed: 0,HSCODE,COUNTRYEXPORT,CUSTOMSVALUE,CURRENCY,EXCHANGE_RATE,VAT_PAID,DUTIESTAXES,GOODSDESCRIPTION,month,year
0,38112900000,ITALY,73191.19,USD,45.782,470418.0,826078.0,LUBAD 1478,November,2016
1,87163999000,THAILAND,39600.0,USD,47.182,238570.0,296026.0,CAR TRAILER,November,2016
2,87163999000,CHINA,59400.0,USD,47.182,357477.0,443661.0,CAR TRAILER,November,2016
3,87163999000,CHINA,39600.0,USD,47.182,238570.0,296026.0,CAR TRAILER,November,2016
4,27111900000,CHINA,227495.55,USD,46.705,1623136.0,1623136.0,505.00 MT LIQUEFIED PETROLEUM GAS,November,2016


### Add columns and remove outliers

In [5]:
# Create goods categories
import numpy as np
def HS_categorize(hs):
    """Return the major goods category for an HS code given as a string.

    The chapter number is read from the first ``len(hs) - 9`` characters of
    ``hs``: two characters for an 11-character code, one character for a
    10-character code. The chapter is then matched against ascending,
    exclusive upper bounds to pick the category label.

    Parameters
    ----------
    hs : str
        HS code as text.

    Returns
    -------
    str
        The category label, or an empty string when the chapter number is
        98 or greater.

    Raises
    ------
    ValueError
        If the extracted prefix is not a valid integer literal (for example
        it is empty when ``hs`` has exactly 9 characters).
    """
    chapter = int(hs[:len(hs) - 9])

    # (exclusive upper bound on the chapter number, category label),
    # listed in ascending order so the first match wins.
    upper_bounds = (
        (6, 'Animal & Animal Products'),
        (16, "Vegetable Products"),
        (25, "Foodstuffs"),
        (28, "Mineral Products"),
        (39, "Chemicals & Allied Industries"),
        (41, "Plastics / Rubbers"),
        (44, "Raw Hides, Skins, Leather, & Furs"),
        (50, "Wood & Wood Products"),
        (64, "Textiles"),
        (68, "Footwear / Headgear"),
        (72, "Stone / Glass"),
        (84, "Metals"),
        (86, "Machinery / Electrical"),
        (90, "Transportation"),
        (98, "Miscellaneous"),
    )
    for bound, label in upper_bounds:
        if chapter < bound:
            return label

    # Chapter numbers at or above the last bound have no category.
    return ''

In [6]:
# Derive the major HS category of every row from its HSCODE (as text)
hscode_text = df_all['HSCODE'].astype(str)
df_all['category'] = hscode_text.map(HS_categorize)

In [7]:
# Correct months
# Expand the abbreviation "Jun" to the full month name so every value parses
# with '%B'. regex=True is passed explicitly: the pattern uses the `$` anchor,
# and since pandas 2.0 str.replace treats the pattern as a literal string by
# default, which would leave "Jun" untouched and break the parsing below.
df_all["month"] = df_all["month"].str.replace(r"Jun$", "June", regex=True)
# Convert full month names to month numbers (1-12).
df_all["month"] = pd.to_datetime(df_all["month"], format='%B').dt.month

In [8]:
# Customs value in PHP: the declared value multiplied by its exchange rate
df_all["CUSTOMSVALUEPHP"] = df_all["CUSTOMSVALUE"].mul(df_all["EXCHANGE_RATE"])

In [9]:
# Remove outlier: keep every row whose VAT_PAID differs from this one value
df_final = df_all.loc[df_all["VAT_PAID"].ne(538483406146)]

In [10]:
# Summarise the final data: dimensions, column names, and null counts per
# column, printed as three sections separated by blank lines
print(" \n\n ".join([
    f'DataFrame size:\n {df_final.shape}',
    f'Columns:\n {df_final.columns}',
    f'Missing values:\n{df_final.isna().sum()}',
]))

DataFrame size:
 (13662638, 12) 

 Columns:
 Index(['HSCODE', 'COUNTRYEXPORT', 'CUSTOMSVALUE', 'CURRENCY', 'EXCHANGE_RATE',
       'VAT_PAID', 'DUTIESTAXES', 'GOODSDESCRIPTION', 'month', 'year',
       'category', 'CUSTOMSVALUEPHP'],
      dtype='object') 

 Missing values:
HSCODE                    0
COUNTRYEXPORT         31845
CUSTOMSVALUE              0
CURRENCY                  0
EXCHANGE_RATE             0
VAT_PAID                  0
DUTIESTAXES               0
GOODSDESCRIPTION    1187408
month                     0
year                      0
category                  0
CUSTOMSVALUEPHP           0
dtype: int64


In [11]:
# Save to csv
# df_final.to_csv("df_complete.csv")

# Creating Sub-DataFrames

### Dataframe of Customs Value in PHP by Category and Country

In [14]:
# Total customs value (PHP) for every category / exporting-country pair
value_by_pair = (df_final
                 .groupby(["category", "COUNTRYEXPORT"])["CUSTOMSVALUEPHP"]
                 .sum())

# Order by category and then by value (both descending), and turn the
# group keys back into ordinary columns
df_cat = value_by_pair.to_frame()
df_cat = df_cat.sort_values(by=["category", "CUSTOMSVALUEPHP"],
                            ascending=False)
df_cat = df_cat.reset_index()

# Write the table to CSV
df_cat.to_csv('../post_processed_data/df_categories.csv')

In [15]:
# Sanity check: dimensions and first rows of the category/country table
print(df_cat.shape)
df_cat.head()

(1891, 3)


Unnamed: 0,category,COUNTRYEXPORT,CUSTOMSVALUEPHP
0,Wood & Wood Products,CHINA,92128420000.0
1,Wood & Wood Products,JAPAN,41428080000.0
2,Wood & Wood Products,UNITED STATES,36600460000.0
3,Wood & Wood Products,INDONESIA,35006510000.0
4,Wood & Wood Products,CANADA,31405740000.0


### Dataframe of Customs Value Comparison by Category

In [16]:
# Create dataframe without China
# Note: rows with a missing COUNTRYEXPORT compare as "not CHINA" and are kept.
df_nc = df_final[df_final["COUNTRYEXPORT"] != "CHINA"]

# Get grouped value per category (all countries vs. all except China)
cat_tax = df_final.groupby("category")["CUSTOMSVALUEPHP"].sum().to_frame()
nc_tax  = df_nc.groupby("category")["CUSTOMSVALUEPHP"].sum().to_frame()
# Single-row frame holding the grand totals, labelled "total"
add_tot = pd.DataFrame({"w_China" : df_final["CUSTOMSVALUEPHP"].sum(), 
                        "wo_China" : df_nc["CUSTOMSVALUEPHP"].sum()},
                        index=["total"])

# Combine columns to one dataframe (inner merge on the category index)
df_comp = cat_tax.merge(nc_tax, on="category")
df_comp.columns = ["w_China", "wo_China"]
# DataFrame.append was removed in pandas 2.0; pd.concat adds the "total" row
# in the same way.
df_comp = pd.concat([df_comp, add_tot])

# Add column for difference: percentage of the with-China value that
# disappears when China is excluded
df_comp["diff"] = ((df_comp["w_China"] - 
                    df_comp["wo_China"]) / 
                    df_comp["w_China"]*100)
df_comp = df_comp.reset_index().rename(columns={'index':'category'})

# Save the DataFrame to CSV
df_comp.to_csv("../post_processed_data/df_china_comparison.csv")

In [17]:
# Sanity check: dimensions and first rows of the with/without-China comparison
print(df_comp.shape)
df_comp.head()

(16, 4)


Unnamed: 0,category,w_China,wo_China,diff
0,Animal & Animal Products,534085100000.0,501340000000.0,6.131062
1,Chemicals & Allied Industries,1508835000000.0,1260627000000.0,16.450297
2,Foodstuffs,1052901000000.0,957461600000.0,9.064444
3,Footwear / Headgear,86397570000.0,31734220000.0,63.269539
4,Machinery / Electrical,7318975000000.0,6004644000000.0,17.95785


### Dataframe of Category by Customs Value

In [18]:
# Total customs value in pesos per category, ranked from largest to smallest
category_totals = df_final.groupby('category')['CUSTOMSVALUEPHP'].sum()
most_imported_category = category_totals.sort_values(ascending=False)

# Write the ranked totals (a Series indexed by category) to CSV
most_imported_category.to_csv(
    "../post_processed_data/most_imported_by_value.csv")

## Word Clouds Pre-processing

In [19]:
# Keep only the columns the word clouds need, on a fresh 0..n-1 row index
wc_columns = ['category', 'HSCODE', 'GOODSDESCRIPTION']
df_wc_prep = df_final[wc_columns].reset_index(drop=True).copy()
display(df_wc_prep)

Unnamed: 0,category,HSCODE,GOODSDESCRIPTION
0,Chemicals & Allied Industries,38112900000,LUBAD 1478
1,Transportation,87163999000,CAR TRAILER
2,Transportation,87163999000,CAR TRAILER
3,Transportation,87163999000,CAR TRAILER
4,Mineral Products,27111900000,505.00 MT LIQUEFIED PETROLEUM GAS
...,...,...,...
13662633,Mineral Products,27111900000,LPG MIXTURES
13662634,Mineral Products,27111900000,LPG MIXTURES
13662635,Mineral Products,27011290000,"71,499 MT OF PINANG COAL"
13662636,Mineral Products,27011290000,"69,950 MT OF PINANG COAL"


In [20]:
# Create H6 reference
# HSCODE6 holds "<6-digit code> <description>" text; reading it explicitly as
# str keeps pandas from inferring a dtype for the column.
hscode = (pd.read_csv('../processed_data/df_2017_complete.csv',
                      usecols=['HSCODE6'], dtype={'HSCODE6': str})
          .dropna(how='all', axis=0).drop_duplicates())

# Split every entry into the code (first 6 characters) and its description
# (everything from position 7 on), using vectorised string slicing.
hscode = (hscode.assign(H6=hscode['HSCODE6'].str[:6],
                        H6DESC=hscode['HSCODE6'].str[7:])
          .drop(columns='HSCODE6'))

h6 = hscode[['H6', 'H6DESC']].drop_duplicates().reset_index(drop=True).copy()

# Integer keys so the lookup matches the integer H6 derived from HSCODE below.
h6['H6'] = h6['H6'].astype(int)

# Keep exactly one description per code. If a code appeared with more than
# one description, the left merge below would duplicate rows and the
# positional join with df_wc_prep would silently misalign.
h6 = h6.drop_duplicates(subset='H6').reset_index(drop=True)

# Get H6 Description for main data
# Zero-pad HSCODE to 11 characters and take the first 6 as the H6 code.
dfHS = (df_wc_prep['HSCODE'].astype(str).str.zfill(11).str[:6]
        .astype(int).rename('H6').to_frame())

# The merge yields a fresh 0..n-1 index in the original row order, which
# lines up with df_wc_prep's reset index for the join. validate guards the
# one-description-per-code assumption.
dfHS = (dfHS.merge(h6, how='left', on='H6', validate='many_to_one')
        .join(df_wc_prep)
        .set_index(['category'])
        .drop(columns=['H6', 'HSCODE']))
display(dfHS)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0_level_0,H6DESC,GOODSDESCRIPTION
category,Unnamed: 1_level_1,Unnamed: 2_level_1
Chemicals & Allied Industries,Miscellaneous chemical products Anti-knock pre...,LUBAD 1478
Transportation,Vehicles others than railway or tramway rollin...,CAR TRAILER
Transportation,Vehicles others than railway or tramway rollin...,CAR TRAILER
Transportation,Vehicles others than railway or tramway rollin...,CAR TRAILER
Mineral Products,"Mineral fuels, mineral oils and products of th...",505.00 MT LIQUEFIED PETROLEUM GAS
...,...,...
Mineral Products,"Mineral fuels, mineral oils and products of th...",LPG MIXTURES
Mineral Products,"Mineral fuels, mineral oils and products of th...",LPG MIXTURES
Mineral Products,"Mineral fuels, mineral oils and products of th...","71,499 MT OF PINANG COAL"
Mineral Products,"Mineral fuels, mineral oils and products of th...","69,950 MT OF PINANG COAL"


In [21]:
# WordCloud Libraries and Stopwords
from nltk.corpus import stopwords
import nltk
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
import string

# Retrieve Stopwords
# Fetch the NLTK stopword corpus, then merge it with the wordcloud defaults
# and the extra word 'articles' into a single set.
nltk.download('stopwords')
stopwords_nltk = set(stopwords.words("english"))
mystopwords = set(STOPWORDS) | stopwords_nltk | {'articles'}

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/msds2021/jpangan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [22]:
# Create word count dictionary (H6)
from collections import Counter
import string

# Translation table that deletes every punctuation character
strip_punct = str.maketrans('', '', string.punctuation)

# Collect (category, word, count) records for all categories and build the
# DataFrame once at the end instead of growing it inside the loop.
h6_records = []
for midx in dfHS.index.unique():

    # A boolean mask always yields a Series, even when the category has a
    # single row (label-based .loc[midx] would return a scalar there).
    descriptions = dfHS.loc[dfHS.index == midx, 'H6DESC']

    s = (descriptions.to_string(index=False, header=False)
         .lower().translate(strip_punct)
         .split())

    # Keep words longer than 3 characters that are neither stopwords nor
    # purely numeric; membership is tested against the set directly.
    kept = [(k, v) for k, v in Counter(s).items()
            if k not in mystopwords and len(k) > 3 and not k.isnumeric()]

    # Most frequent first, ties broken alphabetically
    kept.sort(key=lambda x: (-x[1], x[0]))

    h6_records.extend((midx, word, count) for word, count in kept)

dfH6 = (pd.DataFrame(h6_records, columns=['category', 'word', 'count'])
        .set_index(['category', 'word']))

In [23]:
# Output to CSV file: write the H6 word-count table (dfH6), including its
# index, to the post-processed data folder
dfH6.to_csv('../post_processed_data/WCData_H6.csv')

In [24]:
# Create word count dictionary (GD)
from collections import Counter
import string

# Translation table that deletes every punctuation character
strip_punct = str.maketrans('', '', string.punctuation)

# Collect (category, word, count) records for the selected categories and
# build the DataFrame once at the end instead of growing it inside the loop.
gd_records = []
for midx in ['Machinery / Electrical', 
             'Mineral Products', 'Transportation']:

    # A boolean mask always yields a Series, even when the category has a
    # single row (label-based .loc[midx] would return a scalar there).
    in_category = dfHS.index == midx
    if not in_category.any():
        # Same failure as a label lookup of a category that is not present
        raise KeyError(midx)

    descriptions = dfHS.loc[in_category, 'GOODSDESCRIPTION']

    s = (descriptions
         .to_string(index=False, header=False)
         .lower()
         .translate(strip_punct)
         .split())

    # Keep words longer than 3 characters that are neither stopwords nor
    # purely numeric; membership is tested against the set directly.
    kept = [(k, v) for k, v in Counter(s).items()
            if k not in mystopwords and len(k) > 3 and not k.isnumeric()]

    # Most frequent first, ties broken alphabetically
    kept.sort(key=lambda x: (-x[1], x[0]))

    gd_records.extend((midx, word, count) for word, count in kept)

dfGD = (pd.DataFrame(gd_records, columns=['category', 'word', 'count'])
        .set_index(['category', 'word']))

In [25]:
# Output to CSV file: write the goods-description word-count table (dfGD),
# including its index, to the post-processed data folder
dfGD.to_csv('../post_processed_data/WCData_GD.csv')