In [1]:
import os
import re
import pandas as pd

## Data Preparation

Perform data manipulation on the CSV files in `../converted_csv_data`. The scope of the data manipulation is as follows:

- Append the data in scope per year
- Add month and year column per file
- Store the files in `../processed_data`

In [2]:
# Folder containing the converted CSV files that this notebook reads
path = '../converted_csv_data/'
# Folder that the processed yearly CSV files are written to
output = '../processed_data/'

In [3]:
# Create the `processed_data` directory next to the current working
# directory. `exist_ok=True` makes re-running this cell a no-op when the
# directory is already there, while genuine failures (e.g. missing
# permissions) are raised instead of being silently swallowed.
current_wd = os.getcwd()  # Path of current working directory
os.makedirs(os.path.join(os.path.dirname(current_wd), 'processed_data'),
            exist_ok=True)


In [4]:
# List the CSV files in the input folder.
# - `path` is reused so the folder is configured in one place only.
# - `endswith` keeps only names that really end in `.csv` (a substring test
#   would also accept names such as `data.csv.bak`).
# - `os.listdir` returns entries in arbitrary order, so the names are sorted
#   to get the same list on every run.
csv = sorted(file for file in os.listdir(path) if file.endswith('.csv'))
len(csv)

93

In [5]:
# Build a lookup keyed by file name holding the full path plus the year and
# month text parsed from names shaped like `Customs_Data_<month>-<year>`.
# The patterns are compiled once, outside the loop.
year_pattern = re.compile(r'Customs_Data_[-\w ]*-(\d{4})')
month_pattern = re.compile(r'Customs_Data_([-\w ]*)-')

csv_dict = dict()
for file in csv:
    year_match = year_pattern.search(file)
    month_match = month_pattern.search(file)
    if year_match is None or month_match is None:
        # Fail with the offending name instead of a bare IndexError
        raise ValueError(
            'Unexpected file name (cannot parse month/year): {}'.format(file))
    csv_dict[file] = {'file': path + file,
                      'year': year_match.group(1),
                      'month': month_match.group(1)}

In [6]:
# Turn the per-file dictionary into a table: one row per file, with the file
# name moved out of the index into an `index` column and `year` cast to int.
csv_df = (
    pd.DataFrame.from_dict(csv_dict, orient='index')
      .reset_index()
      .astype({'year': int})
)

In [7]:
# Count how many CSV files were found for each year
csv_df.groupby('year').size()

year
2012     4
2013     8
2014    17
2015    12
2016    12
2017    12
2018    12
2019    12
2020     4
dtype: int64

### For the purposes of this project, we include only the data from 2016 to 2020, a total of 52 files.

In [8]:
# Keep only the files from 2016 onward, drop the file-name column (`index`),
# order by year and renumber the rows from 0.
scope = (
    csv_df.loc[csv_df['year'] >= 2016]
          .drop(columns='index')
          .sort_values('year')
          .reset_index(drop=True)
)

In [9]:
# For every file in scope, record how many columns are not entirely empty
# within its first 100 rows.
for idx, file in enumerate(scope['file']):
    preview = pd.read_csv(file, nrows=100)
    non_empty = preview.dropna(axis=1, how='all')
    scope.loc[idx, 'num_of_columns'] = len(non_empty.columns)
    


In [10]:
def append_df(files, file_info=None):
    """
    Read every CSV in `files` and return them stacked into one `DataFrame`.

    Columns and rows that are entirely empty are dropped from each file
    before stacking, and `month` / `year` columns are added to each file
    from the metadata row whose `file` value equals the path.

    Parameters
    ----------
    files : iterable of str
        Paths of the CSV files to read, in the order they are stacked.
    file_info : pandas.DataFrame, optional
        Table with `file`, `month` and `year` columns. When omitted, the
        notebook-level `scope` table is used, so existing calls of the form
        `append_df(files)` keep working unchanged.

    Returns
    -------
    pandas.DataFrame
        All files concatenated with a fresh 0..n-1 index, or an empty
        `DataFrame` when `files` is empty.

    Raises
    ------
    IndexError
        If a path in `files` has no matching row in `file_info`.
    """
    if file_info is None:
        file_info = scope

    # Collect the per-file frames and concatenate once at the end:
    # `DataFrame.append` no longer exists in pandas 2.x, and appending inside
    # the loop copies the accumulated data on every iteration.
    frames = []
    for file in files:
        is_file = file_info['file'] == file
        month = file_info.loc[is_file, 'month'].values[0]
        year = file_info.loc[is_file, 'year'].values[0]

        frame = (pd.read_csv(file).dropna(axis=1, how='all')
                                  .dropna(axis=0, how='all'))
        frame['month'] = month
        frame['year'] = year
        frames.append(frame)

    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [23]:
# Combine the 2016 files for July through December into a single frame
month = ["July", "August", "September", "October", "November", "December"]
in_2016 = scope['year'].eq(2016)
in_months = scope['month'].isin(month)
files_2016 = scope.loc[in_2016 & in_months, 'file'].to_list()
df = append_df(files_2016)

In [24]:
# Post processing of 2016 data

# Columns to keep, in output order
columns_2016 = ['HSCODE', 'COUNTRYORIGIN', 'COUNTRYEXPORT',
                'GROSSMASS', 'NETMASS', 'CUSTOMSVALUE',
                'CURRENCY', 'DUTIABLE_FOREIGN', 'EXCHANGE_RATE',
                'DUTIABLEVALUE_PHP', 'FREIGHT', 'INSURANCE',
                'ARRASTRE', 'PREF_CODE', 'DUTY_PAID', 'VAT_BASE',
                'VAT_PAID', 'EXCISE_ADVALOREM_PAID', 'DUTIESTAXES',
                'GOODS_DESCRIPTION', 'month', 'year']

# Headers renamed for consistency; columns not listed keep their name.
# An explicit old -> new mapping cannot silently mislabel a column the way a
# positional `df.columns = [...]` assignment can if the two lists drift apart.
renames_2016 = {'GROSSMASS': 'GROSSMASSKGS',
                'NETMASS': 'NETMASSKGS',
                'DUTIABLEVALUE_PHP': 'DUTIABLEVALUEPHP',
                'VAT_BASE': 'VATTAXBASE',
                'EXCISE_ADVALOREM_PAID': 'EXCISEADVALOREM',
                'GOODS_DESCRIPTION': 'GOODSDESCRIPTION'}

# Select, rename and cast in one chain so that no column is assigned on a
# subset of another frame (which can trigger SettingWithCopyWarning).
df = (df[columns_2016]
      .rename(columns=renames_2016)
      .astype({'HSCODE': int}))

# Create a df_2016.csv file
output_file = output + 'df_2016.csv'
df.to_csv(output_file, index=False)


In [13]:
# Combine all 2017 files and save the result, with every column kept,
# as df_2017_complete.csv in the output folder
files_2017 = scope.loc[scope['year'] == 2017, 'file'].to_list()
output_file = output + 'df_2017_complete.csv'
append_df(files_2017).to_csv(output_file, index=False)

  if (await self.run_code(code, result,  async_=asy)):


In [14]:
# Post processing of 2017 data

# Columns to read from the combined 2017 file, in output order
columns_2017 = ['HSCODE', 'COUNTRYORIGIN', 'COUNTRYEXPORT',
                'GROSSMASS', 'NETMASS', 'CUSTOMSVALUE',
                'CURRENCY', 'DUTIABLE_FOREIGN', 'EXCHANGE_RATE',
                'DUTIABLEVALUE_PHP', 'FREIGHT', 'INSURANCE',
                'ARRASTRE', 'PREF_CODE', 'DUTY_PAID', 'VAT_BASE',
                'VAT_PAID', 'EXCISE_ADVALOREM_PAID', 'DUTIESTAXES',
                'GOODS_DESCRIPTION', 'month', 'year']

# Headers renamed for consistency; columns not listed keep their name
renames_2017 = {'GROSSMASS': 'GROSSMASSKGS',
                'NETMASS': 'NETMASSKGS',
                'DUTIABLEVALUE_PHP': 'DUTIABLEVALUEPHP',
                'VAT_BASE': 'VATTAXBASE',
                'EXCISE_ADVALOREM_PAID': 'EXCISEADVALOREM',
                'GOODS_DESCRIPTION': 'GOODSDESCRIPTION'}

# Read only the needed columns from the file named by `output_file`
df = pd.read_csv(output_file, usecols=columns_2017)

# `usecols` does not control column order, so select again to enforce it,
# then apply the new names
df = df[columns_2017].rename(columns=renames_2017)

# Create a df_2017.csv file
output_file = output + 'df_2017.csv'
df.to_csv(output_file, index=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [15]:
# Combine all 2018 files and save the result as df_2018.csv in the output folder
files_2018 = scope.loc[scope['year'] == 2018, 'file'].to_list()
output_file = output + 'df_2018.csv'
append_df(files_2018).to_csv(output_file, index=False)

In [16]:
# Combine all 2019 files into a single frame (kept in `df`)
is_2019 = scope['year'].eq(2019)
files_2019 = scope.loc[is_2019, 'file'].to_list()
df = append_df(files_2019)

In [17]:
# Inspect the candidate outlier: display the single row with the largest
# VAT_PAID. Nothing is removed here; this expression only shows the row.
df.sort_values(by='VAT_PAID', ascending=False).head(1)

Unnamed: 0,HSCODE,COUNTRYORIGIN,COUNTRYEXPORT,GROSSMASSKGS,NETMASSKGS,CUSTOMSVALUE,CURRENCY,DUTIABLE_FOREIGN,EXCHANGE_RATE,DUTIABLEVALUEPHP,...,ARRASTRE,PREF_CODE,DUTY_PAID,VATTAXBASE,VAT_PAID,EXCISEADVALOREM,DUTIESTAXES,GOODSDESCRIPTION,month,year
903471,85331090000,JP,JAPAN,5.3,5.3,85331090000.0,USD,85331090000.0,52.522,4481760000000.0,...,0.0,,0,4487361717889,538483406146,0,0,RESISTOR,May,2019


In [18]:
# Drop every row whose VAT_PAID equals 538483406146, printing the frame's
# shape before and after so the number of removed rows is visible.
shape_before = df.shape
print(shape_before, '\n')
df = df.loc[df['VAT_PAID'].ne(538483406146)]
print(df.shape)

(3794776, 22) 

(3794775, 22)


In [19]:
# Write the 2019 frame to df_2019.csv in the output folder (no index column)
output_file = '{}df_2019.csv'.format(output)
df.to_csv(output_file, index=False)

In [20]:
# Combine all 2020 files and save the result as df_2020.csv in the output folder
files_2020 = scope.loc[scope['year'] == 2020, 'file'].to_list()
output_file = output + 'df_2020.csv'
append_df(files_2020).to_csv(output_file, index=False)

  if (await self.run_code(code, result,  async_=asy)):
