# Notebook to explore, clean, and merge our raw data

In [1]:
# pandas library for data manipulation
import pandas as pd 

# numpy library for numerical calculations
import numpy as np

# matplotlib or seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw data

### Imports data

In [2]:
imports_data = pd.read_csv('../data/raw_data/imports_main.csv')
imports_data.head()

Unnamed: 0,Importer,TEU,Month,Unlading_Port,Origin_Country,HS_Codes
0,Importer_00001,0.849421,1,4601,POLAND,990500
1,Importer_00002,1.0,1,1401,ITALY,330590
2,Importer_00003,2.0,1,5301,BELGIUM,730890
3,Importer_00004,0.035356,1,5301,GERMANY,580620
4,Importer_00005,4.0,1,4601,DENMARK,392030


In [12]:
imports_data.drop(columns=['Importer', 'Unlading_Port'], inplace=True)
imports_data.head()

Unnamed: 0,TEU,Month,Origin_Country,HS_Codes
0,0.849421,1,POLAND,990500
1,1.0,1,ITALY,330590
2,2.0,1,BELGIUM,730890
3,0.035356,1,GERMANY,580620
4,4.0,1,DENMARK,392030


### Country Codes

In [3]:
country_codes = pd.read_csv('../data/raw_data/country_codes.csv')
country_codes.head()

Unnamed: 0,name,alpha-2,alpha-3,country-code,iso_3166-2,region,sub-region,intermediate-region,region-code,sub-region-code,intermediate-region-code
0,Afghanistan,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,,142.0,34.0,
1,Åland Islands,AX,ALA,248,ISO 3166-2:AX,Europe,Northern Europe,,150.0,154.0,
2,Albania,AL,ALB,8,ISO 3166-2:AL,Europe,Southern Europe,,150.0,39.0,
3,Algeria,DZ,DZA,12,ISO 3166-2:DZ,Africa,Northern Africa,,2.0,15.0,
4,American Samoa,AS,ASM,16,ISO 3166-2:AS,Oceania,Polynesia,,9.0,61.0,


In [13]:
country_codes = country_codes[['name','alpha-3']]
country_codes.head()

Unnamed: 0,name,alpha-3
0,Afghanistan,AFG
1,Åland Islands,ALA
2,Albania,ALB
3,Algeria,DZA
4,American Samoa,ASM


### Sea Distance

In [6]:
sea_distance = pd.read_excel('../data/raw_data/CERDI-seadistance.xlsx')
sea_distance.head()

Unnamed: 0,iso1,iso2,seadistance,capitalport1,capitalport2,roaddistance,short
0,ABW,AFG,16498.8,,1471.3,,0
1,ABW,AGO,9437.06,,1184.17,,0
2,ABW,AIA,956.853,,,,0
3,ABW,ALB,8790.06,,247.952,,0
4,ABW,AND,7685.42,,453.335,,0


In [14]:
sea_distance = sea_distance[['iso1','iso2','seadistance']]
sea_distance.head()

Unnamed: 0,iso1,iso2,seadistance
0,ABW,AFG,16498.8
1,ABW,AGO,9437.06
2,ABW,AIA,956.853
3,ABW,ALB,8790.06
4,ABW,AND,7685.42


### Tariff Data

In [7]:
tariff_data = pd.read_excel('../data/raw_data/tariff_database_202405.xlsx')
tariff_data.head()

Unnamed: 0,hts8,brief_description,quantity_1_code,quantity_2_code,wto_binding_code,mfn_text_rate,mfn_rate_type_code,mfn_ave,mfn_ad_val_rate,mfn_specific_rate,...,japan_indicator,japan_rate_type_code,japan_ad_val_rate,japan_specific_rate,japan_other_rate,usmca_indicator,usmca_rate_type_code,usmca_ad_val_rate,usmca_specific_rate,usmca_other_rate
0,1012100,Live purebred breeding horses,NO,,B,Free,0,,0.0,0.0,...,,,,,,,,,,
1,1012900,Live horses other than purebred breeding horses,NO,,B,Free,0,,0.0,0.0,...,,,,,,,,,,
2,1013000,Live asses,NO,,B,6.8%,7,,0.068,0.0,...,,,,,,S,0.0,0.0,0.0,0.0
3,1019030,Mules and hinnies imported for immediate slaug...,NO,,B,Free,0,,0.0,0.0,...,,,,,,,,,,
4,1019040,Mules and hinnies not imported for immediate s...,NO,,B,4.5%,7,,0.045,0.0,...,,,,,,S,0.0,0.0,0.0,0.0


### WGI_data

In [8]:
wgi_data = pd.read_excel('../data/raw_data/wgidataset.xlsx')
wgi_data.head()

Unnamed: 0,codeindyr,code,countryname,year,indicator,estimate,stddev,nsource,pctrank,pctranklower,pctrankupper
0,AFGcc1996,AFG,Afghanistan,1996,cc,-1.291705,0.340507,2,4.301075,0,27.419355
1,ALBcc1996,ALB,Albania,1996,cc,-0.893903,0.315914,3,19.354839,2.688172,43.010754
2,DZAcc1996,DZA,Algeria,1996,cc,-0.566741,0.262077,4,33.333332,16.666666,52.688171
3,ASMcc1996,ASM,American Samoa,1996,cc,..,..,..,..,..,..
4,ADOcc1996,ADO,Andorra,1996,cc,1.318143,0.480889,1,87.096771,72.043015,96.774193


In [15]:
wgi_data.tail()

Unnamed: 0,codeindyr,code,countryname,year,indicator,estimate,stddev,nsource,pctrank,pctranklower,pctrankupper
32095,VIRva2023,VIR,Virgin Islands (U.S.),2023,va,..,..,..,..,..,..
32096,WBGva2023,WBG,West Bank and Gaza,2023,va,-1.118067,0.149837,6,18.137255,11.764706,24.509804
32097,YEMva2023,YEM,"Yemen, Rep.",2023,va,-1.550217,0.131432,8,6.372549,2.45098,11.764706
32098,ZMBva2023,ZMB,Zambia,2023,va,-0.047946,0.118482,12,45.098038,39.215687,52.450981
32099,ZWEva2023,ZWE,Zimbabwe,2023,va,-1.092633,0.118235,13,19.117647,12.745098,24.509804


In [16]:
wgi_data = wgi_data[wgi_data['year'] == 2023]
wgi_data.head()

Unnamed: 0,codeindyr,code,countryname,year,indicator,estimate,stddev,nsource,pctrank,pctranklower,pctrankupper
30816,AFGcc2023,AFG,Afghanistan,2023,cc,-1.154932,0.173359,8,13.679245,6.132075,20.754717
30817,ALBcc2023,ALB,Albania,2023,cc,-0.332219,0.158822,9,43.396225,29.716982,51.886791
30818,DZAcc2023,DZA,Algeria,2023,cc,-0.589308,0.172194,8,30.188679,20.754717,45.28302
30819,ASMcc2023,ASM,American Samoa,2023,cc,1.251356,0.461191,1,87.735847,65.56604,96.698112
30820,ADOcc2023,ADO,Andorra,2023,cc,1.251356,0.461191,1,87.735847,65.56604,96.698112


In [17]:
wgi_data = wgi_data[['code','countryname','estimate']]
wgi_data.head()

Unnamed: 0,code,countryname,estimate
30816,AFG,Afghanistan,-1.154932
30817,ALB,Albania,-0.332219
30818,DZA,Algeria,-0.589308
30819,ASM,American Samoa,1.251356
30820,ADO,Andorra,1.251356


# Data Merging

note: no need to rerun, use merged_df and merged_df2 from data/preprocessed_data

## Left Join country_codes

In [66]:
imports_data.head()

Unnamed: 0,TEU,Month,Origin_Country,HS_Codes
0,0.849421,1,poland,990500
1,1.0,1,italy,330590
2,2.0,1,belgium,730890
3,0.035356,1,germany,580620
4,4.0,1,denmark,392030


In [67]:
country_codes.head()

Unnamed: 0,name,alpha-3
0,afghanistan,AFG
1,åland islands,ALA
2,albania,ALB
3,algeria,DZA
4,american samoa,ASM


In [23]:
imports_data.dtypes

TEU               float64
Month               int64
Origin_Country     object
HS_Codes            int64
dtype: object

In [26]:
imports_data[imports_data['Origin_Country'].isnull()]

Unnamed: 0,TEU,Month,Origin_Country,HS_Codes
656331,1.0,7,,980400
2444879,1.0,3,,430219
3289576,2.0,9,,999900


In [27]:
imports_data.dropna(subset=['Origin_Country'], inplace=True)
imports_data[imports_data['Origin_Country'].isnull()]

Unnamed: 0,TEU,Month,Origin_Country,HS_Codes


In [28]:
imports_data['Origin_Country'] = imports_data['Origin_Country'].apply(lambda x: x.lower())
imports_data.tail()

Unnamed: 0,TEU,Month,Origin_Country,HS_Codes
4158449,1.0,3,china,960390
4158450,0.729055,3,turkey,610433
4158451,0.04195,3,germany,510910
4158452,2.0,3,turkey,680291
4158453,2.0,3,china,950300


In [30]:
country_codes['name'] = country_codes['name'].apply(lambda x: x.lower())
country_codes.head()

Unnamed: 0,name,alpha-3
0,afghanistan,AFG
1,åland islands,ALA
2,albania,ALB
3,algeria,DZA
4,american samoa,ASM


In [None]:
merged_df = pd.merge(imports_data, country_codes, left_on = 'Origin_Country', right_on = 'name', how = 'left')
merged_df.head()

Unnamed: 0,TEU,Month,Origin_Country,HS_Codes,name,alpha-3
0,0.849421,1,poland,990500,poland,POL
1,1.0,1,italy,330590,italy,ITA
2,2.0,1,belgium,730890,belgium,BEL
3,0.035356,1,germany,580620,germany,DEU
4,4.0,1,denmark,392030,denmark,DNK


In [44]:
merged_df['name'].isna().sum()

np.int64(1156290)

In [68]:
mismatched = list(merged_df[merged_df['name'].isna()]['Origin_Country'].unique())
print(mismatched)

['hong kong s.a.r.', 'macau s.a.r.', 'netherlands', 'korea south', 'united states', 'united kingdom', 'czech republic', 'turkey', 'vietnam', 'taiwan', 'iran', 'russia', "cote d'ivoire (ivory coast)", 'swaziland', 'saint helena', 'macedonia', 'laos', 'moldova', 'aland islands', 'virgin islands (us)', 'croatia (hrvatska)', 'reunion', 'bolivia', 'fiji islands', 'venezuela', 'man (isle of)', 'tanzania', 'east timor', 'brunei', 'syria', 'saint-barthelemy', 'vatican city state (holy see)', 'cape verde', 'korea north', 'congo the democratic republic of the', 'kosovo', 'bonaire, saint eustatius and saba', 'palestinian territory occupied']


In [69]:
len(mismatched)

38

In [65]:
for name in mismatched:
    print(name)
    print(country_codes[country_codes['name'].str.startswith(name[:3])])
    print('-'*70)

hong kong s.a.r.
          name alpha-3
99    honduras     HND
100  hong kong     HKG
----------------------------------------------------------------------
macau s.a.r.
      name alpha-3
131  macao     MAC
----------------------------------------------------------------------
netherlands
                            name alpha-3
156  netherlands, kingdom of the     NLD
----------------------------------------------------------------------
korea south
                                       name alpha-3
118  korea, democratic people's republic of     PRK
119                      korea, republic of     KOR
----------------------------------------------------------------------
united states
                                                  name alpha-3
233                               united arab emirates     ARE
234  united kingdom of great britain and northern i...     GBR
235                           united states of america     USA
236               united states minor outlying isla

In [71]:
country_corrections = dict()
for name in mismatched:
    country_corrections.update({name: None})

country_corrections

{'hong kong s.a.r.': None,
 'macau s.a.r.': None,
 'netherlands': None,
 'korea south': None,
 'united states': None,
 'united kingdom': None,
 'czech republic': None,
 'turkey': None,
 'vietnam': None,
 'taiwan': None,
 'iran': None,
 'russia': None,
 "cote d'ivoire (ivory coast)": None,
 'swaziland': None,
 'saint helena': None,
 'macedonia': None,
 'laos': None,
 'moldova': None,
 'aland islands': None,
 'virgin islands (us)': None,
 'croatia (hrvatska)': None,
 'reunion': None,
 'bolivia': None,
 'fiji islands': None,
 'venezuela': None,
 'man (isle of)': None,
 'tanzania': None,
 'east timor': None,
 'brunei': None,
 'syria': None,
 'saint-barthelemy': None,
 'vatican city state (holy see)': None,
 'cape verde': None,
 'korea north': None,
 'congo the democratic republic of the': None,
 'kosovo': None,
 'bonaire, saint eustatius and saba': None,
 'palestinian territory occupied': None}

In [84]:
for name in mismatched:
    print(f'You are correcting {name}:')
    print('Here are your options')
    options = country_codes[country_codes['name'].str.startswith(name[:3])]
    print(options)
    correct_index = input('Which index is correct?')
    if correct_index == 'na':
        print('no equivalent value')
        continue
    print(f'replacing ({name}) with ({options['name'].loc[int(correct_index)]})')
    country_corrections[name] = options['name'].loc[int(correct_index)]
    print('-'*70)

You are correcting hong kong s.a.r.:
Here are your options
          name alpha-3
99    honduras     HND
100  hong kong     HKG
replacing (hong kong s.a.r.) with (hong kong)
----------------------------------------------------------------------
You are correcting macau s.a.r.:
Here are your options
      name alpha-3
131  macao     MAC
replacing (macau s.a.r.) with (macao)
----------------------------------------------------------------------
You are correcting netherlands:
Here are your options
                            name alpha-3
156  netherlands, kingdom of the     NLD
replacing (netherlands) with (netherlands, kingdom of the)
----------------------------------------------------------------------
You are correcting korea south:
Here are your options
                                       name alpha-3
118  korea, democratic people's republic of     PRK
119                      korea, republic of     KOR
replacing (korea south) with (korea, republic of)
---------------------------

In [87]:
nones = [keys for keys in country_corrections.keys() if country_corrections[keys] == None]
nones

['united states',
 'turkey',
 "cote d'ivoire (ivory coast)",
 'swaziland',
 'macedonia',
 'aland islands',
 'virgin islands (us)',
 'reunion',
 'man (isle of)',
 'tanzania',
 'east timor',
 'vatican city state (holy see)',
 'cape verde',
 'korea north',
 'kosovo']

In [88]:
for name in nones:
    print(f'You are correcting {name}:')
    print('Here are your options')
    options = country_codes[country_codes['name'].str.startswith(name[:3])]
    print(options)
    correct_index = input('Which index is correct?')
    if correct_index == 'na':
        print('no equivalent value')
        continue
    print(f'replacing ({name}) with ({options['name'].loc[int(correct_index)]})')
    country_corrections[name] = options['name'].loc[int(correct_index)]
    print('-'*70)

You are correcting united states:
Here are your options
                                                  name alpha-3
233                               united arab emirates     ARE
234  united kingdom of great britain and northern i...     GBR
235                           united states of america     USA
236               united states minor outlying islands     UMI
replacing (united states) with (united states of america)
----------------------------------------------------------------------
You are correcting turkey:
Here are your options
                         name alpha-3
228              turkmenistan     TKM
229  turks and caicos islands     TCA
no equivalent value
You are correcting cote d'ivoire (ivory coast):
Here are your options
Empty DataFrame
Columns: [name, alpha-3]
Index: []
no equivalent value
You are correcting swaziland:
Here are your options
Empty DataFrame
Columns: [name, alpha-3]
Index: []
no equivalent value
You are correcting macedonia:
Here are your options
 

In [89]:
nones = [keys for keys in country_corrections.keys() if country_corrections[keys] == None]
nones

['turkey',
 "cote d'ivoire (ivory coast)",
 'swaziland',
 'macedonia',
 'aland islands',
 'reunion',
 'man (isle of)',
 'east timor',
 'vatican city state (holy see)',
 'cape verde',
 'kosovo']

In [93]:
country_codes[country_codes['name'].str.startswith('tim')]

Unnamed: 0,name,alpha-3
221,timor-leste,TLS


In [106]:
country_corrections['east timor'] = 'timor-leste'
country_corrections['turkey'] = 'türkiye'

In [107]:
imports_data['corrected_name'] = imports_data['Origin_Country'].replace(country_corrections)
imports_data.tail()

Unnamed: 0,TEU,Month,Origin_Country,HS_Codes,corrected_name
4158449,1.0,3,china,960390,china
4158450,0.729055,3,turkey,610433,türkiye
4158451,0.04195,3,germany,510910,germany
4158452,2.0,3,turkey,680291,türkiye
4158453,2.0,3,china,950300,china


In [108]:
merged_df = pd.merge(imports_data, country_codes, left_on = 'corrected_name', right_on = 'name', how = 'left')
merged_df.head()

Unnamed: 0,TEU,Month,Origin_Country,HS_Codes,corrected_name,name,alpha-3
0,0.849421,1,poland,990500,poland,poland,POL
1,1.0,1,italy,330590,italy,italy,ITA
2,2.0,1,belgium,730890,belgium,belgium,BEL
3,0.035356,1,germany,580620,germany,germany,DEU
4,4.0,1,denmark,392030,denmark,denmark,DNK


In [109]:
merged_df[merged_df['name'].isna()]['Origin_Country'].unique()

array(["cote d'ivoire (ivory coast)", 'swaziland', 'macedonia',
       'aland islands', 'reunion', 'man (isle of)',
       'vatican city state (holy see)', 'cape verde', 'kosovo'],
      dtype=object)

In [110]:
merged_df[merged_df['name'].isna()]['Origin_Country'].value_counts()

Origin_Country
cote d'ivoire (ivory coast)      1800
reunion                          1137
aland islands                     233
macedonia                         168
swaziland                         112
vatican city state (holy see)      17
man (isle of)                      13
cape verde                          5
kosovo                              1
Name: count, dtype: int64

In [113]:
merged_df.drop(columns=['corrected_name'], inplace=True)

## merge wgi_data

In [115]:
merged_df.head()

Unnamed: 0,TEU,Month,Origin_Country,HS_Codes,name,alpha-3
0,0.849421,1,poland,990500,poland,POL
1,1.0,1,italy,330590,italy,ITA
2,2.0,1,belgium,730890,belgium,BEL
3,0.035356,1,germany,580620,germany,DEU
4,4.0,1,denmark,392030,denmark,DNK


In [116]:
wgi_data.head()

Unnamed: 0,code,countryname,estimate
30816,AFG,Afghanistan,-1.154932
30817,ALB,Albania,-0.332219
30818,DZA,Algeria,-0.589308
30819,ASM,American Samoa,1.251356
30820,ADO,Andorra,1.251356


In [118]:
merged_df2 = pd.merge(merged_df, wgi_data, left_on = 'alpha-3', right_on = 'code', how = 'left')
merged_df2.head()

Unnamed: 0,TEU,Month,Origin_Country,HS_Codes,name,alpha-3,code,countryname,estimate
0,0.849421,1,poland,990500,poland,POL,POL,Poland,0.5654
1,0.849421,1,poland,990500,poland,POL,POL,Poland,0.421438
2,0.849421,1,poland,990500,poland,POL,POL,Poland,0.559585
3,0.849421,1,poland,990500,poland,POL,POL,Poland,0.458741
4,0.849421,1,poland,990500,poland,POL,POL,Poland,0.780194


In [137]:
merged_df2[merged_df2['code'].isna()][['name','alpha-3','code']]

Unnamed: 0,name,alpha-3,code
672,romania,ROU,
673,romania,ROU,
674,romania,ROU,
675,romania,ROU,
676,romania,ROU,
...,...,...,...
24846444,romania,ROU,
24847261,andorra,AND,
24847262,andorra,AND,
24847263,andorra,AND,


In [142]:
wgi_data[wgi_data['countryname'].str.startswith('Ro')]

Unnamed: 0,code,countryname,estimate
30974,ROM,Romania,0.040495
31188,ROM,Romania,-0.092666
31402,ROM,Romania,0.373644
31616,ROM,Romania,0.437426
31830,ROM,Romania,0.318901
32044,ROM,Romania,0.50318


In [146]:
merged_df.to_csv('../data/processed_data/merged_df.csv')
merged_df2.to_csv('../data/processed_data/merged_df2.csv')