In [354]:
import pandas as pd
import pycountry
import numpy as np
import re
import unidecode

In [355]:
df = pd.read_csv("../output/exports.csv")[["country", "country_of_destination", "year","annex_3", "annex_4_a", "annex_4_b", "amount"]]
df.info()
initial_len = len(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94025 entries, 0 to 94024
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   country                 94025 non-null  object
 1   country_of_destination  93982 non-null  object
 2   year                    94025 non-null  int64 
 3   annex_3                 61801 non-null  object
 4   annex_4_a               29088 non-null  object
 5   annex_4_b               65992 non-null  object
 6   amount                  93990 non-null  object
dtypes: int64(1), object(6)
memory usage: 5.0+ MB


  df = pd.read_csv("../output/exports.csv")[["country", "country_of_destination", "year","annex_3", "annex_4_a", "annex_4_b", "amount"]]


In [356]:
missing = df.isna().sum()
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'missing': missing})

initial_annex_3 = int(missing_value_df.iloc[[3]].missing)

## Normalizing country names
### Converting country name to its alpha 2 code (https://en.wikipedia.org/wiki/ISO_3166-1)

In [357]:
def func(x):
    # -- Cleaning and formatting country name strings to get recognized by pycountry --
    # removing whitespace from beginning and end so country names get recognized properly
    temp = x.strip()
    # e.g. "Venezuela (Bolivarian Republic of) -> "Venezuela, Bolivarian Republic of"
    temp = temp.replace(" (",", ").replace(")","")
    # Côte d´Ivoire --> Côte d'Ivoire
    temp = temp.replace("´", "'")
    # Handling
    if temp == "United Kingdom of Great Britain and Northern Ireland":
        temp = "United Kingdom"
    if temp == "Türkiye":
        temp = "Turkey"
    if temp == "Swaziland":
        temp = "Eswatini"
    if temp == "Republic of Moldova":
        temp = "Moldova, Republic of"
    if temp == "Democratic Republic of the Congo" or temp == "Congo, Democratic Republic of the" or temp == "Congo, Republic of the":
        temp = "Congo, The Democratic Republic of the"
    if temp == "State of Palestine":
        temp = "Palestine, State of"
    if temp == "Bolivia":
        temp = "Bolivia, Plurinational State of"
     # -- Converting country name to alpha_2 code, which is what the other columns use --
    return pycountry.countries.get(name=temp).alpha_2

df["country"] = df["country"].apply(func)

### Checking if alpha 2 codes of other columns are correct

In [358]:
def check(x):
    if not pd.isna(x):
        temp = x.strip()
        temp = unidecode.unidecode(temp)
        temp = temp.replace("\xa0", "")
        if temp == "":
            return np.nan
        temp = "".join([ c if c.isalnum() else "" for c in temp ])
        if temp == "UK":
            temp = "GB"
        country = pycountry.countries.get(alpha_2=temp)
        if country is None:
            country = pycountry.countries.get(alpha_3=temp)
            if country is None:
                # There are still some invalid values left. We treat them as typos and therefore as nan, because we cannot infer the code that the official wanted to enter
                print(x)
                return np.nan
            else:
                return country.alpha_2
        else:
            return temp
    else:
        return np.nan

df["country_of_destination"] = df["country_of_destination"].apply(check)

A
A
A
A
A
A
HL
HL
HL
HL
HL
CS
HL
HL
HL
HL
SP
SP
F
F
GB,B
GB,B
GB,B
GB,B
GB,B
TR,D
BY,D
Eire
YU


In [359]:
df[(df["year"] == 2016) & (df["country"] == "AR")].country_of_destination.iloc[[0]].values[0]

'FR'

## Cleaning code columns (annex_3, annex_4_a, annex_4_b)

In [360]:
# NEEDS CONFIRMATION
# Valid codes
unique_items_a_3 = ("H1","H3","H4.1","H4.2","H4.3","H5.1","H5.2","H6.1","H6.2","H8","H10","H11","H12","H13")
unique_items_a_4_a = [f"D{x}" for x in range(1,17)]
unique_items_a_4_b = [f"R{x}" for x in range(1,14)]

In [361]:
def init_clean(x):
    temp = x.upper()
    temp = temp.strip()
    # deleting .0 because not part of official codes
    temp = temp.replace(".0", "")
    #sometimes someone writes e.g. H03, but 0 never part of official codes
    temp = temp.replace("0","")
    #sometimes people use whitespace to separate the letter and the number
    temp = temp.replace(" ","")
    return temp

def cleaning_codes(x, letter, unique):
    if not pd.isna(x):
        if x == "" or x == "--" or x == "-":
            return np.nan
        temp = unidecode.unidecode(x)
        # converting cell to list, because sometimes it contains more than one value, replacing other possible separators with commas
        temp = temp.replace("/", ",").replace(";",",").replace("\n",",").replace("，", ",")
        # basic cleaning
        lst = [init_clean(x) for x in temp.split(",")]
        # NEEDS CONFIRMATION
        # sometimes cell contains only a number. we are assuming they just didn't add the letter (H for example) in this case
        lst = [letter + x if letter not in x else x for x in lst]

        # NEEDS CONFIRMATION
        # sometimes a cell contains something like this: "R_". We are treating this as nan
        lst = [x for x in lst if not f"{letter}_" in x and not f"{letter}*" in x]
        if lst != []:
            return lst
        else:
            return np.nan
    else:
        return x

In [362]:
df["annex_3"] = df.apply(lambda x: cleaning_codes(x["annex_3"], "H", unique_items_a_3), axis=1)
df["annex_4_a"] = df.apply(lambda x: cleaning_codes(x["annex_4_a"], "D", unique_items_a_4_a), axis=1)
df["annex_4_b"] = df.apply(lambda x: cleaning_codes(x["annex_4_b"], "R", unique_items_a_4_b), axis=1)

There are still some invalid values left. Some are just typos.We convert them to nan because we cannot infer the code that the official wanted to enter

In [363]:
# I think we can improve cleaning
list(set(list(df[df["annex_3"].notna()]["annex_3"].explode().unique())) - set(unique_items_a_3))

['HN',
 "H'7",
 'HA',
 'H3H4.1H12H13',
 'H61',
 'HE12',
 'H4',
 'HNR',
 'H3H12',
 'H112',
 'H3B',
 'H6.1(H3)',
 'HP15',
 'HO(CAN)',
 'H4.1H12H13',
 'HE11',
 'HE13',
 'HEJFARL',
 'H6.H8',
 'HP6',
 'H6',
 'H3H4.1H12',
 'HP14EU',
 'H3H6.1H11H12H13',
 'HN4.1',
 'HNA',
 'H6.1(3)',
 'H8.',
 'H8.H12',
 'H3H4.1H6.1H12H13',
 'H4999999999999996',
 'H3.4',
 'H5.5',
 'H17',
 'H12H13',
 'H',
 'H6.1.',
 'H3.',
 'H4.1.H6.1',
 'H3H6.1H11H12',
 'H6.2.',
 '1-H11',
 'H4.1.',
 'HE4.3',
 'HNEJ',
 'HNOTLISTED',
 'HNOTSPECIFIED',
 'H41',
 'H8H12',
 'HE9',
 'H4.2H4.3',
 'HETC',
 'H9',
 'H4.1H6.1',
 'H12.1',
 'H2',
 'H1.1',
 'HE3',
 'H6.1.H11',
 'H3H4.1H5.1H6.1H8H11H12',
 'H4.1H4.2',
 'H3.1',
 'HN11',
 'H6.1H8',
 'H34',
 'HY18',
 'H3A',
 'H14',
 'H6.1H11H12',
 'H11H12',
 'H5',
 'H1AH13',
 'H3H6.1',
 'H6.1-8',
 'H31',
 'H11.12',
 "H'6.1",
 'H3H6.1H8',
 'HP3',
 'H5H13',
 'HP14',
 'H8.1',
 'H12.',
 'H6.1H',
 'H7',
 'H12HAZ',
 'H3H4.1',
 'H4.1H5.1H6.1H8H11H12H13',
 'H33-35',
 'HN13',
 'HAZ',
 'H1.8',
 'H811',
 'H4

In [364]:
list(set(list(df[df["annex_4_a"].notna()]["annex_4_a"].explode().unique())) - set(unique_items_a_4_a))

['DXX',
 'DUA',
 'HD14',
 'D18',
 'DZA',
 'DAT',
 'DR4',
 'DIL',
 'DR3',
 'DKR',
 'DPR',
 'DLV',
 'DR5',
 'DGB',
 'D13+D1',
 'DJP',
 'D',
 'D17',
 'D19',
 'DE',
 'DBE',
 'DCN']

In [365]:
list(set(list(df[df["annex_4_b"].notna()]["annex_4_b"].explode().unique())) - set(unique_items_a_4_b))

['R', 'RD12', 'RD1', 'R14', 'RXX', 'R15', 'MIXEDR', 'R16']

In [366]:
def func(lst,valid):
    if isinstance(lst, list):
        temp = [x for x in lst if x in valid]
        if temp != []:
            return temp
        else:
            return np.nan
    else:
        return np.nan

In [367]:
df["annex_3"] = df.apply(lambda x: func(x["annex_3"], unique_items_a_3), axis=1)
df["annex_4_a"] = df.apply(lambda x: func(x["annex_4_a"], unique_items_a_4_a), axis=1)
df["annex_4_b"] = df.apply(lambda x: func(x["annex_4_b"], unique_items_a_4_b), axis=1)

## Cleaning amount

In [368]:
def func(x):
    if not pd.isna(x):
        if isinstance(x, float):
            return x
        else:
            temp = str(x)
            # reomving whitespace
            temp = temp.replace(" ", "")
            # converting , to .
            temp = temp.replace(",",".")
            # replacing all non numeric characters but .
            temp = re.sub("[^0-9.]", "", temp)
            temp = float(temp)
            return temp
    else:
        return np.nan

df["amount"] = df.apply(lambda x: func(x["amount"]), axis=1)

## Dealing with missing values

We are going to drop all rows that have missing values for amount and country_of_destination.

In [369]:
df = df[df['amount'].notna()]
df = df[df['country_of_destination'].notna()]

In [370]:
print(f"Lost {initial_len - len(df)} rows")

Lost 84 rows


In [371]:
missing = df.isna().sum()
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'missing': missing})

print(f" after processing {int(missing_value_df.iloc[[3]].missing) - initial_annex_3} less rows of annex_3")

 after processing 6630 less rows of annex_3


## Creating separate tables for codes

In [372]:
# Source: https://towardsdatascience.com/dealing-with-list-values-in-pandas-dataframes-a177e534f173
def boolean_df(item_lists, unique_items):# Create empty dict
    bool_dict = {}

    # Loop through all the tags
    for i, item in enumerate(unique_items):
        # Apply boolean mask
        bool_dict[item] = item_lists.apply(lambda x: item in x)

    # Return the results as a dataframe
    return pd.DataFrame(bool_dict)

In [373]:
df_bool_h = boolean_df(df[df["annex_3"].notna()]["annex_3"], unique_items_a_3)
df_bool_d = boolean_df(df[df['annex_4_a'].notna()]["annex_4_a"], unique_items_a_4_a)
df_bool_r = boolean_df(df[df["annex_4_b"].notna()]["annex_4_b"], unique_items_a_4_b)

In [374]:
df = pd.concat([df[["country", "country_of_destination", "year", "amount"]], df_bool_h, df_bool_d, df_bool_r], axis=1)
df.reset_index(drop=True, inplace=True)
df[df.columns.intersection([*unique_items_a_3, *unique_items_a_4_a, *unique_items_a_4_b])] = df[df.columns.intersection([*unique_items_a_3, *unique_items_a_4_a, *unique_items_a_4_b])].fillna(False)

In [375]:
df.to_csv("../output/processed/clean.csv")