In [904]:
import pandas as pd
import pycountry
import numpy as np

In [905]:
# Load the exports table and keep only the columns used in this notebook.
df = pd.read_csv("../output/exports.csv").loc[
    :,
    [
        "country",
        "country_of_destination",
        "year",
        "annex_3",
        "annex_4_a",
        "annex_4_b",
        "amount",
    ],
]

## Normalizing country names

In [906]:
def func(x):
    """Convert a country name to its alpha-2 code using pycountry.

    Parameters
    ----------
    x : str or missing value
        Country name as it appears in the raw data.

    Returns
    -------
    str or missing value
        The ``alpha_2`` attribute of the matching pycountry record.
        Missing values (NaN/None) are returned unchanged.

    Raises
    ------
    LookupError
        If pycountry has no country with the normalized name.
    """
    # Missing values cannot be looked up; pass them through unchanged.
    if pd.isna(x):
        return x
    # removing whitespace from beginning and end so country names get recognized properly
    name = x.strip()
    # e.g. "Venezuela (Bolivarian Republic of)" -> "Venezuela, Bolivarian Republic of"
    name = name.replace(" (", ", ").replace(")", "")
    try:
        country = pycountry.countries.get(name=name)
    except KeyError:
        # Older pycountry releases raise instead of returning None.
        country = None
    if country is None:
        # Fail with a message that names the offending value instead of an
        # opaque "'NoneType' object has no attribute 'alpha_2'".
        raise LookupError(f"Unknown country name: {x!r} (normalized to {name!r})")
    return country.alpha_2

# Replace every country name by the code returned from func.
df["country"] = df["country"].map(func)

## Cleaning code columns (annex_3, annex_4_a, annex_4_b)

In [907]:
# NEEDS CONFIRMATION
# Valid codes
# These are stored as tuples, not generator expressions: they are used for
# repeated membership tests and iterated again later, and a generator would be
# (partially) consumed by the first `in` test and empty afterwards.
unique_items_a_3 = ("H1","H3","H4.1","H4.2","H4.3","H5.1","H5.2","H6.1","H6.2","H8","H10","H11","H12","H13")
# NOTE(review): range(1, 17) yields D1..D16 - confirm that D16 is a valid code.
unique_items_a_4_a = tuple(f"D{x}" for x in range(1,17))
unique_items_a_4_b = tuple(f"R{x}" for x in range(1,14))

In [908]:
def cleaning_codes(x, letter):
    """Normalize one raw code cell into a list of upper-case, prefixed codes.

    Parameters
    ----------
    x : str or missing value
        Raw cell content; may contain several comma-separated codes.
    letter : str
        Prefix expected for the codes of this column (e.g. "H").

    Returns
    -------
    list of str, or NaN
        The cleaned codes. A missing input is returned unchanged; NaN is
        returned when no usable code remains after cleaning.
    """
    if pd.isna(x):
        return x

    cleaned = []
    # converting cell to list, because sometimes it contains more than one value
    for token in str(x).split(","):
        # Strip surrounding whitespace ("H11, 4.1" would otherwise become
        # "H 4.1") and convert to uppercase.
        code = token.strip().upper()
        # NEEDS CONFIRMATION
        # sometimes a cell contains something like this: "R_". We are treating this as nan.
        # Empty tokens produced by stray commas are skipped as well.
        if not code or "_" in code:
            continue
        # NEEDS CONFIRMATION
        # sometimes cell contains only a number. we are assuming they just didn't add the letter (H for example) in this case
        if letter not in code:
            code = letter + code
        # deleting a trailing .0 (float artefact) because it is not part of official codes;
        # only the suffix is removed so that the inside of a code is never altered
        if code.endswith(".0"):
            code = code[:-2]
        cleaned.append(code)

    return cleaned if cleaned else np.nan

In [909]:
# Clean each code column element-wise, passing the prefix letter that belongs to it.
df["annex_3"] = df["annex_3"].apply(cleaning_codes, args=("H",))
df["annex_4_a"] = df["annex_4_a"].apply(cleaning_codes, args=("D",))
df["annex_4_b"] = df["annex_4_b"].apply(cleaning_codes, args=("R",))

Some invalid values remain (for example `H14` and `H112` in `annex_3`, as the listing below shows). We treat them as typos and set them to NaN, because we cannot infer which code the official intended to enter.

In [910]:
# Distinct annex_3 codes that remain after cleaning.
# (Taking the symmetric difference with set(unique_items_a_3) would show the mismatches.)
list(df["annex_3"].dropna().explode().unique())

['H11',
 'H4.1',
 'H8',
 'H14',
 'H12',
 'H13',
 'H6.1',
 'H4.2',
 'H6.2',
 'H3',
 'H4.3',
 'H112']

In [911]:
# Distinct annex_4_a codes that remain after cleaning.
list(df["annex_4_a"].dropna().explode().unique())

['D10', 'D9', 'D5', 'D12', 'D15', 'D13', 'D1']

In [912]:
# Distinct annex_4_b codes that remain after cleaning.
list(df["annex_4_b"].dropna().explode().unique())

['R4', 'R2', 'R6', 'R9', 'R3', 'R13', 'R8', 'R5', 'R12', 'R1', 'R11']

In [913]:
def func(lst,valid):
    """Keep only the entries of ``lst`` that are contained in ``valid``.

    Returns the filtered list, or NaN when ``lst`` is not a list or when
    none of its entries is contained in ``valid``.
    """
    # Anything that is not a list (e.g. a missing value) is treated as missing.
    if not isinstance(lst, list):
        return np.nan
    kept = [code for code in lst if code in valid]
    return kept if kept != [] else np.nan

In [914]:
# Filter each code column element-wise against the valid codes that belong to it.
df["annex_3"] = df["annex_3"].apply(func, args=(unique_items_a_3,))
df["annex_4_a"] = df["annex_4_a"].apply(func, args=(unique_items_a_4_a,))
df["annex_4_b"] = df["annex_4_b"].apply(func, args=(unique_items_a_4_b,))

## Creating separate tables for codes

In [915]:
# Source: https://towardsdatascience.com/dealing-with-list-values-in-pandas-dataframes-a177e534f173
def boolean_df(item_lists, unique_items):
    """Build a boolean indicator table from a Series of lists.

    Parameters
    ----------
    item_lists : pd.Series
        Series whose elements support the ``in`` operator (here: lists of codes).
    unique_items : iterable
        Items to test for; each one becomes a column of the result.

    Returns
    -------
    pd.DataFrame
        One column per item, indexed like ``item_lists``; a cell is True
        when the element of that row contains the item.
    """
    def contains(item):
        # Boolean mask: does each row's collection contain `item`?
        return item_lists.apply(lambda codes: item in codes)

    return pd.DataFrame({item: contains(item) for item in unique_items})

In [916]:
# One indicator table per code column; rows without any code are left out.
df_bool_h = boolean_df(df["annex_3"].dropna(), unique_items_a_3)
df_bool_d = boolean_df(df["annex_4_a"].dropna(), unique_items_a_4_a)
df_bool_r = boolean_df(df["annex_4_b"].dropna(), unique_items_a_4_b)

In [917]:
# Display the boolean indicator table built from the annex_3 codes
# (one column per entry of unique_items_a_3).
df_bool_h

Unnamed: 0,H1,H3,H4.1,H4.2,H4.3,H5.1,H5.2,H6.1,H6.2,H8,H10,H11,H12,H13
0,False,False,False,False,False,False,False,False,False,False,False,True,False,False
1,False,False,True,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,True,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,True,False,False
4,False,False,False,False,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,False,False,False,False,False,False,False,False,False,False,False,True,False,False
201,False,False,False,False,False,False,False,False,False,False,False,False,False,True
202,False,False,False,False,False,False,False,False,False,False,False,False,False,True
203,False,False,False,False,False,False,False,False,False,False,False,True,False,False
