In [106]:
import pandas as pd
import pycountry
import numpy as np
import re
import os
from dotenv import load_dotenv
import unidecode
import pycountry
from geojson import FeatureCollection, dump
import requests
from collections.abc import Iterable

In [107]:
df = pd.read_csv("../output/exports.csv")[["country", "country_of_destination", "year","annex_3", "annex_4_a", "annex_4_b", "amount"]]
df.info()
initial_len = len(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94025 entries, 0 to 94024
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   country                 94025 non-null  object
 1   country_of_destination  93982 non-null  object
 2   year                    94025 non-null  int64 
 3   annex_3                 61801 non-null  object
 4   annex_4_a               29088 non-null  object
 5   annex_4_b               65992 non-null  object
 6   amount                  93990 non-null  object
dtypes: int64(1), object(6)
memory usage: 5.0+ MB


  df = pd.read_csv("../output/exports.csv")[["country", "country_of_destination", "year","annex_3", "annex_4_a", "annex_4_b", "amount"]]


In [108]:
missing = df.isna().sum()
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'missing': missing})

initial_annex_3 = int(missing_value_df.iloc[[3]].missing)

In [109]:
df

Unnamed: 0,country,country_of_destination,year,annex_3,annex_4_a,annex_4_b,amount
0,Andorra,NZ,2021,H3,,"R1,R13",500.0
1,Andorra,PG,2021,"H6.1,H8,H10","D5,D9",,100.0
2,Andorra,PG,2021,"H6.1,H8,H10","D5,D9",,100.0
3,Andorra,NZ,2021,"H6.1,H11,H12",,"R1,R13",300.0
4,Andorra,NZ,2021,"H6.1,H11,H12",D10,,250.0
...,...,...,...,...,...,...,...
94020,United Kingdom of Great Britain and Northern I...,DE,2001,H4.2,,R4,105.98
94021,United Kingdom of Great Britain and Northern I...,DE,2001,H4.2,,R4,105.98
94022,United Kingdom of Great Britain and Northern I...,NO,2001,"H10,H11",,"R3,R4",6.0
94023,Uzbekistan,KZ,2001,,,R2,1683.7


## Normalizing country names
### Converting country name to its alpha 2 code (https://en.wikipedia.org/wiki/ISO_3166-1)

In [110]:
def func(x):
    # -- Cleaning and formatting country name strings to get recognized by pycountry --
    # removing whitespace from beginning and end so country names get recognized properly
    temp = x.strip()
    # e.g. "Venezuela (Bolivarian Republic of) -> "Venezuela, Bolivarian Republic of"
    temp = temp.replace(" (",", ").replace(")","")
    # Côte d´Ivoire --> Côte d'Ivoire
    temp = temp.replace("´", "'")
    # Handling
    if temp == "United Kingdom of Great Britain and Northern Ireland":
        temp = "United Kingdom"
    if temp == "Türkiye":
        temp = "Turkey"
    if temp == "Swaziland":
        temp = "Eswatini"
    if temp == "Republic of Moldova":
        temp = "Moldova, Republic of"
    if temp == "Democratic Republic of the Congo" or temp == "Congo, Democratic Republic of the" or temp == "Congo, Republic of the":
        temp = "Congo, The Democratic Republic of the"
    if temp == "State of Palestine":
        temp = "Palestine, State of"
    if temp == "Bolivia":
        temp = "Bolivia, Plurinational State of"
     # -- Converting country name to alpha_2 code, which is what the other columns use --
    return pycountry.countries.get(name=temp).alpha_2.lower()

df["country"] = df["country"].apply(func)

### Checking if alpha 2 codes of other columns are correct

In [111]:
def check(x):
    if not pd.isna(x):
        temp = x.strip()
        temp = unidecode.unidecode(temp)
        temp = temp.replace("\xa0", "")
        if temp == "":
            return np.nan
        temp = "".join([ c if c.isalnum() else "" for c in temp ])
        if temp == "UK":
            temp = "GB"
        country = pycountry.countries.get(alpha_2=temp)
        if country is None:
            country = pycountry.countries.get(alpha_3=temp)
            if country is None:
                # There are still some invalid values left. We treat them as typos and therefore as nan, because we cannot infer the code that the official wanted to enter
                print(x)
                return np.nan
            else:
                return country.alpha_2.lower()
        else:
            return temp.lower()
    else:
        return np.nan

df["country_of_destination"] = df["country_of_destination"].apply(check)

A
A
A
A
A
A
HL
HL
HL
HL
HL
CS
HL
HL
HL
HL
SP
SP
F
F
GB,B
GB,B
GB,B
GB,B
GB,B
TR,D
BY,D
Eire
YU


## Cleaning code columns (annex_3, annex_4_a, annex_4_b)

In [112]:
# NEEDS CONFIRMATION
# Valid codes
unique_items_a_3 = ("H1","H3","H4.1","H4.2","H4.3","H5.1","H5.2","H6.1","H6.2","H8","H10","H11","H12","H13")
unique_items_un = ["UN1","UN3","UN4.1","UN4.2","UN4.3","UN5.1","UN5.2","UN6.1","UN6.2","UN8","UN9"]
unique_items_a_4_a = [f"D{x}" for x in range(1,17)]
unique_items_a_4_b = [f"R{x}" for x in range(1,14)]

In [113]:
def init_clean(x):
    temp = x.upper()
    temp = temp.strip()
    # deleting .0 because not part of official codes
    temp = temp.replace(".0", "")
    #sometimes someone writes e.g. H03, but 0 never part of official codes
    temp = temp.replace("0","")
    #sometimes people use whitespace to separate the letter and the number
    #temp = temp.replace(" ","")
    
    # additional
    temp = temp.replace("E","")
    temp = temp.replace("N","")
    temp = temp.replace("'","")
    temp2 = list(temp)
    for i, v in enumerate(temp2):
        if i>0 and v==' ' and temp2[i-1] == 'H':
            del temp2[i] 
            del i
    temp = "".join(temp2) 
    
    d = 'H' 
    temp = [d + e for e in temp.split(d)]
    temp = [e[:-1] if e[-1] == "." else e for e in temp]
    temp = [e[:-1] if e[-1] == "H" else e for e in temp]
    temp = [e.replace(' ', '') for e in temp]
    temp = [e.replace('Y', '') for e in temp]
    temp = [e.replace('B', '') for e in temp]
    temp = [e.replace('(', '') for e in temp]
    temp = [e.replace('P', '') for e in temp]
    temp = [e.replace(')', '') for e in temp]
    temp = [e.replace('A', '') for e in temp]
    temp = list(filter(None, temp))
    return temp

def cleaning_codes(x, letter, unique):
    if not pd.isna(x):
        if x == "" or x == "--" or x == "-":
            return np.nan
        temp = unidecode.unidecode(x)
        # converting cell to list, because sometimes it contains more than one value, replacing other possible separators with commas
        temp = temp.replace("/", ",").replace(";",",").replace("\n",",").replace("，", ",")
        # basic cleaning
        lst = [init_clean(x) for x in temp.split(",")]
        
        # NEW: flatten list if nested 
        lst = flatten(lst)
        
        # NEEDS CONFIRMATION
        # sometimes cell contains only a number. we are assuming they just didn't add the letter (H for example) in this case
        lst = [letter + x if letter not in x else x for x in lst]

        # NEEDS CONFIRMATION
        # sometimes a cell contains something like this: "R_". We are treating this as nan
        lst = [x for x in lst if not f"{letter}_" in x and not f"{letter}*" in x]
        if lst != []:
           # print(lst)
         #   lst = [d + e for e in lst[0].split(d) if e and len(lst)==1]
         #   lst = [e[:-1] if e[-1] == "." else e for e in lst]
            return lst
        else:
            return np.nan
    else:
        return x

def flatten(xs):
    for x in xs:
        if isinstance(x, Iterable) and not isinstance(x, (str, bytes)):
            yield from flatten(x)
        else:
            yield x

In [114]:
df["annex_3"] = df.apply(lambda x: cleaning_codes(x["annex_3"], "H", unique_items_a_3), axis=1)
df["annex_4_a"] = df.apply(lambda x: cleaning_codes(x["annex_4_a"], "D", unique_items_a_4_a), axis=1)
df["annex_4_b"] = df.apply(lambda x: cleaning_codes(x["annex_4_b"], "R", unique_items_a_4_b), axis=1)

There are still some invalid values left. Some are just typos.We convert them to nan because we cannot infer the code that the official wanted to enter

In [115]:
# I think we can improve cleaning
v = list(set(list(df[df["annex_3"].notna()]["annex_3"].explode().unique())) - set(unique_items_a_3))
print(len(v))
v # 107 -> 97 -> 88 -> 62 -> 56 -> 54 -> 52 -> 49 -> 45 -> 42 -> 45 

45


['H6.1.',
 'H4',
 'H8.',
 'H17',
 'HXX',
 'HR',
 'H12.1',
 'H14',
 'H14U',
 'HOC',
 'H3.1',
 'HJ',
 'H5',
 'H1.1',
 'HOTSCIFID',
 'H9',
 'H7',
 'H11.12',
 'H1.8',
 'H4.4',
 'H6',
 'H6.13',
 'H4999999999999996',
 'H8.1',
 'H18',
 'H34',
 'HTC',
 'HZ',
 'H61',
 'H41',
 'H112',
 'H15',
 'H5.5',
 'H31',
 'H4.1.',
 'H2',
 'H811',
 'H.',
 'HOTLISTD',
 'H3.4',
 'H6.1-8',
 'H1-',
 'H33-35',
 'H',
 'HJFRL']

In [116]:
list(set(list(df[df["annex_4_a"].notna()]["annex_4_a"].explode().unique())) - set(unique_items_a_4_a))

['HD1',
 'HD',
 'HD19',
 'HD15',
 'DHR',
 'DHJ',
 'DHZ',
 'HD18',
 'HD4',
 'DHLV',
 'HD8',
 'DHU',
 'HD12',
 'DHC',
 'DH',
 'DHR5',
 'HD16',
 'DHT',
 'HD2',
 'HD9',
 'HD17',
 'DHR3',
 'HD14',
 'DHKR',
 'HD6',
 'DH13',
 'HD13',
 'HD3',
 'HD13+D1',
 'HDXX',
 'DHG',
 'DHR4',
 'HD5',
 'HD11',
 'DHIL',
 'DH1']

In [117]:
list(set(list(df[df["annex_4_b"].notna()]["annex_4_b"].explode().unique())) - set(unique_items_a_4_b))

['HR14',
 'HRXX',
 'HR12',
 'HR',
 'RHD1',
 'HR11',
 'RH9',
 'HR9',
 'RH5',
 'HR16',
 'HMIXDR',
 'HR8',
 'HR6',
 'HR5',
 'HR7',
 'HR1',
 'RHD12',
 'HR4',
 'HR15',
 'HR3',
 'HR13',
 'HR2']

In [118]:
def func(lst,valid):
    if isinstance(lst, list):
        temp = [x for x in lst if x in valid]
        if temp != []:
            return temp
        else:
            return np.nan
    else:
        return np.nan

In [119]:
df["annex_3"] = df.apply(lambda x: func(x["annex_3"], unique_items_a_3), axis=1)
df["annex_4_a"] = df.apply(lambda x: func(x["annex_4_a"], unique_items_a_4_a), axis=1)
df["annex_4_b"] = df.apply(lambda x: func(x["annex_4_b"], unique_items_a_4_b), axis=1)

In [120]:
df['annex_3'].isnull().sum() # urspr. 38 909 -> 38 728 -> 38 720 -> 38 718
#len(df) #-> 93004

38718

In [121]:
def un_code(h_code):
    if isinstance(h_code, list):
        return [s.replace("H", "UN") if s not in ("H10", "H11", "H12", "H13") else "UN9" for s in h_code]
    else:
        return np.nan

In [122]:
df['UN_code'] = df['annex_3'].apply(un_code)

## Cleaning amount

In [123]:
def func(x):
    if not pd.isna(x):
        if isinstance(x, float):
            return x
        else:
            temp = str(x)
            # removing whitespace
            temp = temp.replace(" ", "")
            # converting , to .
            temp = temp.replace(",",".")
            # replacing all non numeric characters but .
            temp = re.sub("[^0-9.]", "", temp)
            temp = float(temp)
            return temp
    else:
        return np.nan

df["amount"] = df.apply(lambda x: func(x["amount"]), axis=1)

## Cleaning year

In [124]:
def func(x):
    if not pd.isna(x):
        return int(x)
    else:
        return np.nan

df["year"] = df.apply(lambda x: func(x["year"]), axis=1)

## Dealing with missing values

We are going to drop all rows that have missing values for amount and country_of_destination.

In [125]:
df[df["country"]=="NA"] = np.nan
df = df[df['amount'].notna()]
df = df[df['country_of_destination'].notna()]
df = df[df['country'].notna()]

In [126]:
print(f"Lost {initial_len - len(df)} rows")

Lost 84 rows


In [127]:
missing = df.isna().sum()
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'missing': missing})

print(f" after processing {int(missing_value_df.iloc[[3]].missing) - initial_annex_3} less rows of annex_3. Initial value: {initial_annex_3}")
df

 after processing 6443 less rows of annex_3. Initial value: 32224


Unnamed: 0,country,country_of_destination,year,annex_3,annex_4_a,annex_4_b,amount,UN_code
0,ad,nz,2021.0,[H3],,,500.00,[UN3]
1,ad,pg,2021.0,"[H6.1, H8, H1]",,,100.00,"[UN6.1, UN8, UN1]"
2,ad,pg,2021.0,"[H6.1, H8, H1]",,,100.00,"[UN6.1, UN8, UN1]"
3,ad,nz,2021.0,"[H6.1, H11, H12]",,,300.00,"[UN6.1, UN9, UN9]"
4,ad,nz,2021.0,"[H6.1, H11, H12]",,,250.00,"[UN6.1, UN9, UN9]"
...,...,...,...,...,...,...,...,...
94020,gb,de,2001.0,[H4.2],,,105.98,[UN4.2]
94021,gb,de,2001.0,[H4.2],,,105.98,[UN4.2]
94022,gb,no,2001.0,"[H1, H11]",,,6.00,"[UN1, UN9]"
94023,uz,kz,2001.0,,,,1683.70,


## Dealing with same origin & destination country

In [128]:
C = np.where(df.country_of_destination==df.country)
C = list(C[0])

In [129]:
df = df.drop(index=C)

In [130]:
print(f"After deletion of flows with same origin & destination in total {len(C)} less rows")

After deletion of flows with same origin & destination in total 937 less rows


In [131]:
# UN code 9 often more than one time in UN_code list
def unique(h_code):
    if type(h_code)!= float:
        return np.unique(h_code)
    else:
        return np.nan

In [132]:
df['UN_code'] = df['UN_code'].apply(unique)
df

Unnamed: 0,country,country_of_destination,year,annex_3,annex_4_a,annex_4_b,amount,UN_code
0,ad,nz,2021.0,[H3],,,500.00,[UN3]
1,ad,pg,2021.0,"[H6.1, H8, H1]",,,100.00,"[UN1, UN6.1, UN8]"
2,ad,pg,2021.0,"[H6.1, H8, H1]",,,100.00,"[UN1, UN6.1, UN8]"
3,ad,nz,2021.0,"[H6.1, H11, H12]",,,300.00,"[UN6.1, UN9]"
4,ad,nz,2021.0,"[H6.1, H11, H12]",,,250.00,"[UN6.1, UN9]"
...,...,...,...,...,...,...,...,...
94020,gb,de,2001.0,[H4.2],,,105.98,[UN4.2]
94021,gb,de,2001.0,[H4.2],,,105.98,[UN4.2]
94022,gb,no,2001.0,"[H1, H11]",,,6.00,"[UN1, UN9]"
94023,uz,kz,2001.0,,,,1683.70,


In [133]:
newdf = df[df["UN_code"].apply(lambda x: type(x) != float)]

In [134]:
newerdf = newdf[newdf["UN_code"].apply(lambda x: len(x) > 1)]
newerdf
print(f'{newerdf["amount"].sum() / df["amount"].sum() *100} percent of the total amount belongs to multiple categories.')

2.2850788773185773 percent of the total amount belongs to multiple categories.


In [135]:
len(df)-len(newdf)
len(newerdf)

8081

In [136]:
noCode = df[df["UN_code"].apply(lambda x: type(x) == float)]
total_amount = df["amount"].sum()
print(f'{noCode["amount"].sum() / df["amount"].sum() *100} percent of the total amount belongs NaN h code.') #65.91

65.8062173294868 percent of the total amount belongs NaN h code.


## Creating separate columns for codes

In [137]:
df[unique_items_un + ["multiple", "unspecified"]] = 0
for index, row in df.iterrows():
    if type(row["UN_code"]) != float:
        if len(row["UN_code"]) == 1:
            df.loc[index, row["UN_code"][0]] = row["amount"]
        else:
            df.loc[index, "multiple"] = row["amount"]
    else:
        df.loc[index, "unspecified"] = row["amount"]

## Renaming and deleting colums

In [138]:
total_amount = df["amount"].sum()
df.rename(columns={"country": "origin", "country_of_destination": "destination"}, inplace=True)
df.drop(['amount', 'UN_code', "annex_3", "annex_4_a", "annex_4_b"], axis=1, inplace=True)

## Aggregating


In [139]:
cols = ["origin","destination","year"]
agg_functions = {"UN1" : "sum" ,"UN3": "sum","UN4.1": "sum","UN4.2": "sum","UN4.3": "sum","UN5.1": "sum","UN5.2": "sum","UN6.1": "sum","UN6.2": "sum","UN8": "sum","UN9": "sum", "unspecified": "sum", "multiple": "sum"}
df_new = df.groupby(cols, dropna=False).aggregate(agg_functions).reset_index()

In [140]:
df_new

Unnamed: 0,origin,destination,year,UN1,UN3,UN4.1,UN4.2,UN4.3,UN5.1,UN5.2,UN6.1,UN6.2,UN8,UN9,unspecified,multiple
0,ad,es,2002.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7777.400,0.00
1,ad,es,2003.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,486.800,0.00
2,ad,es,2004.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,425.980,0.00
3,ad,es,2005.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,621.950,0.00
4,ad,es,2006.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1044.092,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8114,za,se,2015.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000.0,0.000,0.00
8115,za,sg,2010.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,720.0,0.000,0.00
8116,za,tr,2018.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,21.24
8117,zm,fi,2002.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,235.000,0.00


In [141]:
overall = ["UN1", "UN3", "UN4.1", "UN4.2", "UN4.3", "UN5.1", "UN5.2", "UN6.1", "UN6.2", "UN8", "UN9"]
print(f'{df_new["multiple"].sum() / total_amount *100} percent of total amount is multiple')
print(f'{(df_new["UN1"].sum()) / total_amount *100} percent of the total amount belongs UN1.')
print(f'{(df_new["UN3"].sum()) / total_amount *100} percent of the total amount belongs UN3.')
print(f'{(df_new["UN4.1"].sum()) / total_amount *100} percent of the total amount belongs UN4.1.')
print(f'{(df_new["UN4.2"].sum()) / total_amount *100} percent of the total amount belongs UN4.2.')
print(f'{(df_new["UN4.3"].sum()) / total_amount *100} percent of the total amount belongs UN4.3.')
print(f'{(df_new["UN5.1"].sum()) / total_amount *100} percent of the total amount belongs UN5.1.')
print(f'{(df_new["UN5.2"].sum()) / total_amount *100} percent of the total amount belongs UN5.2.')
print(f'{(df_new["UN6.1"].sum()) / total_amount *100} percent of the total amount belongs UN6.1.')
print(f'{(df_new["UN6.2"].sum()) / total_amount *100} percent of the total amount belongs UN6.2.')
print(f'{(df_new["UN8"].sum()) / total_amount *100} percent of the total amount belongs UN8.')
print(f'{(df_new["UN9"].sum()) / total_amount *100} percent of the total amount belongs UN9.')
print(f'{df_new["unspecified"].sum() / total_amount *100} percent of the total amount is unspecified.')

2.2850788773185777 percent of total amount is multiple
0.15751210434176735 percent of the total amount belongs UN1.
1.9519335402276246 percent of the total amount belongs UN3.
1.7667412981062773 percent of the total amount belongs UN4.1.
0.1044700110039856 percent of the total amount belongs UN4.2.
0.7825004992082472 percent of the total amount belongs UN4.3.
0.0904507944169156 percent of the total amount belongs UN5.1.
0.005156365279592412 percent of the total amount belongs UN5.2.
2.6801735314085673 percent of the total amount belongs UN6.1.
0.40651103679667727 percent of the total amount belongs UN6.2.
3.464927972355394 percent of the total amount belongs UN8.
20.498326640049562 percent of the total amount belongs UN9.
65.8062173294868 percent of the total amount is unspecified.


In [142]:
print(f"{len(df_new[(df_new.iloc[:,3:14] == 0).all(axis=1)])} of {len(df_new)} rows have an amount of zero everywhere but unspecified or multiple")
print(f"{len(df_new[(df_new.iloc[:,3:15] == 0).all(axis=1)])} of {len(df_new)} rows have an amount of zero everywhere but unspecified")
print(f"{len(df_new[(df_new.iloc[:,14:] == 0).all(axis=1)])} of {len(df_new)} rows have no amount on unspecified and multiple")
print(f"{len(df_new[(df_new.iloc[:,14:15] != 0).all(axis=1)])} of {len(df_new)} rows have an amount on unspecified")
print(f"{len(df_new[(df_new.iloc[:,15:16] != 0).all(axis=1)])} of {len(df_new)} rows have an amount on multiple")

3198 of 8119 rows have an amount of zero everywhere but unspecified or multiple
525 of 8119 rows have an amount of zero everywhere but unspecified
2776 of 8119 rows have no amount on unspecified and multiple
4237 of 8119 rows have an amount on unspecified
1649 of 8119 rows have an amount on multiple


In [143]:
def func(x):
    if not pd.isna(x):
        return int(x)
    else:
        return np.nan

df_new["year"] = df_new.apply(lambda x: func(x["year"]), axis=1)
df_new

Unnamed: 0,origin,destination,year,UN1,UN3,UN4.1,UN4.2,UN4.3,UN5.1,UN5.2,UN6.1,UN6.2,UN8,UN9,unspecified,multiple
0,ad,es,2002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7777.400,0.00
1,ad,es,2003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,486.800,0.00
2,ad,es,2004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,425.980,0.00
3,ad,es,2005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,621.950,0.00
4,ad,es,2006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1044.092,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8114,za,se,2015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000.0,0.000,0.00
8115,za,sg,2010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,720.0,0.000,0.00
8116,za,tr,2018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,21.24
8117,zm,fi,2002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,235.000,0.00


In [144]:
df_new[df_new["origin"]=="tj"]

Unnamed: 0,origin,destination,year,UN1,UN3,UN4.1,UN4.2,UN4.3,UN5.1,UN5.2,UN6.1,UN6.2,UN8,UN9,unspecified,multiple


In [145]:
df_new.to_csv("../output/processed/flows.csv")

In [146]:
# # Source: https://towardsdatascience.com/dealing-with-list-values-in-pandas-dataframes-a177e534f173
# def boolean_df(item_lists, unique_items):# Create empty dict
#     bool_dict = {}
#
#     # Loop through all the tags
#     for i, item in enumerate(unique_items):
#         # Apply boolean mask
#         bool_dict[item] = item_lists.apply(lambda x: item in x)
#
#     # Return the results as a dataframe
#     return pd.DataFrame(bool_dict)

In [147]:
# df["annex_3"].notna()

In [148]:
# df_bool_h = boolean_df(df[df["annex_3"].notna()]["annex_3"], unique_items_a_3)
# df_bool_d = boolean_df(df[df['annex_4_a'].notna()]["annex_4_a"], unique_items_a_4_a)
# df_bool_r = boolean_df(df[df["annex_4_b"].notna()]["annex_4_b"], unique_items_a_4_b)
# df_bool_un = boolean_df(df[df["UN_code"].notna()]["UN_code"], unique_items_un)

In [149]:
# df_bool_h

In [150]:
# df_bool_un

In [151]:
# # only using un code for now
# df = pd.concat([df[["country", "country_of_destination", "year", "amount"]], df_bool_un], axis=1)
# df.reset_index(drop=True, inplace=True)
# # df[df.columns.intersection([*unique_items_a_3, *unique_items_a_4_a, *unique_items_a_4_b])] = df[df.columns.intersection([*unique_items_a_3, *unique_items_a_4_a, *unique_items_a_4_b])].fillna(np.nan)

## Geocoding

In [152]:
# load_dotenv()
# TOKEN=os.getenv("MAPBOX_TOKEN")

In [153]:
# headers = {'Accept': 'application/json'}
# dct = {}
# countries = list(df.country.unique())
# for i in countries:
#     url = f"https://api.mapbox.com/geocoding/v5/mapbox.places/{i}.json?&types=country&access_token={TOKEN}"
#     r = requests.get(url)
#     jason = r.json()
#     dct[i] = jason["features"][0]["center"]


In [154]:
# def func(x):
#     lat = dct[x][0]
#     lon = dct[x][1]
#     return lat,lon
#
#
# df["origin_lat"],df["origin_lon"] = df.apply(lambda x: func(x["country"]), axis=1)
# df["destination_lat"],df["destination_lon"] = df.apply(lambda x: func(x["country_of_destination"]), axis=1)

In [155]:
# for i in countries:
#     df.loc[df["country"] == i, "lat_origin"]=dct[i][1]
#     df.loc[df["country"] == i, "lon_origin"]=dct[i][0]
#     df.loc[df["country_of_destination"] == i, "lat_destination"]=dct[i][1]
#     df.loc[df["country_of_destination"] == i, "lon_destination"]=dct[i][0]

## Renaming columns

In [156]:
# df

In [157]:
# df.to_csv("../output/processed/clean.csv", index=False)