In [15]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import glob
from requests import RequestException
import csv
import re
import json
import copy
import ast
import operator
import math
import time
import zipfile as zf
from collections import Counter

# Load trade data from OrganizeTrade2

In [16]:
# Read the organized trade table and store `year` as an integer column.
trade = (
    pd.read_csv('../Data/3_Trade/improvedmapping_organized_trade.csv')
    .astype({'year': int})
)
trade

Unnamed: 0,year,importer,product_name,HS_code,quantity_sum,mean_emissions
0,2002,36,Additives/Blending Components,350400,8955.079,3.278981e+11
1,2002,36,Additives/Blending Components,381111,221.165,2.285408e+10
2,2002,36,Additives/Blending Components,381119,475.349,6.512156e+10
3,2002,36,Additives/Blending Components,381121,23242.566,9.728533e+11
4,2002,36,Additives/Blending Components,381129,1300.913,4.433476e+10
...,...,...,...,...,...,...
1534387,2010,876,products of Vegetable oils and fats,150890,0.400,1.845110e+08
1534388,2010,876,products of Vegetable oils and fats,150910,16.900,7.795588e+09
1534389,2010,876,products of Vegetable oils and fats,150990,0.400,1.845110e+08
1534390,2010,876,products of Vegetable oils and fats,151219,35.500,1.637535e+10


In [17]:
# Translate the importer codes in `trade` into 3-letter ISO codes, using the
# country_code -> iso_3digit_alpha lookup from the country-codes table.
country_nos = pd.read_csv('../Data/3_Trade/country_codes_V202102.csv', encoding='latin-1')
countrynumber_dict = pd.Series(country_nos.iso_3digit_alpha.values,index=country_nos.country_code).to_dict()

# Only map while the column still holds numeric codes. Series.map turns every
# value that is not a key of the dict into NaN, so re-running this cell on the
# already-mapped ISO strings would otherwise wipe the whole column.
if pd.api.types.is_numeric_dtype(trade['importer']):
    trade['importer'] = trade['importer'].map(countrynumber_dict)
trade

Unnamed: 0,year,importer,product_name,HS_code,quantity_sum,mean_emissions
0,2002,AUS,Additives/Blending Components,350400,8955.079,3.278981e+11
1,2002,AUS,Additives/Blending Components,381111,221.165,2.285408e+10
2,2002,AUS,Additives/Blending Components,381119,475.349,6.512156e+10
3,2002,AUS,Additives/Blending Components,381121,23242.566,9.728533e+11
4,2002,AUS,Additives/Blending Components,381129,1300.913,4.433476e+10
...,...,...,...,...,...,...
1534387,2010,WLF,products of Vegetable oils and fats,150890,0.400,1.845110e+08
1534388,2010,WLF,products of Vegetable oils and fats,150910,16.900,7.795588e+09
1534389,2010,WLF,products of Vegetable oils and fats,150990,0.400,1.845110e+08
1534390,2010,WLF,products of Vegetable oils and fats,151219,35.500,1.637535e+10


In [18]:
# Count the distinct importers present in the trade data (missing values excluded).
len(trade['importer'].dropna().unique())

41

# Merge IQ

In [20]:
# Load the IQ table: one row per country (wbcode) and year, with the
# legal/political/economic scores and a cluster label.
iq_path = '../Data/IQs_of_only_clustered_countries.csv'
iq = pd.read_csv(iq_path, encoding='latin-1')
iq

Unnamed: 0,wbcode,country,year,legal_abs,political_abs,economic_abs,cluster
0,AGO,Angola,2002,0.412211,0.281626,0.393893,1.0
1,AGO,Angola,2003,0.428805,0.297034,0.430300,1.0
2,AGO,Angola,2004,0.425183,0.285338,0.361373,1.0
3,AGO,Angola,2005,0.387639,0.296077,0.311552,1.0
4,AGO,Angola,2006,0.386354,0.321154,0.384535,1.0
...,...,...,...,...,...,...,...
1120,ZWE,Zimbabwe,2006,0.250514,0.253429,0.178055,1.0
1121,ZWE,Zimbabwe,2007,0.248133,0.241630,0.176746,1.0
1122,ZWE,Zimbabwe,2008,0.258128,0.225858,0.185948,1.0
1123,ZWE,Zimbabwe,2009,0.269821,0.298969,0.229839,1.0


In [21]:
# Attach the IQ columns to every trade row by matching (year, importer) against
# (year, wbcode). The left join keeps trade rows that have no IQ match; their IQ
# columns are left as NaN.
# NOTE(review): if `iq` is expected to have one row per (year, wbcode), passing
# validate='many_to_one' would make pandas raise instead of silently duplicating
# trade rows -- confirm the key is unique before enabling it.
merged_df = trade.merge(
    iq,
    how='left',
    left_on=['year', 'importer'],
    right_on=['year', 'wbcode'],
)

In [22]:
# Inspect the merged frame. Rows whose importer has no entry in `iq` for that year
# keep their trade columns but show missing values in the IQ columns
# (wbcode, country, legal_abs, political_abs, economic_abs, cluster).
merged_df

Unnamed: 0,year,importer,product_name,HS_code,quantity_sum,mean_emissions,wbcode,country,legal_abs,political_abs,economic_abs,cluster
0,2002,AUS,Additives/Blending Components,350400,8955.079,3.278981e+11,AUS,Australia,0.935851,0.864046,0.78082,5.0
1,2002,AUS,Additives/Blending Components,381111,221.165,2.285408e+10,AUS,Australia,0.935851,0.864046,0.78082,5.0
2,2002,AUS,Additives/Blending Components,381119,475.349,6.512156e+10,AUS,Australia,0.935851,0.864046,0.78082,5.0
3,2002,AUS,Additives/Blending Components,381121,23242.566,9.728533e+11,AUS,Australia,0.935851,0.864046,0.78082,5.0
4,2002,AUS,Additives/Blending Components,381129,1300.913,4.433476e+10,AUS,Australia,0.935851,0.864046,0.78082,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1534387,2010,WLF,products of Vegetable oils and fats,150890,0.400,1.845110e+08,,,,,,
1534388,2010,WLF,products of Vegetable oils and fats,150910,16.900,7.795588e+09,,,,,,
1534389,2010,WLF,products of Vegetable oils and fats,150990,0.400,1.845110e+08,,,,,,
1534390,2010,WLF,products of Vegetable oils and fats,151219,35.500,1.637535e+10,,,,,,


In [23]:
# Keep only the trade rows that received a cluster label from the IQ merge.
df_cluster = merged_df.dropna(subset=['cluster'])
df_cluster

Unnamed: 0,year,importer,product_name,HS_code,quantity_sum,mean_emissions,wbcode,country,legal_abs,political_abs,economic_abs,cluster
0,2002,AUS,Additives/Blending Components,350400,8955.079,3.278981e+11,AUS,Australia,0.935851,0.864046,0.780820,5.0
1,2002,AUS,Additives/Blending Components,381111,221.165,2.285408e+10,AUS,Australia,0.935851,0.864046,0.780820,5.0
2,2002,AUS,Additives/Blending Components,381119,475.349,6.512156e+10,AUS,Australia,0.935851,0.864046,0.780820,5.0
3,2002,AUS,Additives/Blending Components,381121,23242.566,9.728533e+11,AUS,Australia,0.935851,0.864046,0.780820,5.0
4,2002,AUS,Additives/Blending Components,381129,1300.913,4.433476e+10,AUS,Australia,0.935851,0.864046,0.780820,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1533926,2010,USA,products of Vegetable oils and fats,230620,5849.003,8.988679e+11,USA,United States,0.815115,0.754123,0.776533,5.0
1533927,2010,USA,products of Vegetable oils and fats,230630,19.950,1.553387e+10,USA,United States,0.815115,0.754123,0.776533,5.0
1533928,2010,USA,products of Vegetable oils and fats,230650,102.087,2.773782e+09,USA,United States,0.815115,0.754123,0.776533,5.0
1533929,2010,USA,products of Vegetable oils and fats,230660,1.780,1.337787e+08,USA,United States,0.815115,0.754123,0.776533,5.0


In [24]:
# Count the distinct importers that have a cluster label.
len(df_cluster['importer'].value_counts())

39

### Save the merged data to final.csv

In [27]:
# Write the full merged table to disk. This is `merged_df`, so rows without a
# cluster label are included; `df_cluster` is not what gets saved here.
final_csv_path = '../Data/final.csv'
merged_df.to_csv(final_csv_path)

### Extra: summary statistics for my own reference

In [28]:
# Summary statistics of the emissions column across all merged rows.
# NOTE(review): the recorded run shows a negative minimum (about -1.4e16) --
# confirm whether negative emission values are expected in this data.
emissions = merged_df['mean_emissions']
emissions.describe()

count    1.534392e+06
mean     1.692681e+11
std      2.018506e+13
min     -1.416532e+16
25%      8.289642e+05
50%      2.592104e+08
75%      4.546139e+09
max      9.431901e+15
Name: mean_emissions, dtype: float64

In [29]:
# Preview the first five rows of the merged table.
merged_df.iloc[:5]

Unnamed: 0,year,importer,product_name,HS_code,quantity_sum,mean_emissions,wbcode,country,legal_abs,political_abs,economic_abs,cluster
0,2002,AUS,Additives/Blending Components,350400,8955.079,327898100000.0,AUS,Australia,0.935851,0.864046,0.78082,5.0
1,2002,AUS,Additives/Blending Components,381111,221.165,22854080000.0,AUS,Australia,0.935851,0.864046,0.78082,5.0
2,2002,AUS,Additives/Blending Components,381119,475.349,65121560000.0,AUS,Australia,0.935851,0.864046,0.78082,5.0
3,2002,AUS,Additives/Blending Components,381121,23242.566,972853300000.0,AUS,Australia,0.935851,0.864046,0.78082,5.0
4,2002,AUS,Additives/Blending Components,381129,1300.913,44334760000.0,AUS,Australia,0.935851,0.864046,0.78082,5.0


In [30]:
# Count the distinct traded products, identified by their HS code.
len(merged_df['HS_code'].dropna().unique())

4519