In [1]:
import pandas as pd

# Load the cleaned agreement table and the cleaned bilateral trade table.
text = pd.read_csv('fta_clean.csv')
data = pd.read_csv('trade_clean.csv')

# Trim the agreement table:
#   * row label 138 is removed while the default integer index is still in
#     place (i.e. before 'Identifier' becomes the index).
#     NOTE(review): the reason this row is excluded is not visible here - document it.
#   * 'Identifier' becomes the index.
#   * the 'Text' and 'Articles' columns are not used further on.
text = (
    text
    .drop(index=138)
    .set_index('Identifier')
    .drop(columns=['Text', 'Articles'])
)

In [2]:
# Align the trade table's key columns with the agreement table's labels
# ('Year', 'Party 1', 'Party 2'), then discard every column that is not
# used later in this notebook.
data = (
    data
    .rename(columns={'year': 'Year', 'iso3_o': 'Party 1', 'iso3_d': 'Party 2'})
    .drop(columns=[
        'iso3num_o', 'iso3num_d',
        'country_exists_o', 'country_exists_d',
        'gmt_offset_2020_o', 'gmt_offset_2020_d',
        'contig', 'dist', 'distw', 'distcap', 'distwces', 'dist_source',
        'comlang_off', 'comlang_ethno', 'comcol', 'comrelig', 'col45',
        'legal_old_o', 'legal_old_d', 'legal_new_o', 'legal_new_d',
        'comleg_pretrans', 'comleg_posttrans', 'transition_legalchange',
        'heg_o', 'heg_d',
        'col_dep_ever', 'col_dep', 'col_dep_end_year', 'col_dep_end_conflict',
        'empire', 'sibling_ever', 'sibling', 'sever_year', 'sib_conflict',
        'pop_o', 'pop_d', 'gdp_o', 'gdp_d', 'gdpcap_o', 'gdpcap_d',
        'pop_source_o', 'pop_source_d', 'gdp_source_o', 'gdp_source_d',
        'gdpcap_ppp_o', 'gdpcap_ppp_d', 'pop_pwt_o', 'pop_pwt_d',
        'gdp_ppp_pwt_o', 'gdp_ppp_pwt_d',
        'gatt_o', 'gatt_d', 'wto_o', 'wto_d', 'eu_o', 'eu_d',
        'rta_coverage', 'rta_type',
        'entry_cost_o', 'entry_cost_d', 'entry_proc_o', 'entry_proc_d',
        'entry_time_o', 'entry_time_d', 'entry_tp_o', 'entry_tp_d',
        'tradeflow_comtrade_o', 'tradeflow_comtrade_d',
        'tradeflow_imf_o', 'tradeflow_imf_d',
    ])
)

In [3]:
# Reduce the 'Date into force' / 'Date inactive' columns to calendar years.
# Agreements with no 'Date inactive' value receive the placeholder date
# 2021-01-01 before the conversion.
#
# NOTE: this cell overwrites its own inputs, so it is meant to run once per
# kernel session. On a second run the columns already hold integer years,
# which pd.to_datetime would interpret as epoch offsets rather than years.
text['Date inactive'] = text['Date inactive'].fillna('2021-01-01')

for date_col in ('Date into force', 'Date inactive'):
    text[date_col] = pd.to_datetime(text[date_col]).dt.year

In [4]:
# Remove agreement metadata columns that are not used as features.
text = text.drop(
    columns=[
        'Date signed',
        'Date of notification',
        'End of implementation',
        'Composition',
    ]
)

In [5]:
# Load the supervised (machine-labelled) and unsupervised results, both
# indexed by the agreement 'Identifier'.
supervised = pd.read_csv('../Data Labelling/machine_labelled.csv', index_col='Identifier')
unsupervised = pd.read_csv('unsupervised.csv', index_col='Identifier')

# Identifier 171 is removed from both tables; the supervised table also loses
# its 'Articles' column.
# NOTE(review): the reason Identifier 171 is excluded is not visible here - document it.
supervised = supervised.drop(index=171, columns='Articles')
unsupervised = unsupervised.drop(index=171)

In [6]:
# Merge all generated features side by side (aligned on the shared index) and
# expand every agreement into one row per year it was in force.

features = pd.concat([text, supervised, unsupervised], axis=1)

# Each row gets the list of years from 'Date into force' up to, but not
# including, 'Date inactive'; explode() then turns that list into one row per year.
features['Year'] = features.apply(
    lambda row: list(range(row['Date into force'], row['Date inactive'])),
    axis=1,
)
features = features.explode('Year')

In [7]:
# Join the per-year agreement features onto the trade rows. merge() defaults to
# an inner join, so only (Year, Party 1, Party 2) combinations present in both
# tables survive.
# NOTE(review): the join is directional - 'Party 1' is matched against the
# renamed origin column and 'Party 2' against the renamed destination column.
# Trade rows recorded in the opposite direction are not matched; confirm that
# this is intended.
merged = features.merge(data, on=['Year', 'Party 1', 'Party 2'])

# Build the (Party 1, Party 2) tuples column-wise. This yields the same tuples
# as a row-wise apply(axis=1) but avoids constructing a Series per row, and it
# also works when the merge result is empty (apply would return a DataFrame
# there and the column assignment would fail).
merged['Pair'] = list(zip(merged['Party 1'], merged['Party 2']))

# Relative GDP gap of the pair: |gdp_o - gdp_d| / (gdp_o + gdp_d).
merged['Relative GDP'] = (
    (merged['gdp_ppp_o'] - merged['gdp_ppp_d']).abs()
    / (merged['gdp_ppp_o'] + merged['gdp_ppp_d'])
)

# The raw inputs of the derived columns are no longer needed.
merged = merged.drop(
    columns=['Date into force', 'Date inactive', 'Party 1', 'Party 2', 'gdp_ppp_o', 'gdp_ppp_d']
)

In [8]:
# Write the merged table to 'merged.csv'; index=False leaves the row index out of the file.
merged.to_csv('merged.csv', index=False)