In [1]:
import numpy as np
import pandas as pd
import os
import xml.etree.ElementTree as ET

# Directory holding one XML file per trade agreement.
xml_dir = os.path.join('tota-master', 'xml')

# get list of file names
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and the order of agreements sharing a signing date) stable between runs.
files = [os.path.join(xml_dir, path) for path in sorted(os.listdir(xml_dir))]

columns = ['Name', 'Type', 'WTO ID', 'Identifier', 'Date signed', 'Date into force', 'Date of notification', 'End of implementation', 'Date inactive', 'Original parties', 'Current parties', 'Composition', 'Region', 'All WTO', 'Cross Regional', 'Language', 'Text', 'Articles']

# Output column -> tag of the metadata child element whose text fills it.
scalar_tags = {
    'Name': 'name',
    'Type': 'type',
    'WTO ID': 'wto_rta_id',
    'Identifier': 'treaty_identifier',
    'Date signed': 'date_signed',
    'Date into force': 'date_into_force',
    'Date of notification': 'date_notification',
    'End of implementation': 'end_implementation',
    'Date inactive': 'date_inactive',
    'Composition': 'composition',
    'Region': 'region',
    'All WTO': 'parties_wto',
    'Cross Regional': 'crossregional',
    'Language': 'language',
}

# populate data frame
# Records are collected in a list and the frame is built once at the end;
# enlarging a DataFrame one row at a time re-allocates it on every iteration.
records = []
for file in files:
    root = ET.parse(file).getroot()
    # The first child of the root is read as metadata, the second as the treaty body.
    meta = root[0]
    body = root[1]

    # A missing metadata element still raises AttributeError here, as before.
    record = {column: meta.find(tag).text for column, tag in scalar_tags.items()}
    record['Original parties'] = [party.text for party in meta.findall('./parties_original/partyisocode')]
    record['Current parties'] = [party.text for party in meta.findall('./parties/partyisocode')]

    # Element.text is None for an element without text; use '' so the join
    # below cannot fail on such an article.
    # NOTE(review): chapter/article attributes (presumably `name`,
    # `chapter_identifier`, `article_identifier` -- verify against the XML)
    # are not extracted.
    articles = [article.text or '' for article in body.findall('.//chapter/article')]
    record['Articles'] = articles
    record['Text'] = ''.join(articles)
    records.append(record)

# `columns` fixes the column order and keeps the frame well-formed when no files were found.
df = pd.DataFrame(records, columns=columns)

# sort data frame chronologically
# NOTE(review): 'Date signed' holds the raw XML strings, so this is a
# lexicographic sort; it is chronological only if the dates are in a sortable
# (e.g. ISO 8601) format -- confirm against the data.
df = df.sort_values(by='Date signed')

In [2]:
# Keep only the agreements that are both written in English and bilateral.
df_new = df[df["Language"].eq("en") & df["Composition"].eq("Bilateral")]

In [3]:
# Index the frame by the treaty identifier (as used on GitHub), stored as an integer.
df_new = df_new.astype({"Identifier": "int"}).set_index("Identifier")

In [4]:
# Print every row and column so anomalies can be spotted by eye.
# The display limits are lifted for this block only and restored afterwards.
with pd.option_context('display.max_columns', None, 'display.max_rows', None):
    print(df_new)

                                                         Name  \
Identifier                                                      
310            South Africa - Southern Rhodesia Customs Union   
254                   El Salvador - Nicaragua Free Trade Area   
266                       Ghana - Upper Volta Trade Agreement   
174              Australia - New Zealand Free Trade Agreement   
274                  Ireland - United Kingdom Free Trade Area   
126                     Australia - Papua New Guinea (PATCRA)   
122                        Australia - New Zealand (ANZCERTA)   
121                                               US - Israel   
186                 Canada - US Free Trade Agreement (CUSFTA)   
120               Lao People's Democratic Republic - Thailand   
262                                Finland - Estonia Protocol   
277                      Latvia - Sweden Free Trade Agreement   
280                   Lithuania - Sweden Free Trade Agreement   
259                     E

In [5]:
# Remove agreements 286 and 295: each involves the former country of
# Serbia and Montenegro, which was later split.
df_new = df_new.drop(index=[286, 295])

In [6]:
# Split the original-parties list into one column per party:
# the first entry becomes "Party 1", the second "Party 2".
df_new = df_new.assign(**{
    "Party 1": df_new["Original parties"].str.get(0),
    "Party 2": df_new["Original parties"].str.get(1),
})

In [7]:
# The party lists are no longer needed once Party 1 / Party 2 exist.
df_new = df_new.drop(columns=["Original parties", "Current parties"])

In [8]:
# Number of articles per agreement = length of its article list.
df_new = df_new.assign(**{'Total articles': df_new['Articles'].str.len()})

In [9]:
# generate variable for total words
# Count whitespace-separated tokens in each article and sum them per agreement.
# (The length of the joined article text would be a character count, not a
# word count.) Articles without text (None) contribute zero words.
df_new['Total words'] = df_new['Articles'].apply(
    lambda texts: sum(len(str(text).split()) for text in texts if text is not None)
)

In [10]:
# generate variable for number of unique words
# Collect the distinct whitespace-separated tokens across all articles of an
# agreement (case-sensitive) and count them.
# Working per row avoids three distortions of a comma-join followed by
# `.str.split(' ', expand=True)`:
#   * the expanded frame is padded with None up to the longest agreement, and
#     that None is counted as an extra "word" for every shorter agreement;
#   * the join fuses the last word of one article with the first of the next;
#   * splitting on a single space counts empty strings and ignores newlines/tabs.
# It also avoids materialising a frame as wide as the longest agreement.
df_new['Number of unique words'] = df_new['Articles'].apply(
    lambda texts: len({
        word
        for text in texts
        if text is not None
        for word in str(text).split()
    })
)

In [11]:
# Write the cleaned agreement table (index included) to disk.
df_new.to_csv(path_or_buf="fta_clean.csv")

In [12]:
# One row per article: explode the per-agreement article lists (the agreement
# identifier index is repeated for each article), then write the result to disk.
articles = df_new.loc[:, ['Articles']].explode(column='Articles')
articles.to_csv(path_or_buf='articles.csv')