# Webscraping the 2022 UN General Debates

This project was inspired by the paper by Kohei Watanabe and Yuan Zhou, "Theory-Driven Analysis of Large Corpora: Semisupervised Topic Classification of the UN Speeches," and a project I worked on in "Text as Data" (POLI 176), a course taught by Molly Roberts, at UCSD.  

Watanabe and Zhou's research only covers speeches up to 2017, but I wanted to see how well I could build a classifier that could make predictions on future speeches. When I started this project, the 2022 UN General Debate speeches were only available in PDF format on the UN General Debate website, and I could not find the text data readily available in .csv format. So, I created a web scraper that can also be reused for future speeches. 


In [1]:
import pandas as pd

#importing the requests library to fetch web pages
import requests 
from bs4 import BeautifulSoup
import io

#for converting pdfs to txt
from PyPDF2 import PdfReader

from time import sleep

#helpful tool to see estimated scrape time
from tqdm import tqdm

In [None]:
#listing pages to scrape: the landing page plus one "listbydate" page per date
pages = [
    'https://gadebate.un.org/en',
    'https://gadebate.un.org/en/listbydate/2022-09-21',
    'https://gadebate.un.org/en/listbydate/2022-09-22',
    'https://gadebate.un.org/en/listbydate/2022-09-23',
    'https://gadebate.un.org/en/listbydate/2022-09-24',
    'https://gadebate.un.org/en/listbydate/2022-09-26',
]

#accumulates the 'media-heading' tags found on every listing page
all_countries = []
#wrapping pages in tqdm to create progress bar
for page in tqdm(pages):
    #r contains the html for the url
    #the timeout keeps a stalled connection from hanging the scrape forever
    r = requests.get(page, timeout=30)
    #fail loudly on an HTTP error instead of parsing an error page and
    #silently ending up with no headings for that page
    r.raise_for_status()

    #create a BeautifulSoup object that uses an html parser
    soup = BeautifulSoup(r.content, 'html.parser')

    #extracting links for each country's page and adding it to all_countries
    #.select creates a list of all that falls under "class= 'media-heading'"
    all_countries += soup.select('.media-heading')

    #wait for 10 sec
    sleep(10)


In [None]:
#displaying the collected 'media-heading' tags to check the scrape
all_countries

In [None]:
#list of dictionaries: one per heading tag, holding the heading text and the
#absolute url built from the relative href of the heading's first <a> tag
data = [
    {
        'Country': heading.text,
        'Country Link': 'https://gadebate.un.org' + heading.select_one('a')['href'],
    }
    for heading in all_countries
]


In [None]:
#extracting pdf link from each country link

#function to filter out pdf links based on whether its text says "Read the Statement in English"
def is_english(list_of_tags):
    """Return the href of the first tag whose text mentions "English".

    list_of_tags: iterable of tag-like objects that expose a ``.text``
    attribute and support ``tag['href']`` lookup.

    Returns the href of the first matching tag, or None when no tag matches.
    A tag that cannot be inspected (no ``.text``, no ``href``) is skipped so
    that one malformed tag does not hide a valid English link further down
    the list.
    """
    for tag in list_of_tags:
        try:
            if "English" in str(tag.text):
                return tag['href']
        except (AttributeError, KeyError, TypeError):
            #not a usable tag, keep looking at the remaining ones
            continue
    return None
        
#visiting every country page to record its statement date and English pdf link
#each dictionary in data is updated in place
for entry in tqdm(data):
    #get html for each link
    #the timeout keeps a stalled connection from hanging the scrape forever
    r2 = requests.get(entry['Country Link'], timeout=30)
    #report a failed request as an HTTP error rather than as a confusing
    #AttributeError when the date tag is looked up below
    r2.raise_for_status()
    #creating BeautifulSoup object
    soup2 = BeautifulSoup(r2.content, 'html.parser')
    #get date
    entry['Date'] = soup2.select_one('.statement-date').text
    #getting all pdf links if there is more than one language available
    list_of_tags = soup2.select('a[href^="https://gadebate.un.org/sites"]')
    #filtering out links to only get English link
    entry['Pdf Link'] = is_english(list_of_tags)

In [None]:
#one row per dictionary in data, one column per dictionary key
df = pd.DataFrame(data)
df.head()

In [52]:
#dataframe function to apply to the links
def pdf_to_text(pdf_link):
    """Download the pdf at pdf_link and return its text as a single string.

    pdf_link: url of the pdf. Anything that cannot be downloaded or parsed
    (including a missing link) produces the placeholder below.

    Returns the stripped text of every page joined in page order with no
    separator, or the string "No link found" when the download or the pdf
    parsing fails for any reason.
    """
    try:
        #the timeout keeps one stalled download from blocking the whole apply
        response = requests.get(pdf_link, timeout=30)
        reader = PdfReader(io.BytesIO(response.content))
        page_texts = []

        #need to loop through each page to remove page numbers
        #reader.pages replaces getNumPages()/getPage(), which newer PyPDF2
        #releases no longer support
        for page_number, page in enumerate(reader.pages, start=1):
            text_body = page.extract_text().strip()

            #the page number often appears at the top of the page
            #only strip it when the page really starts with that exact number
            #(not followed by another digit), so a page that merely begins
            #with some other digit does not lose a number elsewhere in its text
            #not perfect because there are a lot more edge cases than can be
            #addressed when it comes to converting pdf text
            label = str(page_number)
            following = text_body[len(label):len(label) + 1]
            if text_body.startswith(label) and not following.isdigit():
                text_body = text_body[len(label):]

            page_texts.append(text_body)

        return ''.join(page_texts)
    except Exception:
        #deliberately broad: any download or parsing problem maps to the
        #placeholder, but KeyboardInterrupt still stops a long-running scrape
        return "No link found"


In [None]:
#tqdm.pandas() is called first so that progress_apply is available below
tqdm.pandas()
#running pdf_to_text on every pdf link; the result is stored in a new 'Speech' column
df['Speech'] = df['Pdf Link'].progress_apply(pdf_to_text)

In [None]:
#NOTE(review): assuming this runs as a notebook cell, only the last expression
#(df.shape) is displayed; the head() preview above it is not shown
df.head()
df.shape

In [None]:
#saving the dataframe before any cleaning
#to_csv also writes the row index as the first, unnamed column by default
df.to_csv('2022_UN_General_Debates_unedited')

## Data cleaning

In [3]:
#reloading the saved scrape
#the index written by to_csv comes back as an 'Unnamed: 0' column, which is dropped later
df = pd.read_csv('2022_UN_General_Debates_unedited')
df.head()

Unnamed: 0.1,Unnamed: 0,Country,Country Link,Date,Pdf Link,Speech
0,0,Secretary-General of the United Nations,https://gadebate.un.org/en/77/secretary-genera...,20 September 2022,https://gadebate.un.org/sites/default/files/ga...,U N I T E D N A T I O N S ...
1,1,President of the General Assembly (opening),https://gadebate.un.org/en/77/president-genera...,20 September 2022,,No link found
2,2,Brazil,https://gadebate.un.org/en/77/brazil,20 September 2022,https://gadebate.un.org/sites/default/files/ga...,Statement by the President of the Republic of ...
3,3,Senegal,https://gadebate.un.org/en/77/senegal,20 September 2022,https://gadebate.un.org/sites/default/files/ga...,\n ...
4,4,Chile,https://gadebate.un.org/en/77/chile,20 September 2022,,No link found


In [4]:
#removing '\r\n' sequences from each date string, then trimming the surrounding whitespace
df['Date'] = [raw_date.replace('\r\n', '').strip() for raw_date in df['Date']]
df.head()

Unnamed: 0.1,Unnamed: 0,Country,Country Link,Date,Pdf Link,Speech
0,0,Secretary-General of the United Nations,https://gadebate.un.org/en/77/secretary-genera...,20 September 2022,https://gadebate.un.org/sites/default/files/ga...,U N I T E D N A T I O N S ...
1,1,President of the General Assembly (opening),https://gadebate.un.org/en/77/president-genera...,20 September 2022,,No link found
2,2,Brazil,https://gadebate.un.org/en/77/brazil,20 September 2022,https://gadebate.un.org/sites/default/files/ga...,Statement by the President of the Republic of ...
3,3,Senegal,https://gadebate.un.org/en/77/senegal,20 September 2022,https://gadebate.un.org/sites/default/files/ga...,\n ...
4,4,Chile,https://gadebate.un.org/en/77/chile,20 September 2022,,No link found


In [75]:
#iso codes from: https://gist.github.com/tadast/8827699#file-countries_codes_and_coordinates-csv
iso_codes = pd.read_csv('countries_codes_and_coordinates.csv')
#only the country name and the alpha-3 code are needed
iso_codes = iso_codes[['Country', 'Alpha-3 code']]

#removing "" from ISO codes
iso_codes['Alpha-3 code'] = iso_codes['Alpha-3 code'].apply(lambda x: x.replace('"', ''))

#looking at the dataframe
iso_codes.head()

# some country names in df do not match up with iso_codes country names,
# so manually renaming them before the merge on 'Country' below
# NOTE(review): rows are addressed by index label, so these assignments rely on
# the row order of the saved scrape - confirm the labels if the scrape is re-run

df.loc[7, ('Country')] = 'Turkey'
df.loc[11, ('Country')] = 'Korea, Republic of'
df.loc[21, ('Country')] = 'Bolivia, Plurinational State of'
df.loc[28, ('Country')] = 'Congo, the Democratic Republic of the'
df.loc[36, ('Country')] = 'Iran, Islamic Republic of'
df.loc[41, ('Country')] = 'United States'
df.loc[47, ('Country')] = 'Moldova, Republic of'
df.loc[54, ('Country')] = "Côte d'Ivoire"
df.loc[58, ('Country')] = 'Cape Verde'
df.loc[59, ('Country')] = 'Swaziland'
df.loc[71, ('Country')] = 'United Kingdom'
df.loc[74, ('Country')] = 'Gambia'
df.loc[90, ('Country')] = 'Micronesia, Federated States of'
df.loc[92, ('Country')] = 'Tanzania, United Republic of'

#merging df and iso_codes
#the outer merge keeps unmatched rows from both sides; indicator=True adds the '_merge' column
df_merged = df.merge(iso_codes, on='Country', indicator = True, how = 'outer')

#filtering out countries in iso_codes that did not match with countries in df
df_merged = df_merged[df_merged['_merge'] != 'right_only']

#10 more entries missing ISO codes
df_merged[df_merged['Alpha-3 code'].isnull()].shape

#filling in codes by hand for rows that are still missing one
# NOTE(review): these labels refer to the index of the merged dataframe, so they
# also depend on the row order - confirm if the inputs change
# NOTE(review): 'EU' has two letters, unlike the other codes - confirm this is intended
df_merged.loc[110, ('Alpha-3 code')] = 'PSE'
df_merged.loc[111, ('Alpha-3 code')] = 'EU'
df_merged.loc[145, ('Alpha-3 code')] = 'MKD'
df_merged.loc[149, ('Alpha-3 code')] = 'VAT'
df_merged.loc[152, ('Alpha-3 code')] = 'LAO'
df_merged.loc[168, ('Alpha-3 code')] = 'VEN'
df_merged.loc[189, ('Alpha-3 code')] = 'PRK'

# only ones left are speeches by the UN Secretary-General and President of the General Assembly,
# which will be excluded from the final dataset because they are not speeches by countries
df_merged[df_merged['Alpha-3 code'].isnull()]['Country'].values
#dropping rows that have no ISO code, no pdf link or no speech
df_merged = df_merged.dropna(subset = ['Alpha-3 code', 'Pdf Link', 'Speech'])

#adding year (stored as a string)
df_merged['year'] = '2022'

#There are now 122 speeches. Some speeches were images or in a format that PyPDF2 was unable to read
df_merged.shape[0]
df_merged

#creating speech index in the form: ISO_SessionNumber_Year

def speech_index(x, session=77, year=2022):
    """Build a speech id of the form ISO_SessionNumber_Year.

    x: ISO code of the speaker's country (string).
    session: General Assembly session number, 77 by default.
    year: year of the debate, 2022 by default.

    Returns the id as a string, e.g. 'BRA_77_2022'. The defaults reproduce the
    ids of the 2022 debate, so other sessions only need the keyword arguments.
    """
    return f'{x}_{session}_{year}'

#doc_id has the form ISO_SessionNumber_Year
#built with speech_index rather than a lambda that repeats the same logic
df_merged['doc_id'] = df_merged['Alpha-3 code'].apply(speech_index)

df_merged = df_merged.set_index('doc_id')

#dropping every column that is not part of the final dataset
#errors='ignore' lets the drop work whether or not the saved index columns
#('Unnamed: 0', and 'Unnamed: 0.1' if the file was saved and reloaded more than once) are present
df_merged = df_merged.drop(
    ['Unnamed: 0.1', 'Unnamed: 0', 'Country', 'Country Link', 'Pdf Link', 'Date', '_merge'],
    axis = 1,
    errors = 'ignore',
)

#renaming ISO column
df_merged = df_merged.rename(columns = {'Alpha-3 code':'country', 'Speech': 'speech'})

df_merged

#saving the cleaned dataset; doc_id is the index, so it is written as the first column
df_merged.to_csv('2022_UN_General_Debates')
