In [1]:
import os
import numpy as np
import pandas as pd


# UN Example Links

- [Baturo 2017 paper](https://arxiv.org/pdf/1708.05873.pdf)
- [Medium.com - UN EDA](https://medium.com/@anushkocharyan/nlp-analysis-of-50-years-of-united-nations-general-debate-speeches-61dc3bed3c11)
- [Github - Medium.com UN EDA](https://github.com/anushkocharyan/NLP_Analysis_on_UN_speeches)
- [Kaggle - UN India EDA](https://www.kaggle.com/someadityamandal/analysis-of-india-at-un-debates)
- [Github - Random Guy](https://github.com/nicolasdz/UNGDC/blob/main/UNGDC%204%2C0.ipynb)

# NLP Technical Links
- [Medium.com - Text preprocessing](https://medium.com/@datamonsters/text-preprocessing-in-python-steps-tools-and-examples-bf025f872908)
- [Stemming-Lemmatization](https://www.guru99.com/stemming-lemmatization-python-nltk.html)



# Import Data

In [2]:
# import country codes
df_code = pd.read_csv('UNSD — Methodology.csv')


In [3]:
# import speeches

sessions = np.arange(25, 76)
data=[]

for session in sessions:
    directory = "./TXT/Session "+str(session)+" - "+str(1945+session)
    
    for filename in os.listdir(directory):
        f = open(os.path.join(directory, filename))
        
        if filename[0]==".": #ignore hidden files
            continue
        splt = filename.split("_")
        data.append([session, 1945+session, splt[0], f.read()])

df_speech = pd.DataFrame(data, columns=['Session','Year','ISO-alpha3 Code','Speech'])

In [4]:
# import happiness index

df_happiness = pd.read_excel('DataPanelWHR2021C2.xls')
df_happiness.rename(columns={'year':'Year'}, inplace=True) # for merge later


# Inspect Data

df_codes and df_speech merged on **ISO-alpha3 Code**

**What alpha3 codes are found in df_speech but NOT df_code?**

- EU = European Union (since 2011)
- DDR = German Democratic Republic --> now Germany
- POR = incorrect ~ Portugal (checked speech)
- YDYE = The Democratic Yemen --> now Yemen
- CSK = Czechoslovakia --> divided into Czechia (CZE), and Slovakia (SVK)
- YUG = Yugoslavia --> divided into Bosnia and Herzegovina (BA), Croatia (HR), the former Yugoslav Republic of Macedonia (MK), Serbia and Montenegro (CS), Slovenia (SI)

**Actions:**
- DDR to DEU (Germany)
- POR to PRT (Portugal)
- YDYE to YEM (Yemen)

Unsure what to do with EU, CSK, and YUG

In [5]:
# # alpha3 codes: in df_speech but not in df_code
# boolean = df_speech['ISO-alpha3 Code'].isin(df_code['ISO-alpha3 Code'])
# alpha3_wo_code = df_speech[~boolean]['ISO-alpha3 Code'].unique()

alpha_replace = {'POR':'PRT', 
                 'YDYE':'YEM',
                 'DDR':'DEU'}

df_speech['ISO-alpha3 Code'].replace(alpha_replace, inplace=True) # update to pair with df_code


df_merge and df_happiness merged on **Country or Area -- Country name**

**What Country Names are found in df_happiness but NOT df_code?**
- df_happiness doesn't incldue all countries
- Some countries in df_happiness aren't part of UN 

**Actions:**
- see replacement dictionary

In [6]:
# # country names: in df_happiness but not in df_code
# boolean = df_happiness['Country name'].isin(df_code['Country or Area'])
# country_wo_happiness = df_happiness[~boolean]['Country name'].unique()

country_replace = {'Bolivia':'Bolivia (Plurinational State of)',
                   'Congo (Brazzaville)':'Congo',
                   'Congo (Kinshasa)':'Democratic Republic of the Congo',
                   'Czech Republic':'Czechia',
                   'Hong Kong S.A.R. of China':'China - Hong Kong Special Administrative Region',
                   'Iran':'Iran (Islamic Republic of)',
                   'Ivory Coast':'Côte d’Ivoire',
                   'Laos':"Lao People's Democratic Republic",
                   'Moldova':'Republic of Moldova',
                   'North Cyprus':'Cyprus',
                   'Palestinian Territories':'State of Palestine',
                   'Russia':'Russian Federation',
                   'Somaliland region':'Somalia',
                   'South Korea':'Republic of Korea',
                   'Swaziland':'Eswatini',
                   'Syria':'Syrian Arab Republic',
                   'Tanzania':'United Republic of Tanzania',
                   'United Kingdom':'United Kingdom of Great Britain and Northern Ireland',
                   'United States':'United States of America',
                   'Venezuela':'Venezuela (Bolivarian Republic of)',
                   'Vietnam':'Viet Nam'}

df_happiness.replace(country_replace, inplace=True) # update to pair with df_code


In [7]:
# merge three datasets
df_merge = pd.merge(df_code, df_speech, 
                    how='right', on='ISO-alpha3 Code')

df = pd.merge(df_merge, df_happiness,
              how='left',
              left_on=['Country or Area','Year'],
              right_on=['Country name','Year']).drop('Country name', axis=1) # drop Country name (redundant)

# Process Speeches

In [40]:
%%time

from nltk.corpus import stopwords
from nltk import word_tokenize
import re

def preprocess(s):
    '''
    input: s, speech string
    output: tokenized list (wo stop words)
    '''
    sw = stopwords.words("english") # stop words
    
    s = s.lower() # normalize lower case
    s = re.sub('united nations','united_nations',s) # combine to avoid tokenization splitting
    s = re.sub('\s+',' ',s) # remove tabs and new lines
    s = re.sub(r'[^a-zA-Z_ ]', '', s) # remove punctuation/non-alpha
    s = re.sub(r'  +',' ', s) # remove 2+ spaces
    s = s.strip() # remove leading-trailing spaces
    
    words = word_tokenize(s) # tokenize processed string
    words = [word for word in words if (word not in sw)] # remove stop words

    return words

df['speech_processed'] = df['Speech'].apply(preprocess)

CPU times: user 1min 48s, sys: 1.66 s, total: 1min 50s
Wall time: 1min 51s


**Stemming**

In [41]:
%%time

from nltk.stem import PorterStemmer

pstemmer = PorterStemmer()
find_stem = lambda words: [pstemmer.stem(word) for word in words]
df['speech_stem'] = df['speech_processed'].apply(find_stem)

CPU times: user 3min 3s, sys: 2.03 s, total: 3min 5s
Wall time: 3min 7s


**Lemmatization**

In [42]:
%%time

from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
find_lemma = lambda words: [wnl.lemmatize(word) for word in words]
df['speech_lemma'] = df['speech_processed'].apply(find_stem)


CPU times: user 3min 3s, sys: 2.01 s, total: 3min 5s
Wall time: 3min 7s
