In [6]:
import os
import numpy as np
import pandas as pd

sessions = np.arange(25, 76)
data = []

# Load speeches: one row per speech file found in each session directory.
# The year is derived as 1945 + session number, matching the directory
# names ("Session 25 - 1970", ...).
for session in sessions:
    year = 1945 + session
    directory = "./TXT/Session " + str(session) + " - " + str(year)
    # os.listdir() returns entries in arbitrary order; sort so the resulting
    # row order (and any later "first N rows" sample) is reproducible.
    for filename in sorted(os.listdir(directory)):
        # Skip hidden entries *before* opening them, so nothing is opened
        # needlessly and a hidden directory cannot make open() fail.
        if filename.startswith("."):
            continue
        # The text before the first underscore is used as the country code.
        country_code = filename.split("_")[0]
        # The context manager closes each file handle as soon as it is read.
        # NOTE(review): assumes the corpus files are UTF-8 (optionally with a
        # BOM, which "utf-8-sig" strips) -- confirm against the data set.
        with open(os.path.join(directory, filename), encoding="utf-8-sig") as f:
            data.append([session, year, country_code, f.read()])


df_speech = pd.DataFrame(data, columns=['Session','Year','ISO-alpha3 Code','Speech'])

In [7]:
# Read the country-code lookup table (semicolon-separated CSV)
df_codes = pd.read_csv("UNSD — Methodology.csv", sep=';')

In [8]:
# Read the climate claims file (JSON Lines: one record per line).
# Source https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html
climate = pd.read_json('data/climate-fever-dataset-r1.jsonl', lines=True)
# Convert every claim to a plain Python string
claim_text = climate['claim'].map(str)
climate['claim'] = claim_text
climate

Unnamed: 0,claim_id,claim,claim_label,evidences
0,0,Global warming is driving polar bears toward e...,SUPPORTS,[{'evidence_id': 'Extinction risk from global ...
1,5,The sun has gone into ‘lockdown’ which could c...,SUPPORTS,"[{'evidence_id': 'Famine:386', 'evidence_label..."
2,6,The polar bear population has been growing.,REFUTES,"[{'evidence_id': 'Polar bear:1332', 'evidence_..."
3,9,Ironic' study finds more CO2 has slightly cool...,REFUTES,"[{'evidence_id': 'Atmosphere of Mars:131', 'ev..."
4,10,Human additions of CO2 are in the margin of er...,REFUTES,[{'evidence_id': 'Carbon dioxide in Earth's at...
...,...,...,...,...
1530,3125,About 60% of the warming observed from 1970 to...,NOT_ENOUGH_INFO,"[{'evidence_id': 'Climate variability:103', 'e..."
1531,3127,"""Skeptics hope that Postma’s alternative therm...",NOT_ENOUGH_INFO,"[{'evidence_id': 'Meteorology:4', 'evidence_la..."
1532,3130,"""There are other possible causes for climate c...",SUPPORTS,[{'evidence_id': 'Attribution of recent climat...
1533,3131,We don't need a high heat flow - just a high t...,NOT_ENOUGH_INFO,"[{'evidence_id': 'Earth:161', 'evidence_label'..."


In [9]:
# Attach the country metadata to every speech. The join key is spelled out so
# the merge cannot silently pick up additional same-named columns.
df_un_merged = df_speech.merge(df_codes, on="ISO-alpha3 Code")

# Reorder: move the main columns to the front and keep every other column
# behind them. The result is assigned back; a bare selection expression would
# be evaluated and then discarded, leaving the frame unchanged.
leading_columns = ["Country or Area", "Region Name", "Sub-region Name",
    "ISO-alpha3 Code", "Least Developed Countries (LDC)", "Session", "Year", "Speech"]
remaining_columns = [col for col in df_un_merged.columns if col not in leading_columns]
df_un_merged = df_un_merged[leading_columns + remaining_columns]

# Reindex
df_un_merged = df_un_merged.set_index(["Year", "ISO-alpha3 Code"])

# Small version for now: keep the first 1000 rows by position. The explicit
# .copy() makes this an independent frame, so adding columns to it later does
# not trigger SettingWithCopyWarning.
df_un_merged = df_un_merged.iloc[:1000].copy()
df_un_merged.shape

(1000, 16)

In [10]:
from nltk import word_tokenize
from nltk.corpus import stopwords

# Stopword sets that have already been loaded, keyed by language, so the NLTK
# corpus is read once instead of once per preprocess() call.
_STOPWORD_SETS = {}


def _stopword_set(language="english"):
    """Return NLTK's stopword list for ``language`` as a cached frozenset."""
    if language not in _STOPWORD_SETS:
        _STOPWORD_SETS[language] = frozenset(stopwords.words(language))
    return _STOPWORD_SETS[language]


def preprocess(words, stop_words=None):
    """Lower-case tokens and drop stopwords and non-alphabetic tokens.

    Parameters
    ----------
    words : iterable of str
        Tokens to clean, e.g. the output of ``word_tokenize``.
    stop_words : collection of str, optional
        Lower-case words to remove. When omitted, NLTK's English stopword
        list is used (loaded once and cached).

    Returns
    -------
    list of str
        The lower-cased tokens, in their original order, that are not
        stopwords and for which ``str.isalpha()`` is true.
    """
    if stop_words is None:
        stop_words = _stopword_set("english")
    elif not isinstance(stop_words, (set, frozenset)):
        # Hash-based membership tests are O(1) per token instead of a linear
        # scan through a list.
        stop_words = frozenset(stop_words)

    clean = []
    for w in words:
        w = w.lower()
        if w not in stop_words and w.isalpha():
            clean.append(w)
    return clean

# Tokenize every speech (this step takes long), then clean the tokens with preprocess()
tokenized_speeches = df_un_merged['Speech'].apply(word_tokenize)
df_un_merged['Tokenized'] = tokenized_speeches
df_un_merged['Clean'] = df_un_merged['Tokenized'].apply(preprocess)