In [4]:
from nltk.corpus import stopwords
from ftfy import fix_text
import spacy
import re
import json
import pickle
import numpy as np
import pandas as pd

import os

from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer

In [3]:
class TextCleaner:
    """Text cleaner for speech data.

    Chains a set of regex/string based normalisation steps ("basic" mode)
    and can additionally keep only selected parts of speech using a spaCy
    pipeline ("advanced" mode). Use :meth:`clean` as the public entry point.
    """

    # Maps the public language code to the language name that is used both
    # for the NLTK stop-word list and as key into `_LANG_SPACY_MODEL_SOURCE`.
    _SUPPORTED_LANGUAGES = {
        "en": "english",
    }
    _LANG_SPACY_MODEL_SOURCE = {
        "english": "en_core_web_sm",
    }
    # spaCy models are loaded lazily, the first time a language is requested,
    # so nothing unnecessary is kept in memory. The dict lives on the class,
    # so every instance shares the same loaded model per language.
    _nlp_objects = {}

    def __init__(self, lang_code: str):
        """Prepare a cleaner for the given language.

        Args:
            lang_code (str): A key of `_SUPPORTED_LANGUAGES` (e.g. "en").

        Raises:
            ValueError: If `lang_code` is not a supported language code.
            AssertionError: If the language has no entry in
                `_LANG_SPACY_MODEL_SOURCE`.
        """
        if lang_code not in self._SUPPORTED_LANGUAGES:
            raise ValueError(f"{lang_code} is not suported")

        lang = self._SUPPORTED_LANGUAGES[lang_code]
        assert (
            lang in self._LANG_SPACY_MODEL_SOURCE
        ), "Spacy model source has not be defined for supported language. Please set a the source to `_LANG_SPACY_MODEL_SOURCE`."

        # Load the spaCy model only once per language and cache it on the class.
        if lang not in self._nlp_objects:
            self._nlp_objects[lang] = spacy.load(self._LANG_SPACY_MODEL_SOURCE[lang])

        self.lang = lang
        self._nlp_spacy_model = self._nlp_objects[lang]
        # A frozenset gives constant-time membership tests; the stop-word check
        # runs once per token of every document.
        self._stopwords = frozenset(stopwords.words(lang))

    def _fix_encoding(self, text):
        """Return `text` passed through ftfy's `fix_text`."""
        return fix_text(text)

    def _remove_non_alpha_non_latin(self, text):
        """Drop characters outside the listed code-point ranges, then keep only purely alphabetic tokens.

        Note: `str.isalpha()` is true only when EVERY character of the token
        is alphabetic, so tokens containing digits or punctuation are dropped
        entirely.
        """
        new_text = re.sub(
            r"[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]",
            "",
            text,
        )
        text_parts = [w for w in new_text.split() if w.isalpha()]
        return " ".join(text_parts)

    def _remove_new_line(self, text):
        """Replace line separators (U+2028) and newlines with a single space."""
        return re.sub(r"\u2028|\n", " ", text)

    def _remove_small_words(self, text):
        """Remove standalone runs of one or two word characters (`\\w`)."""
        return re.sub(r"\b\w{1,2}\b", "", text)  # fewer than 3 chars: remove

    def _remove_urls(self, text):
        """Remove URL-like tokens from `text`.

        The pattern does not require a scheme, so any `name.suffix` shaped
        token matches.
        """
        # https://stackoverflow.com/a/3809435/6142020
        url_regex = r"[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"
        return re.sub(url_regex, "", text)

    def _remove_punctuation(self, text):
        """Turn dashes into spaces and strip everything that is not a word character or whitespace."""
        # Replace hyphen(-) ndash(–) mdash(—) with a space so that the joined
        # words stay separate tokens.
        new_text = re.sub("—|-|–", " ", text)
        # Keep only words & spaces
        new_text = re.sub(r"[^\w\s]", "", new_text)
        return new_text

    def _lower_text(self, text: str) -> str:
        """Return `text` lower-cased."""
        return text.lower()

    def _remove_stop_words(self, text):
        """Drop space-separated tokens whose lower-cased form is a stop word.

        Splitting is done on the single space character only, so tokens
        separated by other whitespace are compared as one unit.
        """
        text_parts = [w for w in text.split(" ") if w.lower() not in self._stopwords]
        return " ".join(text_parts)

    def _get_parts_of_speech(self, document, text, tags):
        """Return the text of every token in `document` whose `pos_` is in `tags`.

        `text` is not used; it is kept so the signature stays unchanged.
        """
        return [token.text for token in document if token.pos_ in tags]

    def _basic_clean(self, text):
        """Apply all basic cleaning steps in sequence.

        Order: URLs -> stop words -> newlines -> encoding fix -> punctuation
        -> short words -> lower-casing. Each step receives the output of the
        previous one.

        Args:
            text (str): The text to clean

        Returns:
            str: The clean text
        """
        new_text = self._remove_urls(text)
        new_text = self._remove_stop_words(new_text)
        new_text = self._remove_new_line(new_text)
        # Must operate on `new_text`: passing the original `text` here would
        # silently throw away the three steps above.
        new_text = self._fix_encoding(new_text)
        new_text = self._remove_punctuation(new_text)
        new_text = self._remove_small_words(new_text)
        new_text = self._lower_text(new_text)
        return new_text

    def _clean_all_but_parts_of_speech(self, text, tags):
        """Keeps only the defined tags/parts of speech

        The text is first passed through `_basic_clean`; when `tags` is
        non-empty the result is run through the spaCy model and only tokens
        with a matching part-of-speech tag are kept.

        Args:
            text (str): The text to clean
            tags (list): Parts/Tags of speech

        Returns:
            str: The clean text
        """
        new_text = self._basic_clean(text)
        if tags:
            # Only part-of-speech tags are read, so NER and the parser are
            # disabled for this call.
            document = self._nlp_spacy_model(new_text, disable=["ner", "parser"])
            new_text = " ".join(self._get_parts_of_speech(document, new_text, tags))
        return self._lower_text(new_text)

    def clean(self, text: str, mode: str) -> str:
        """Cleans the provided text based on the mode.

        Args:
            text (str): The text to clean
            mode (str): Any of "basic" or "advanced"

        Returns:
            str: The cleaned text

        Raises:
            KeyError: If `mode` is neither "basic" nor "advanced".
        """

        cleaners = {
            "advanced": lambda text: self._clean_all_but_parts_of_speech(
                text, ["NOUN", "VERB", "PROPN", "SYM"]
            ),
            "basic": lambda text: self._basic_clean(text),
        }

        cleaned_text = cleaners[mode](text)
        return cleaned_text

In [5]:
# Sessions 25..75; the directory of session N is named "Session N - <1945+N>".
sessions = np.arange(25, 76)
data=[]

for session in sessions:
    year = 1945+session
    directory = "./TXT/Session "+str(session)+" - "+str(year)
    for filename in os.listdir(directory):
        if filename[0]==".": #ignore hidden files (checked before opening anything)
            continue
        # The part of the filename before the first "_" is stored as the country code.
        splt = filename.split("_")
        # `with` guarantees the handle is closed after reading; the loop
        # visits every file in every session directory.
        with open(os.path.join(directory, filename)) as f:
            data.append([session, year, splt[0], f.read()])


df_speech = pd.DataFrame(data, columns=['Session','Year','ISO-alpha3 Code','Speech'])

In [6]:
# Preview the first rows of the loaded speeches as a sanity check.
df_speech.head()

Unnamed: 0,Session,Year,ISO-alpha3 Code,Speech
0,25,1970,AUT,155.\t May I begin by expressing to Ambassado...
1,25,1970,MEX,"33.\t Mr. President, I take great pleasure in..."
2,25,1970,COG,122.\t I cannot begin my intervention without...
3,25,1970,DZA,1. The delegation of Algeria is very pleased ...
4,25,1970,LKA,"176.\t Mr. President, the delegation of Ceylon..."


In [9]:
# One English cleaner is created and reused for every speech.
cleaner = TextCleaner(lang_code="en")

# "advanced" mode: basic cleaning followed by the part-of-speech filter.
df_speech['Cleaned_Speech'] = df_speech['Speech'].apply(
    lambda speech: cleaner.clean(text=speech, mode="advanced")
)

In [10]:
# Display the frame (pandas truncates the middle rows) to compare Speech with Cleaned_Speech.
df_speech

Unnamed: 0,Session,Year,ISO-alpha3 Code,Speech,Cleaned_Speech
0,25,1970,AUT,155.\t May I begin by expressing to Ambassado...,begin expressing ambassador hambro behalf dele...
1,25,1970,MEX,"33.\t Mr. President, I take great pleasure in...",president take pleasure following tradition co...
2,25,1970,COG,122.\t I cannot begin my intervention without...,begin intervention referring tribute represent...
3,25,1970,DZA,1. The delegation of Algeria is very pleased ...,delegation algeria see session assembly held p...
4,25,1970,LKA,"176.\t Mr. President, the delegation of Ceylon...",president delegation ceylon see preside sessio...
...,...,...,...,...,...
8476,75,2020,HRV,"Mr President, Excellencies\nAll protocol obser...",president excellencies protocol observed year ...
8477,75,2020,GAB,"Mr. President, Majesties,\nLadies and Gentleme...",president majesties ladies gentlemen heads sta...
8478,75,2020,MCO,"Mr. President of the General Assembly,\nMr. Se...",president general assembly secretary ladies ge...
8479,75,2020,AND,"Mr. President,\nMr. Secretary General,\nYour E...",president secretary excellencies ladies gentle...


In [11]:
# Persist the raw and cleaned speeches, relative to the current working directory.
# NOTE: with the default `index=True` the row index is written as a first,
# unnamed column; readers of this file need to account for it.
df_speech.to_csv('cleaned_model_speeches.csv')