# Importing and cleaning of speeches

In [1]:
import requests
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
import csv
import matplotlib.pyplot as plt

### Preparing the CSV file with the speech URLs for downloading

In [2]:
# English stop words plus three extra tokens to drop
# (presumably leftovers of HTML entities/markup such as "&quot;" -- verify).
stop_words = stopwords.words('english') + ["ad", "quot", "ca"]

# Vocabulary used later to discard tokens that are not English words.
nltk.download('words')
words = set(nltk.corpus.words.words())

[nltk_data] Downloading package words to
[nltk_data]     /Users/ignacyklimont/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [3]:
# To reload the cleaned speeches later, read 'speeches_clean.csv' the same way:
#     with open('speeches_clean.csv', newline='') as file:
#         rows = list(csv.reader(file))

# Load every row of the URL list into memory.
with open('speeches_url.csv', newline='') as file:
    data = list(csv.reader(file))

# Start with the first victory speech (row 28); the last entry is not a speech.
data = data[28:-1]

In [4]:
# Remove unnecessary items from each row so that only ['title', 'url', 'date']
# remains, and drop every row that does not end up with exactly three fields.
# The list is rebuilt in a single pass instead of deleting from it while
# iterating: `data.remove(row)` searches by value, which is O(n) per removal
# and can delete an earlier, equal row rather than the one being inspected.
unwanted_markers = ('mp3', 'pdf', 'PDF')
filtered_rows = []
for row in data:
    fields = [
        x for x in row
        if x != '' and not any(marker in x for marker in unwanted_markers)
    ]
    if len(fields) == 3:
        filtered_rows.append(fields)
# Slice assignment keeps `data` the same list object, as the original did.
data[:] = filtered_rows

### Downloading speeches using urls

In [None]:
# Get the speeches from the website (this step takes 3-5 min)

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:25.0) Gecko/20100101 Firefox/25.0'}

# Each entry is (title, date, raw page text); rows are ['title', 'url', 'date'].
speeches_raw = []
for row in data:
    # Without a timeout a single stalled connection would block the run forever.
    r = requests.get(row[1], headers=headers, timeout=30)
    speeches_raw.append((row[0], row[2], r.text))
    

### Cleaning speeches

In [None]:
def cleaning_speeches(speeches):
    """Strip markup and noise from one raw speech page.

    The text is lower-cased; HTML tags, parenthesised and square-bracketed
    asides are removed; punctuation is replaced by spaces; tokens containing
    a digit are dropped; and finally stop words and alphabetic tokens that
    are not in the English word list are filtered out (non-alphabetic tokens
    are kept).

    Relies on the module-level ``stop_words`` and ``words`` collections.

    Args:
        speeches: Raw text (HTML) of a single speech.

    Returns:
        The remaining tokens joined by single spaces.
    """
    speeches = speeches.lower()
    # Match one tag at a time. A greedy '<.*>' swallows everything between the
    # first '<' and the last '>' on a line, i.e. the speech text between tags.
    # Tags become a space so that words separated only by markup stay apart.
    speeches = re.sub(r'<[^>]*>', ' ', speeches)
    speeches = re.sub(r'\(.*?\)', '', speeches)
    speeches = re.sub(r'\[.*?\]', '', speeches)
    speeches = re.sub('[%s]' % re.escape(string.punctuation), ' ', speeches)
    speeches = re.sub(r'\w*\d\w*', '', speeches)
    # Line breaks and tabs become a space; deleting them outright would glue
    # the last word of one line to the first word of the next.
    speeches = re.sub(r'[\n\r\t]+', ' ', speeches)

    # Remove stop words (set lookup instead of scanning the list per token).
    stop_set = set(stop_words)
    tokens = [w for w in word_tokenize(speeches) if w not in stop_set]
    speeches_wo_stopwords = ' '.join(tokens)

    # Remove non-English words; tokens that are not purely alphabetic are kept.
    speeches_wo_nonwords = [
        w for w in nltk.wordpunct_tokenize(speeches_wo_stopwords)
        if w.lower() in words or not w.isalpha()
    ]
    return ' '.join(speeches_wo_nonwords)

In [None]:
# Clean every downloaded page, keeping its title and date alongside the text.
# The first and last 180 characters of the cleaned text are cut off because
# they are not part of the speech (unfortunately the real amount differs from
# speech to speech, so this trim is only approximate).
speeches_clean = [
    (title, date, cleaning_speeches(raw_text)[180:-180])
    for title, date, raw_text in speeches_raw
]

In [None]:
# Show the cleaned text of the first speech (notebook cell output).
speeches_clean[0][2]

In [None]:
# Tabulate the cleaned speeches. The column names are passed to the
# constructor so that an empty `speeches_clean` yields an empty frame with the
# right columns; assigning `df.columns` afterwards raises a length-mismatch
# ValueError in that case.
df = pd.DataFrame(speeches_clean, columns=['Title', 'Date', 'Text'])

In [None]:
# Write the speeches without index or header row, so the file can be read back
# as plain rows with csv.reader. `header` expects a bool (or a list of names);
# the previous empty string only worked because it is falsy.
df.to_csv('speeches_clean.csv', index=False, header=False)