### Clean raw HTML data
The notebook ```scrape-data.ipynb``` is used for scraping the raw HTML data.

Arguments are:
- YYYYY, XX = 20121, 105
- YYYYY, XX = 20131, 109
- YYYYY, XX = 20141, 99
- YYYYY, XX = 20151, 112
- YYYYY, XX = 20161, 112
- YYYYY, XX = 20171, 108
- YYYYY, XX = 20181, 99
- YYYYY, XX = 20191, 152
- YYYYY, XX = 20201, 43 (until Dec 29 2020)

In [None]:
from bs4 import BeautifulSoup
import nltk
import re
import os
import pandas as pd
from datetime import datetime
from collections import defaultdict, Counter

__Tokenize speeches__

__Notes__
* 20141: file 99 is removed because "mødet er aflyst" (Danish for "the meeting is cancelled")
* 20181: files 94, 95, 96, 97, 98 and 99 are removed because "mødet er aflyst" (Danish for "the meeting is cancelled")

In [None]:
# Tokenize the speeches of one parliamentary year.
#
# For every selected meeting file this cell parses the scraped HTML, extracts
# each non-empty speech, strips digits and basic punctuation, tokenizes the
# lower-cased text with nltk and writes one TSV per meeting with the columns
# (speaker, date, tokens), where tokens is a comma-joined token list.

# Danish stopword list; only needed by the optional stopword-filtering variant
# that is kept as a comment inside the loop below.
stopwords = nltk.corpus.stopwords.words('danish')

# y_ is the year id used in the file names, iter_ the number of meeting files
# for that year. Earlier settings are kept for reference:
#y_, iter_ = 20101,108
#y_, iter_ = 20111,102
#y_, iter_ = 20121,105 # <---- something is strangely formatted here
#y_, iter_ = 20131,109
#y_, iter_ = 20141,99
#y_, iter_ = 20151,112
#y_, iter_ = 20161,112
#y_, iter_ = 20171,108
#y_, iter_ = 20181,99
#y_, iter_ = 20191,152
#y_, iter_ = 20201,138 # not complete: disregard all data from file 27 and up (transcripts still under preparation) - need to redownload
y_, iter_ = 20211,75

# not used within this cell
odd = set()

# Compile the cleaning patterns once instead of once per speech.
# First pass: drop digits and the punctuation characters . ! ? , -
strip_pattern = re.compile(r'\d+|[\.!?,-]')
# Second pass: digits are already gone after the first pass, so in practice
# this only turns '/' into a space.
slash_pattern = re.compile(r'\d+|[/]')

# go through files
# NOTE(review): only file 2 is processed; the full-range loop is kept below.
#for ii in range(1,iter_+1):
for ii in [2]:

    # rows for the output table of this meeting
    dta = []

    # load url data; the context manager closes the file even if read() fails
    # NOTE(review): the file is decoded with the platform default encoding -
    # confirm how the scraper wrote these files before pinning an encoding.
    with open('../data_collection/meetings/url_data/%d-%03d.html' % (y_,ii),'r') as fileIn:
        htmlText = fileIn.read()

    # soupify it
    soup = BeautifulSoup(htmlText, 'html.parser')

    # date of the meeting, taken from the first 'video_date' span
    t = datetime.strptime(soup.find_all('span', attrs={'class':'video_date'})[0].text, '%d-%m-%Y kl. %H:%M')
    # NOTE(review): the format emits a literal 'H' before the hour
    # (e.g. '2021-10-05 H13:00'); kept as-is since downstream code may rely on it.
    timestamp = t.strftime('%Y-%m-%d H%H:%M')

    # go through individual speakers
    for speech in soup.find_all('div', attrs={'class': 'video-item-referat'}):

        # extract the text once; every access to .text re-walks the subtree
        raw = speech.text

        # only if there is text
        if raw.strip() == '':
            continue

        # the speaker is on the first line, after a non-breaking space
        # (raises ValueError if the first line has no '\xa0')
        speaker = raw.split('\n')[0].strip()
        speaker = speaker[speaker.index(u'\xa0')+1:]
        print(speaker)

        # everything from the first newline on is the speech itself
        start_index = raw.index('\n')

        # clean string
        txt = strip_pattern.sub('', raw.lower()[start_index:])

        # some more cleaning
        txt = slash_pattern.sub(' ', txt)

        # tokenize and remove stopwords
        #txt = [w for w in nltk.word_tokenize(txt) if len(w) > 2 and w not in stopwords]

        # keep text in original format (all tokens, stopwords included)
        txt = nltk.word_tokenize(txt)

        # save to list
        dta.append((speaker, timestamp, ','.join(txt)))

    # transform list to dataframe
    df = pd.DataFrame(dta,columns=['speaker','date','tokens'])

    # The former Python 2 step that utf-8-encoded 'unicode' columns was removed:
    # to_csv(encoding='utf-8') already handles the encoding, and turning the
    # values into bytes would write their b'...' repr on Python 3.

    # output tsv-file
    df.to_csv('tokenized_data/%d-%03d.tsv' % (y_,ii),sep='\t',index=False,encoding='utf-8')


## Prepare data by creating dataframes of all uncleaned speeches per year
This is a much better way of processing the data than the approach used above.

In [1]:
# Build one CSV of raw (uncleaned) speeches per parliamentary year.
#
# For every year id, all scraped meeting files whose name starts with that id
# are parsed; each non-empty speech contributes a (speaker, time, text) row.
# Files that fail to parse are reported and skipped (best effort). Rows
# collected from a file before it failed are kept.

# which paragraph classes to extract text from
p_class = ['Tekst','TekstLuft','TekstIndryk','Pind','PindTekst']

# collapses any run of two or more whitespace characters into one space
multi_space = re.compile(r"\s\s+")

for y_ in ['20101', '20111']:

    # find all files from that specific year
    files_ = sorted([f for f in os.listdir('../data_collection/meetings/url_data/') if f[:5] == y_])

    # speeches of the whole year are accumulated here
    data = []

    # becomes True once at least one file was parsed without error
    parsed_any = False

    # go through files
    for file_ in files_:
        try:
            # load url data; the context manager closes the file even on error
            # NOTE(review): decoded with the platform default encoding - confirm
            # how the scraper wrote these files before pinning an encoding.
            with open('../data_collection/meetings/url_data/' + file_,'r') as fileIn:
                htmlText = fileIn.read()

            # soupify it
            soup = BeautifulSoup(htmlText, 'html.parser')

            # date of the meeting, taken from the first 'video_date' span
            t = datetime.strptime(soup.find_all('span', attrs={'class':'video_date'})[0].text, '%d-%m-%Y kl. %H:%M')
            # NOTE(review): the format emits a literal 'H' before the hour;
            # kept as-is since downstream code may rely on it.
            timestamp = t.strftime('%Y-%m-%d H%H:%M')

            # go through individual speeches
            for speech in soup.find_all('div', attrs={'class': 'video-item-referat'}):

                # only if there is text
                if speech.text.strip() == '':
                    continue

                # the speaker name is the text of the first '#pv' link;
                # a speech without such a link raises IndexError and the
                # rest of this file is skipped by the handler below
                speaker = speech.find_all('a', attrs={'href': ['#pv']})[0].text.strip()

                # extract text from the selected paragraph classes only
                text = ' '.join(i.text.strip() for i in speech.find_all('p', attrs={'class': p_class}))
                text = multi_space.sub(" ", text)

                # save to list
                data.append((speaker, timestamp, text))

                # Debugging hint: to discover unexpected paragraph classes,
                # count the values of i['class'] over speech.find_all('p')
                # with a Counter and inspect speeches that use other classes.

        except Exception as err:
            # best effort: report the failing file and why, then carry on;
            # unlike a bare except this lets KeyboardInterrupt through
            print(file_, repr(err))
        else:
            parsed_any = True

    # Write the year's CSV once, instead of rewriting the growing file after
    # every meeting. A failure here is no longer swallowed by the per-file
    # handler, so a bad output path surfaces immediately.
    if parsed_any:
        # transform to dataframe
        df = pd.DataFrame(data,columns=['speaker','time','text'])

        # save to csv file
        df.to_csv('../data_collection/meetings/unprocessed_text/' + y_ + '.csv',index=False)
