# Set up environment

In [None]:
import sys
# Show the installation prefix of the running interpreter, to confirm which
# Python environment this notebook kernel is using.
print(sys.prefix)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import urllib, os,sys, pdfplumber, glob, requests, wordcloud, re, dateparser

# Set up working dir

In [None]:
# Resolve the directory two levels above the current working directory and
# use it as the base of the project tree.
base_dir = os.path.realpath('../..')
print(base_dir)
# The backslash is escaped explicitly: '\D' is not a valid escape sequence and
# only worked because Python passes unknown escapes through (with a warning in
# recent versions). The resulting path is unchanged.
data_dir = base_dir + '\\Data'

# Load text files with date

In [None]:
# Input folder for this run: the 'auto_download_20230118' subfolder of
# 'TK_commissieVWS' inside data_dir (Windows-style path separators).
in_dir = data_dir + '\\TK_commissieVWS\\auto_download_20230118'

In [None]:
# Collect every .txt file in the 'text' subfolder of in_dir and display how
# many were found.
all_files = glob.glob(in_dir + '\\text\\*.txt')
len(all_files)

In [None]:
# Build one row per text file. The metadata is taken from the file name, which
# must consist of exactly five '_'-separated parts:
# '<3-char prefix + number>_<doc id>_<date>_<name part 1>_<name part 2>'.
# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every iteration.
all_text_columns = ['file_nr','doc_nr','doc_id','date','name','text','nr_of_words']
all_text_rows = []
for fi,fpath in enumerate(all_files):
    # Progress indicator every 10 files
    if np.mod(fi,10) == 0:
        print(fi, end = ', ')
    fname = fpath.split('\\')[-1]
    [doc_nr, doc_id, doc_date, doc_name1, doc_name2] = fname.split('_')
    doc_nr = int(doc_nr[3:]) # Drop the first 3 characters that precede the number
    doc_date = dateparser.parse(doc_date) # None when the date cannot be parsed; filled in later
    doc_name = doc_name1 + doc_name2
    try:
        # Context manager so the file handle is always closed
        with open(fpath, "r", encoding="utf8") as f:
            doc_text = f.read().lower() # Important: removed the wiping of newlines here
    except (OSError, UnicodeError):
        # Unreadable or wrongly encoded file: report it and move on
        print('Skipping %s'%fname)
        continue
    doc_nr_of_words = len(re.findall(r'\w+', doc_text))
    all_text_rows.append([fi, doc_nr, doc_id, doc_date, doc_name, doc_text, doc_nr_of_words])
all_text = pd.DataFrame(all_text_rows, columns = all_text_columns)

In [None]:
# Preview the first rows of the loaded documents table.
all_text.head()

In [None]:
# Show an example: the text of the first document whose date compares equal to
# '2021-09-15'. Raises IndexError when no row matches.
all_text.loc[all_text.date == '2021-09-15','text'].iloc[0]

## Add missing dates

In [None]:
def _detect_n_cols(page, n_slivers = 100):
    """Estimate whether `page` is laid out in one or two text columns.

    The page is cut into `n_slivers` vertical strips of equal width. A strip
    counts as "text" when it holds more characters than the mean over all
    strips. Two columns are reported only when the strips show exactly two
    text runs (four text/no-text transitions) AND the middle strip is below
    the mean (i.e. whitespace between the columns); otherwise one column.
    """
    sliver_edges = np.round(np.linspace(0,1,n_slivers+1), decimals = 3)
    page_width = float(page.width)
    page_height = float(page.height)
    sliver_contents = []
    for sliver_start, sliver_end in zip(sliver_edges[:-1],sliver_edges[1:]):
        bbox = (sliver_start*page_width, 0, sliver_end*page_width, page_height)
        sliver_text = page.crop(bbox).extract_text()
        sliver_text = sliver_text.replace('\n','') if sliver_text is not None else ''
        sliver_contents.append(len(sliver_text))

    sliver_yesno = np.asarray(sliver_contents) > np.mean(sliver_contents)
    # Count text <-> whitespace transitions between neighbouring strips
    n_transitions = int(np.sum(sliver_yesno[1:] != sliver_yesno[:-1]))
    n_cols = int(n_transitions / 2)
    # Double check: the middle sliver must be below the mean text density
    middle_is_empty = not sliver_yesno[n_slivers // 2]
    return 2 if (n_cols == 2 and middle_is_empty) else 1


def _extract_content_text(content_pages, n_cols, xtol, left_bbox, right_bbox):
    """Extract and join the text of `content_pages` using x tolerance `xtol`.

    With two columns each page is read left half first, then right half.
    Pages for which no text is returned are reported and left out.
    """
    verslag_pages = []
    for page in content_pages:

        # Read by column
        if n_cols == 2:
            left_col_text = page.crop(left_bbox).extract_text(x_tolerance = xtol)
            if left_col_text is not None:
                right_col_text = page.crop(right_bbox).extract_text(x_tolerance = xtol)
                # An empty right column must not break the concatenation
                page_text = left_col_text + ' ' + (right_col_text or '')
            else:
                page_text = left_col_text # If left column is empty, we skip the right column
        elif n_cols == 1:
            page_text = page.extract_text(x_tolerance = xtol)
        else:
            raise ValueError('Impossible number of columns: %i'%n_cols)

        # Check for empty pages
        if page_text is None:
            print('Skipping %s'%page)
        else:
            verslag_pages.append(page_text)
    return ' '.join(verslag_pages)


def find_xtol(fpath):
    """Find an x tolerance for pdfplumber text extraction that yields sane words.

    The first two pages of the PDF are ignored; the remaining "content pages"
    are extracted with x_tolerance 3 and, if the mean word length (characters
    per space-separated token) exceeds 40, once more with x_tolerance 1.

    Parameters
    ----------
    fpath : str
        Path to the PDF file (the part after the last backslash is printed).

    Returns
    -------
    int or None
        3 or 1 when that tolerance gives a mean word length of at most 40;
        None when the document has no content pages or when the words are
        still too long at tolerance 1 (the document should then be skipped).
    """

    # Find
    print('>> %s'%fpath.split('\\')[-1])

    # Read; the context manager makes sure the PDF file is closed again
    with pdfplumber.open(fpath) as pdf:
        content_pages = pdf.pages[2:]
        if not content_pages:
            # Nothing left after dropping the first two pages
            print('No content pages found. Skipping doc...')
            return None

        ### 2. Determine number of columns in page somewhere in the middle
        if len(content_pages) > 5:
            page = content_pages[int(len(content_pages)/2+2)] # This is to ensure that we don't evaluate the first page but also don't go over
        elif len(content_pages) > 1:
            page = content_pages[1]
        else:
            page = content_pages[0]

        n_cols = _detect_n_cols(page)
        print('Number of columns: %i'%n_cols)

        ### 3. Extract text with appropriate x tolerance

        # The half-page boxes are derived from the sampled page and reused for all pages
        left_bbox = (0, 0, 0.5*float(page.width), float(page.height))
        right_bbox = (0.5*float(page.width), 0, float(page.width), float(page.height))

        # Try the default-like tolerance first, then the stricter one
        for xtol in (3, 1):
            print('Reading pdf...')
            verslag_text = _extract_content_text(content_pages, n_cols, xtol, left_bbox, right_bbox)

            # Check mean word length and repeat if necessary
            mean_word_length = np.divide(len(verslag_text), len(verslag_text.split(' ')))
            if mean_word_length > 40:
                if xtol == 1: # If the xtol is already at 1, skip this file
                    print('Mean word length = %.2f, still too long. Skipping doc...'%mean_word_length)
                    return None
                else: # Otherwise lower the xtol to 1
                    print('Mean word length = %.2f, lowering space x tolerance to 1.'%mean_word_length)
            else:
                print('Continuing with mean word length = %.2f.'%mean_word_length)
                return xtol

    return None

In [None]:
# Document ids whose date is still missing (null) in all_text.
ids_to_find_date = all_text.loc[pd.isnull(all_text['date']),'doc_id'].tolist()
# Plain loop instead of a list comprehension: the comprehension was only used
# for its print side effect and built a throwaway list of None values.
for a in ids_to_find_date:
    print(a, end = ', ')

In [None]:
# Recover missing dates from the cover page of the original PDF.
# Only the bare date ("<1-2 digit day> <word> <4 digit year>") is matched:
# sometimes it's "heeft op donderdag 25 mei 2016 overleg..." or "hebben op 4 juni
# gesprekken gevoerd..." so the surrounding text is not consistent enough to
# match anything else than the date.
str_to_find = r'\s+\d\d?\s+\w+\s+\d\d\d\d'
for doc_id in ids_to_find_date:
    print('Doc ID = %s'%doc_id)
    matches = glob.glob(in_dir + '\\original\\*%s*.pdf'%doc_id)
    if len(matches)>1:
        raise ValueError('Too many matches')
    if not matches:
        # Explicit error instead of an opaque IndexError on matches[0]
        raise ValueError('No matching pdf found for doc id %s'%doc_id)

    fpath = matches[0]
    xtol = find_xtol(fpath)
    if xtol is None:
        # find_xtol gave up on the body text; still try the cover page with
        # pdfplumber's default x tolerance
        xtol = 3

    # Context manager so the PDF file is closed after reading the cover page
    with pdfplumber.open(fpath) as pdf:
        cover_page_text = pdf.pages[0].extract_text(x_tolerance = xtol)
    cover_page_text = (cover_page_text or '').replace('\n', ' ')

    # Start searching 30 characters after 'astgesteld' (presumably to skip the
    # date that directly follows "Vastgesteld" - TODO confirm). When the marker
    # is absent, search the whole cover page instead of silently slicing from
    # index 29 (find() returns -1).
    marker_pos = cover_page_text.find('astgesteld')
    after_vastgesteld = cover_page_text[marker_pos+30:] if marker_pos >= 0 else cover_page_text

    dates_found = re.findall(str_to_find, after_vastgesteld)
    if not dates_found:
        print('No date found on cover page, date not saved')
        continue
    date_formatted = dateparser.parse(dates_found[0])
    if date_formatted is None:
        print('Could not parse date "%s", date not saved'%dates_found[0])
        continue
    date_string = date_formatted.strftime('%Y.%m.%d')
    print('Datum vergadering: %s'%date_string)

    all_text.loc[all_text['doc_id']==doc_id,'date'] = date_formatted
    print('Date saved')

In [None]:
# Persist the documents table (including the recovered dates) next to the
# text files; the DataFrame index is written as the first CSV column.
all_text.to_csv(in_dir + '\\text\\all_text.csv')

## Split by speaking turn, label party

In [None]:
# Reload the saved documents table, using the first CSV column as the index.
# NOTE(review): no parse_dates is passed, so 'date' presumably comes back as
# plain strings rather than datetimes - confirm this is intended.
all_text = pd.read_csv(in_dir + '\\text\\all_text.csv', index_col = 0)

This function splits text into speaking turns using the format of the verslagen, e.g. "de heer Van Haga (FVD):" or "mevrouw ... (party):". It also replaces newline characters with spaces.

In [None]:
def split_speaking_turns_explicit(in_text, verbose = False):
    """Split a (lower-cased) verslag text into speaking turns.

    A turn starts at a speaker marker: "de voorzitter:", "mevrouw ... ):",
    "de heer ... ):" or "minister ... :". Everything before the first marker
    is discarded. In each turn, hyphenated line breaks ("-\\n") are joined and
    remaining newlines are replaced by spaces.

    Parameters
    ----------
    in_text : str
        Full text of one document. Non-string input (e.g. NaN for an empty
        text read back from CSV) is treated as an empty text.
    verbose : bool, optional
        When True, print the turn numbers while parsing and a summary at the end.

    Returns
    -------
    pandas.DataFrame
        One row per turn with columns 'turn_nr', 'gender', 'name', 'party'
        and 'text'. For chair and minister turns, name is 'voorzitter' /
        'minister' and gender and party are 'n/a'. The columns are present
        even when no turn was found.
    """
    out_columns = ['turn_nr','gender','name','party','text']
    if not isinstance(in_text, str):
        in_text = ''

    # Split the text by all possible speakers: de heer, mevrouw, voorzitter, minister.
    in_text = in_text.replace('tweede kamer, vergaderjaar','')
    turns_list = re.split(r'(de voorzitter:)|(mevrouw.*?\):)|(de heer.*?\):)|(minister.*?\:)', in_text)
    # Each match yields one filled capture group and three None placeholders
    turns_list = [t for t in turns_list if t is not None]
    # What remains alternates: preamble, speaker, text, speaker, text, ...
    speakers = turns_list[1::2]
    texts = turns_list[2::2]

    # Collect rows in a list and build the frame once (DataFrame.append was
    # removed in pandas 2.0 and copied the frame on every turn)
    rows = []
    for ti, (speaker, raw_text) in enumerate(zip(speakers, texts)):
        turn_text = raw_text.replace('-\n','').replace('\n',' ')
        if verbose:
            print(ti,end = ',')
        if 'voorzitter' in speaker:
            name_clean = 'voorzitter'
            gender = 'n/a'
            party = 'n/a'
        elif 'minister' in speaker:
            name_clean = 'minister'
            gender = 'n/a'
            party = 'n/a'
        else:
            name, paren, rest = speaker.partition('(')
            if paren:
                party = rest.split(')')[0]
            else:
                # Marker without an opening parenthesis: no party can be read
                party = 'n/a'
                name = name.rstrip('):')
            gender = 'mevrouw' if 'mevrouw' in speaker else 'de heer'
            name_parts = name.split(gender)
            # Fall back to the full name when the title is not part of it
            name_clean = (name_parts[1] if len(name_parts) > 1 else name).strip(' ')
        rows.append([ti, gender, name_clean, party, turn_text])

    turns_df_clean = pd.DataFrame(rows, columns = out_columns)
    if verbose:
        print('\n\n%i speaking turns found for parties %s'%(turns_df_clean.shape[0], sorted(turns_df_clean['party'].unique())))
    return turns_df_clean

##### Loop and parse all verslagen

In [None]:
# Work on a copy so that columns added while parsing do not touch all_text.
verslagen_in = all_text.copy()

In [None]:
# Start with an empty table; the speaking turns of every document are added to it below.
all_speaking_turns = pd.DataFrame()

In [None]:
# Parse every document into speaking turns, tag each turn with the document's
# date / number / id, and record the number of turns per document.
# The per-document frames are gathered in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and copied the growing table on
# every iteration.
turn_frames = []
for di,doc_info in verslagen_in.iterrows():
    speaking_turns = split_speaking_turns_explicit(doc_info['text'])
    speaking_turns['date'] = doc_info['date']
    speaking_turns['doc_nr'] = doc_info['doc_nr']
    speaking_turns['doc_id'] = doc_info['doc_id']
    n_speaking_turns = int(np.shape(speaking_turns)[0])
    turn_frames.append(speaking_turns)
    print(di,'Doc %i done, date %s, %i speaking turns found'%(
        doc_info['doc_nr'],doc_info['date'],n_speaking_turns))
    verslagen_in.loc[di,'nr_of_turns'] = n_speaking_turns
# Keep whatever all_speaking_turns already held and add the new turns after it
all_speaking_turns = pd.concat([all_speaking_turns] + turn_frames, ignore_index=True)
all_speaking_turns.head()

In [None]:
# Sanity check: list the distinct party labels that were parsed.
all_speaking_turns['party'].unique()

In [None]:
# Number of speaking turns (rows) and columns in the combined table.
all_speaking_turns.shape

## Save

In [None]:
# Write the combined speaking turns to in_dir. The backslash is escaped
# explicitly: '\s' is not a valid string escape sequence and only worked
# because Python passes unknown escapes through (with a warning in recent
# versions). The resulting path is unchanged.
all_speaking_turns.to_csv(in_dir + '\\speaking_turns.csv')