# Load python packages

In [None]:
import sys
# Show the installation prefix of the Python environment this kernel runs in
print(sys.prefix)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import urllib, os,sys, pdfplumber, glob, requests, wordcloud, re, dateparser, tqdm

# Set up working dir

In [None]:
# Project root: two directory levels above the current working directory
base_dir = os.path.realpath('../..')
print(base_dir)
# The backslash is escaped explicitly: '\D' is an invalid escape sequence that
# newer Python versions warn about. The resulting string is unchanged.
data_dir = base_dir + '\\Data'

# List all files

In [None]:
# Input folder (source documents and their info CSV) and output folder (text files).
# Every backslash is escaped explicitly: '\T' is an invalid escape sequence that
# newer Python versions warn about. The resulting strings are unchanged.
in_dir = data_dir + '\\TK_commissieVWS\\auto_download_20230118\\original'
out_dir = data_dir + '\\TK_commissieVWS\\auto_download_20230118\\text'

In [None]:
# Load the document info table; the first CSV column becomes the index.
# ('\\d' is escaped explicitly: '\d' is an invalid escape sequence in a normal string.)
file_info = pd.read_csv(in_dir + '\\downloaded_doc_info.csv', index_col = 0)
# Preview the first and last three rows. DataFrame.append was removed in
# pandas 2.0, so the two slices are combined with pd.concat instead.
pd.concat([file_info.head(n=3), file_info.tail(n=3)])

In [None]:
# Dimensions of the info table as (number of rows, number of columns)
file_info.shape

# Function: extract the text of one PDF and write it to a text file

In [None]:
def _count_text_columns(page, n_slivers=100):
    """Return the number of text columns (1 or 2) detected on ``page``.

    The page is cut into ``n_slivers`` vertical strips of equal width and the
    number of characters extracted from each strip is compared with the mean
    over all strips. The page counts as two-column only when there are exactly
    two runs of above-mean strips AND the middle strip is at or below the mean
    (i.e. there is a gap between the columns); otherwise 1 is returned.
    """
    page_width = float(page.width)
    page_height = float(page.height)
    sliver_edges = np.round(np.linspace(0, 1, n_slivers + 1), decimals = 3)

    chars_per_sliver = []
    for sliver_start, sliver_end in zip(sliver_edges[:-1], sliver_edges[1:]):
        bbox = (sliver_start*page_width, 0, sliver_end*page_width, page_height)
        sliver_text = page.crop(bbox).extract_text()
        # extract_text() may return None for an empty strip: count it as 0 characters
        if sliver_text is None:
            chars_per_sliver.append(0)
        else:
            chars_per_sliver.append(len(sliver_text.replace('\n', '')))

    has_text = np.asarray(chars_per_sliver) > np.mean(chars_per_sliver)
    # Each run of text-dense strips contributes one rising and one falling edge
    n_transitions = int(np.count_nonzero(has_text[1:] != has_text[:-1]))
    n_text_runs = int(n_transitions / 2)
    middle_is_gap = not has_text[n_slivers // 2]
    return 2 if (n_text_runs == 2 and middle_is_gap) else 1


def _extract_content_text(content_pages, n_cols, xtol, left_bbox, right_bbox):
    """Extract the text of all ``content_pages`` and join the pages with spaces.

    With ``n_cols == 2`` each page is read as left half followed by right half
    (using the given bounding boxes); with ``n_cols == 1`` the page is read as
    a whole. Pages for which no text is returned are reported and skipped.
    """
    page_texts = []
    for page in content_pages:

        # Read by column
        if n_cols == 2:
            left_col_text = page.crop(left_bbox).extract_text(x_tolerance = xtol)
            if left_col_text is not None:
                right_col_text = page.crop(right_bbox).extract_text(x_tolerance = xtol)
                # A missing right column must not break the concatenation
                page_text = left_col_text + ' ' + (right_col_text or '')
            else:
                page_text = None # If left column is empty, we skip the right column
        elif n_cols == 1:
            page_text = page.extract_text(x_tolerance = xtol)
        else:
            raise ValueError('Impossible number of columns: %i'%n_cols)

        # Check for empty pages
        if page_text is None:
            print('Skipping %s'%page)
        else:
            page_texts.append(page_text)

    return ' '.join(page_texts)


def write_text_file(in_dir, in_fname, out_dir, out_fname):
    """Extract the body text of one PDF and write it to a UTF-8 text file.

    Steps:
      1. Open the PDF; page 1 is the cover page, pages 3 and onward are the
         content pages (page 2 is not read).
      2. Detect on one sample content page whether the layout has 1 or 2 columns.
      3. Extract the text of all content pages with x_tolerance = 3. If the mean
         "word" length (characters per space-separated token) exceeds 40, retry
         once with x_tolerance = 1; if it is still too long, the document is
         skipped and nothing is written.
      4. Find the first date-like string on the cover page after 'astgesteld'
         and print it. The date is only reported; it is not written to the
         output file.
      5. Write the extracted text to ``<out_dir>\\<out_fname>.txt``.

    Parameters
    ----------
    in_dir : str
        Folder containing the PDF.
    in_fname : str
        File name of the PDF; it is appended to ``in_dir`` as-is.
    out_dir : str
        Folder in which the text file is created.
    out_fname : str
        Name of the text file without the '.txt' extension.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the PDF has no content pages, if the cover page yields no text, or
        if no parsable date is found on the cover page. In these cases no
        output file is written.
    """

    # Find
    print('>> %s'%in_fname)
    # NOTE(review): in_dir and in_fname are concatenated without a path separator,
    # so one of them presumably already carries it - confirm against the 'fname'
    # column of the info table before switching to os.path.join.
    in_fullpath = '%s%s'%(in_dir, in_fname)

    # Read (the context manager makes sure the PDF is closed on every exit path)
    with pdfplumber.open(in_fullpath) as pdf:
        cover_page = pdf.pages[0]
        content_pages = pdf.pages[2:]
        if not content_pages:
            raise ValueError('No content pages (PDF has fewer than 3 pages): %s'%in_fname)

        ### 2. Determine number of columns in page somewhere in the middle
        n_content_pages = len(content_pages)
        if n_content_pages > 5:
            # Just past the middle: avoids the first page and never exceeds the last index
            sample_page = content_pages[int(n_content_pages/2+2)]
        elif n_content_pages > 1:
            sample_page = content_pages[1]
        else:
            sample_page = content_pages[0]

        n_cols = _count_text_columns(sample_page)
        print('Number of columns: %i'%n_cols)

        ### 3. Extract text with appropriate x tolerance
        # The column boxes are derived from the sample page and reused for every page
        sample_width = float(sample_page.width)
        sample_height = float(sample_page.height)
        left_bbox = (0, 0, 0.5*sample_width, sample_height)
        right_bbox = (0.5*sample_width, 0, sample_width, sample_height)

        xtol = 3
        while True:
            print('Reading pdf...')
            verslag_text = _extract_content_text(content_pages, n_cols, xtol, left_bbox, right_bbox)

            # Check mean word length and repeat if necessary: very long "words"
            # suggest that spaces were lost during extraction
            mean_word_length = len(verslag_text) / len(verslag_text.split(' '))
            if mean_word_length <= 40:
                print('Continuing with mean word length = %.2f.'%mean_word_length)
                break
            if xtol == 1: # If the xtol is already at 1, skip this file
                print('Mean word length = %.2f, still too long. Skipping doc...'%mean_word_length)
                return
            # Otherwise lower the xtol to 1
            print('Mean word length = %.2f, lowering space x tolerance to 1.'%mean_word_length)
            xtol = 1

        ### 4. Page 1: find the first date after "astgesteld" => extract date
        cover_page_text = cover_page.extract_text(x_tolerance = xtol)
        if cover_page_text is None:
            raise ValueError('No text could be extracted from the cover page: %s'%in_fname)
        cover_page_text = cover_page_text.replace('\n', ' ')

        # Skip 30 characters past the marker; if the marker is absent, search the
        # whole cover text instead of starting at an arbitrary offset.
        marker_pos = cover_page_text.find('astgesteld')
        search_start = marker_pos + 30 if marker_pos != -1 else 0
        after_vastgesteld = cover_page_text[search_start:]

        # The wording around the date varies (e.g. "heeft op donderdag 25 mei 2016
        # overleg..." or "hebben op 4 juni gesprekken gevoerd..."), so only the
        # "<day> <word> <4-digit year>" pattern itself is matched.
        date_match = re.search(r'\s+\d\d?\s+\w+\s+\d\d\d\d', after_vastgesteld)
        if date_match is None:
            raise ValueError('No date found on the cover page: %s'%in_fname)
        date_formatted = dateparser.parse(date_match.group(0))
        if date_formatted is None:
            raise ValueError('Could not parse date %r: %s'%(date_match.group(0), in_fname))
        date_string = date_formatted.strftime('%Y.%m.%d')
        print('Datum vergadering: %s'%date_string)

    out_fullpath = out_dir + "\\%s.txt"%(out_fname)

    ### 5. Save (the context manager closes the file even if writing fails)
    with open(out_fullpath, "w", encoding='utf-8') as text_file:
        n = text_file.write(verslag_text)
    print('Wrote text file with n = %i'%n)

# Try single file

In [None]:
# Draw one random row from the info table and show it
file = file_info.sample(n=1).iloc[0]
file

In [None]:
# Input file name as stored in the table, and output name assembled from the
# document number (zero-padded to 4 digits), id, debate date and short name
in_fname = file['fname']
out_fname = 'doc%04d_%s_%s_%s' % tuple(
    file[column] for column in ('doc', 'doc_id', 'debate_date', 'short_name')
)
print(in_dir, '\n', in_fname, '\n')
print(out_dir, '\n', out_fname)

In [None]:
# Convert the single selected document
write_text_file(in_dir=in_dir, in_fname=in_fname, out_dir=out_dir, out_fname=out_fname)

# Loop over many files

In [None]:
# Number of rows (documents) in the info table
len(file_info)

In [None]:
# Get files
# Loop over every row of the info table rather than a hard-coded count, so the
# loop can neither index past the end of the table nor stop before its last row.
for file_id in tqdm.tqdm(range(file_info.shape[0])):
    print('File ID #%i'%file_id)
    file = file_info.iloc[file_id,:]
    in_fname = file['fname']
    out_fname = 'doc%04d_%s_%s_%s'%(file['doc'], file['doc_id'], file['debate_date'], file['short_name'])
    try:
        write_text_file(in_dir, in_fname, out_dir, out_fname)
    except Exception as err:
        # `except Exception` (instead of a bare except) keeps the best-effort
        # skipping but still lets KeyboardInterrupt stop the loop.
        print('*** SKIPPING this file with ID %i, some scraping problem could not be solved'%file_id)
        print('    Reason: %s: %s'%(type(err).__name__, err))