# 0.0 Packages Import

In [None]:
import os
from dotenv import load_dotenv

import re

import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# 1.0 Define Parameters

### 1.1 Load Environment Parameters

In [None]:
# Load environment variables via python-dotenv (by default from a .env file).
load_dotenv()
# Path to the chromedriver binary, later handed to webdriver.ChromeService in scraper().
# os.getenv returns None when CHROMEDRIVER_PATH is not set.
chromedriver_path = os.getenv("CHROMEDRIVER_PATH")

### 1.2 Create a List of target URLs 

In [None]:
def url_list(url1,url2 = None):
    """Build the list of hearing-page URLs to scrape.

    Parameters
    ----------
    url1 : str
        URL of the first hearing page; its last path segment must be the
        numeric page ID (e.g. ``.../ns/55/ID/10602``).
    url2 : str, optional
        URL of the last hearing page (inclusive). When omitted, only
        ``url1`` is returned.

    Returns
    -------
    list of str
        ``[url1]`` when ``url2`` is None, otherwise one URL per ID in the
        inclusive range ``[id(url1), id(url2)]``. The list is empty when
        the first ID is greater than the last one.

    Raises
    ------
    ValueError
        If the last path segment of either URL is not an integer.
    """
    if url2 is None:
        print('Number of URLs to scrape: 1')
        return [url1]

    # Derive the base from url1 instead of hard-coding one assembly, so
    # ranges for assemblies other than 55 are not silently rewritten.
    url_base, first_id = url1.rstrip('/').rsplit('/', 1)
    last_id = url2.rstrip('/').rsplit('/', 1)[1]
    url_base += '/'

    urls = [url_base + str(page_id) for page_id in range(int(first_id), int(last_id) + 1)]

    print('Number of URLs to scrape: ' + str(len(urls)))
    return urls

In [26]:
# First and last hearing pages of the range of interest.
url1 = 'https://www.parliament.bg/bg/plenaryst/ns/55/ID/10602'
url2 = 'https://www.parliament.bg/bg/plenaryst/ns/55/ID/10729'


# Quick check: called with a single argument, url_list returns just that URL.
url_list(url1)

Number of URLs to scrape: 1


['https://www.parliament.bg/bg/plenaryst/ns/55/ID/10602']

### 1.3 Use Regular Expressions to find the end of Parliament Hearing

In [None]:
#Regular Expressions to parse different Elements from Corpus

def end_of_hearing(text):
    """Look for the closing marker of a hearing transcript.

    The marker is a time stamp of the form ``dd,dd ч.)`` followed by two to
    four newlines and a capitalised Cyrillic word.

    Returns the first ``re.Match`` found in ``text``, or ``None`` when the
    marker is absent.
    """
    return re.search(r'\d\d,\d\d\sч.\)\n{2,4}[А-Я][а-я]+', text)

def end_position_hearing(text):
    """Return the offset in ``text`` where the end-of-hearing marker starts.

    The marker is a time stamp of the form ``dd,dd ч.)`` followed by two to
    four newlines and a capitalised Cyrillic word.

    Raises
    ------
    ValueError
        If ``text`` contains no end-of-hearing marker. Previously this
        surfaced as an opaque ``AttributeError`` on ``None``.
    """
    end_of_hearing_pattern = re.compile(r'\d\d,\d\d\sч.\)\n{2,4}[А-Я][а-я]+')
    match = end_of_hearing_pattern.search(text)
    if match is None:
        raise ValueError('No end-of-hearing marker found in text')
    return match.start()

# 2.0 Scraping with Selenium

### 2.1 Set up Webdriver 

In [None]:
# Set up the crawler with an explicit-wait strategy, fetch the corpus of each page and collect the results in a dict

def scraper (urls, explicit_wait_seconds = 10, poll_frequency = 2):
    """Open every URL in Chrome and collect the text of the ``#app`` element.

    For each URL the function waits for a primary XPath; if that fails it
    waits for a fallback XPath. A page counts as successful only when its
    text contains the end-of-hearing marker (see ``end_of_hearing``).

    Parameters
    ----------
    urls : list of str
        Pages to visit.
    explicit_wait_seconds : int or float
        Timeout passed to ``WebDriverWait`` for each wait.
    poll_frequency : int or float
        Poll interval passed to ``WebDriverWait``.

    Returns
    -------
    dict
        ``texts`` and ``successful_urls`` (parallel lists), plus
        ``unsuccessful_urls`` and ``unsuccessful_messages`` (parallel lists).
    """
    primary_xpath = '/html/body/div/main/div/div/div[2]/div[1]/div/div[3]'
    fallback_xpath = '//*[@id="app"]/main/div/div/div[2]/div[1]/div/div[2]'

    texts = []
    successful_urls = []
    unsuccessful_urls = []
    unsuccessful_messages = []

    cService = webdriver.ChromeService(chromedriver_path)
    driver = webdriver.Chrome(service = cService)

    def wait_and_read(xpath):
        # Block until the element at `xpath` is present, then return the page text.
        WebDriverWait(driver, explicit_wait_seconds, poll_frequency).until(
            EC.presence_of_element_located((By.XPATH, xpath)))
        return driver.find_element(By.ID, 'app').text

    try:
        for url in urls:

            driver.get(url)

            try:
                corpus_text = wait_and_read(primary_xpath)
                empty_message = 'Initial Xpath located for Url but corpus is empty'
            except Exception:
                try:
                    corpus_text = wait_and_read(fallback_xpath)
                    empty_message = 'Error triggered: Initial Xpath was not located and no corpus was found'
                except Exception as error:
                    # Previously a failing fallback wait escaped the handler and
                    # aborted the whole run, losing every result gathered so far.
                    unsuccessful_urls.append(url)
                    unsuccessful_messages.append(
                        'Error triggered: neither Xpath was located (' + type(error).__name__ + ')')
                    continue

            if end_of_hearing(corpus_text) is None:
                unsuccessful_urls.append(url)
                unsuccessful_messages.append(empty_message)
            else:
                successful_urls.append(url)
                texts.append(corpus_text)
    finally:
        # Always close the browser, even when a page load raises.
        driver.quit()

    scraper_dict = {
        'texts' : texts,
        'successful_urls' : successful_urls,
        'unsuccessful_urls' : unsuccessful_urls,
        'unsuccessful_messages' : unsuccessful_messages
    }

    # Guard against an empty URL list, which used to raise ZeroDivisionError.
    success_rate = round(100*len(successful_urls)/len(urls),2) if len(urls) else 0.0
    print('Number of scraped URLs: ' + str(len(successful_urls)) + ' (' + str(success_rate) +'% Success)' )

    return scraper_dict

In [None]:

#Get Basic Attributes, which will be added to each statement (Assembly Number, Hearing Number, Date)

def general_parser (text):
    """Extract the attributes shared by every statement of one hearing.

    Parameters
    ----------
    text : str
        Full text of a hearing page.

    Returns
    -------
    dict
        ``assembly`` and ``hearing`` (title-cased header matches) and
        ``date`` formatted as ``YYYY.MM.DD``.

    Raises
    ------
    ValueError
        If the assembly header, the hearing header or the date cannot be
        found in ``text``.
    """
    assembly_pattern = re.compile(r'[А-Я]+\sИ\s[А-Я]+\sНАРОДНО\sСЪБРАНИЕ|[А-Я]+\sНАРОДНО\sСЪБРАНИЕ')
    session_pattern = re.compile(r'[А-Я]+\sИ\s[А-Я]+\sСЕСИЯ|[А-Я]+\sСЕСИЯ')
    hearing_pattern = re.compile(r'([А-Я]+\sИ\s[А-Я]+\s[А-Я]+\sЗАСЕДАНИЕ|[А-Я]+\sИ\s[А-Я]+\sЗАСЕДАНИЕ|[А-Я]+\s[А-Я]+\sЗАСЕДАНИЕ|[А-Я]+\sЗАСЕДАНИЕ)')
    pattern_date = re.compile(r'(\d{2}).(\d{2}).(\d{4})') # find the date of the session

    assembly_matches = assembly_pattern.search(text)
    if assembly_matches is None:
        raise ValueError('No assembly header found in text')

    header_end = assembly_matches.end()
    hearing_matches = hearing_pattern.search(text, header_end)
    session_matches = session_pattern.search(text, header_end)

    # When a session line sits between the assembly and hearing headers, the
    # hearing pattern can swallow its last word (e.g. "СЕСИЯ\nПЪРВО ЗАСЕДАНИЕ").
    # In that case search again from the end of the session line. The original
    # code intended this but its sentinel check never fired.
    if (hearing_matches is not None and session_matches is not None
            and session_matches.start() < hearing_matches.end()):
        after_session = hearing_pattern.search(text, session_matches.end())
        if after_session is not None:
            hearing_matches = after_session

    if hearing_matches is None:
        raise ValueError('No hearing header found in text')

    matches_date = pattern_date.search(text)
    if matches_date is None:
        raise ValueError('No date found in text')

    assembly = assembly_matches.group().title()
    hearing = hearing_matches.group().title()

    day, month, year = matches_date.group(1, 2, 3)
    date = year + '.' + month + '.' + day

    general_info_dict = {

        'assembly': assembly,
        'hearing': hearing,
        'date': date
                         }

    return general_info_dict

#Get Statements, politicians first and last names and political parties

def statements_parser (text,url):
    """Split a hearing transcript into one record per speaker statement.

    A statement header is an upper-case name (optionally preceded by an
    upper-case role word) followed either by ``:`` or by a parenthesised
    annotation and ``:``. The statement body runs from the end of its header
    to the start of the next header; the last body stops at the offset
    returned by ``end_position_hearing``.

    Parameters
    ----------
    text : str
        Full text of a hearing page.
    url : str
        Page URL, repeated on every record.

    Returns
    -------
    dict of lists
        Column name -> values, all lists having one entry per statement.
    """
    speaker_regex = re.compile(r'([А-Я]+\s)?([А-Я]+\s)([А-Я]+)(:|\s\((.+)\):)') # find  first name + last name + political party + statement
    speaker_matches = speaker_regex.finditer(text)
    header_info = general_parser(text)

    first_names = []
    last_names = []
    raw_parties = []
    assembly_roles = []
    header_starts = []
    header_ends = []

    for found in speaker_matches:
        role_word = found.group(1)

        first_names.append(found.group(2).title())
        last_names.append(found.group(3).title())
        raw_parties.append(found.group(4))
        header_ends.append(found.end())
        header_starts.append(found.start())
        # Headers without a leading role word belong to ordinary speakers.
        assembly_roles.append('Политик' if role_word is None else role_word.title())

    statement_count = len(first_names)

    # Hearing-level attributes are simply repeated for every statement.
    assemblies = [header_info.get('assembly')] * statement_count
    hearings = [header_info.get('hearing')] * statement_count
    dates = [header_info.get('date')] * statement_count
    urls = [url] * statement_count

    closing_offset = end_position_hearing(text)

    statements = []
    statement_starts = []
    statement_ends = []

    for position, body_start in enumerate(header_ends):
        is_last = position == statement_count - 1
        body_end = closing_offset if is_last else header_starts[position + 1]

        statement_starts.append(body_start)
        statement_ends.append(body_end)

        body = text[body_start:body_end]
        if is_last:
            # Only the final statement has opening parentheses removed.
            body = body.replace('(', '')
        statements.append(body.replace('\n', ' ').strip())

    political_parties = []
    speaking_locations = []
    drop_marks = str.maketrans('', '', '():')

    for raw in raw_parties:

        if raw == ':':
            party_name, location = 'Председателски Орган', 'От Трибуната'
        elif ', от' in raw:
            pieces = raw.translate(drop_marks).strip().split(', ')
            party_name, location = pieces[0], pieces[1].title()
        elif 'встрани от микрофоните' in raw:
            party_name, location = '', 'От Място'
        else:
            party_name, location = raw.translate(drop_marks).strip(), 'От Трибуната'

        political_parties.append(party_name)
        speaking_locations.append(location)

    return {
        'Народно Събрание': assemblies,
        'Заседание': hearings,
        'Дата': dates,
        'Позиция в Парламента': assembly_roles,
        'Първо Име': first_names,
        'Фамилно Име': last_names,
        'Партия': political_parties,
        'Говорил От': speaking_locations,
        'Изказване': statements,
        'Начална Позиция на Изказване': statement_starts,
        'Крайна Позиция на Изказване': statement_ends,
        'Линк към изказване': urls
    }

#Create Subfolders and save the ready CSV Files there

def path_exists (path):
    """Check that ``path`` is an existing directory.

    Returns the string ``'true'`` when it is, otherwise the message
    ``'The specified directory path does not exist!'``. Callers compare the
    result against ``'true'``.

    The check no longer changes the process working directory, which the
    previous ``os.chdir`` probe did as a side effect.
    """
    try:
        if os.path.isdir(path):
            return 'true'
    except (TypeError, ValueError):
        # e.g. path is None or of an unsupported type: treat as "does not exist".
        pass
    return 'The specified directory path does not exist!'

def main_folder_creator (directory_path):
    """Create the ``Scraper Results`` folder tree inside ``directory_path``.

    Ensures ``Scraper Results`` exists together with its two sub-folders
    ``Failed Reports`` and ``Hearings``. Folders that already exist are left
    untouched; other creation errors are printed, not raised.

    Unlike the previous version, each sub-folder is created independently
    (an existing ``Failed Reports`` no longer prevents ``Hearings`` from
    being created) and the process working directory is not changed.

    Raises
    ------
    FileNotFoundError
        If ``directory_path`` is not an existing directory.
    """
    directory_name = 'Scraper Results'

    if not os.path.isdir(directory_path):
        raise FileNotFoundError(f"Directory '{directory_path}' does not exist")

    #create main folder directory
    results_path = os.path.join(directory_path, directory_name)

    try:
        os.mkdir(results_path)
        print(f"Directory of scraper Results'{results_path}' successfully created.")
    except FileExistsError:
        pass
    except Exception as e:
        print(f"An error occurred: {e}")

    #create subfolders Hearings and Failed Reports
    for subfolder in ('Failed Reports', 'Hearings'):
        try:
            os.mkdir(os.path.join(results_path, subfolder))
        except FileExistsError:
            pass
        except Exception as e:
            print(f"An error occurred: {e}")

#create subfolder for each assembly

def sub_folder_creator (assembly,directory_path):
    """Create the per-assembly folder under ``Scraper Results/Hearings``.

    Parameters
    ----------
    assembly : str
        Folder name to create (the assembly label of a hearing).
    directory_path : str
        Directory that contains the ``Scraper Results`` tree.

    An already existing folder is left untouched; other creation errors are
    printed, not raised. The process working directory is not changed.

    Raises
    ------
    FileNotFoundError
        If ``<directory_path>/Scraper Results/Hearings`` does not exist.
    """
    hearings_path = os.path.join(directory_path, 'Scraper Results','Hearings')

    if not os.path.isdir(hearings_path):
        raise FileNotFoundError(f"Directory '{hearings_path}' does not exist")

    assembly_path = os.path.join(hearings_path, assembly)

    try:
        os.mkdir(assembly_path)
        print(f"Directory '{assembly_path}' successfully created.")
    except FileExistsError:
        pass
    except Exception as e:
        print(f"An error occurred: {e}")

def save_df (statements_dict, directory_path):
    """Write one hearing's statements to a CSV file.

    The file is saved as
    ``<directory_path>/Scraper Results/Hearings/<assembly>/<date>.csv`` where
    ``<assembly>`` and ``<date>`` are the first entries of the
    ``'Народно Събрание'`` and ``'Дата'`` columns. The target folder must
    already exist.

    The path is built with ``os.path.join`` instead of hard-coded
    backslashes, so it also resolves correctly on non-Windows systems.

    Parameters
    ----------
    statements_dict : dict of lists
        Column name -> values, as returned by ``statements_parser``.
    directory_path : str
        Directory that contains the ``Scraper Results`` tree.
    """
    df = pd.DataFrame.from_dict(statements_dict)
    assembly = statements_dict.get('Народно Събрание')[0]
    date = statements_dict.get('Дата')[0]
    save_path = os.path.join(directory_path, 'Scraper Results', 'Hearings', assembly, f'{date}.csv')
    df.to_csv(save_path, encoding='utf-8-sig')

#Create a function Iterating through texts and mapping texts to CSV and subsequently saving them

def parser (scraper_dict,directory_path):
    """Parse every scraped text, save it as CSV and write a failure report.

    For each (text, url) pair the statements are parsed, the assembly
    sub-folder is created and the CSV is saved. URLs whose parsing or saving
    fails are collected together with the scraping failures and written to
    ``<directory_path>/Scraper Results/Failed Reports/Failed_Report_<n>.csv``,
    where ``<n>`` is the number of parsing failures.

    Parameters
    ----------
    scraper_dict : dict
        Result of ``scraper``: keys ``texts``, ``successful_urls``,
        ``unsuccessful_urls`` and ``unsuccessful_messages``.
    directory_path : str
        Existing directory in which the ``Scraper Results`` tree is created.
    """
    texts = scraper_dict.get('texts')
    urls = scraper_dict.get('successful_urls')
    failed_scraping_urls = scraper_dict.get('unsuccessful_urls')
    failed_scraping_messages = scraper_dict.get('unsuccessful_messages')


    failed_mapping_urls = []
    failed_mapping_messages = []
    main_folder_creator (directory_path)

    for text,url in zip(texts,urls):

        try:
            statements_dict = statements_parser(text,url)
            assembly = statements_dict.get('Народно Събрание')[0]
            sub_folder_creator(assembly,directory_path)
            save_df(statements_dict,directory_path)

        except Exception as error:
            # Keep going, but record why this URL failed instead of discarding the cause.
            failed_mapping_urls.append(url)
            failed_mapping_messages.append('The parsing failed: ' + type(error).__name__ + ': ' + str(error))


    done_count = len(urls) - len(failed_mapping_urls)
    # No scraped texts at all used to raise ZeroDivisionError here.
    success_rate = str(round(100*done_count / len(texts),2)) if len(texts) else '0.0'

    print('Parsed and Saved ' + str(done_count) + ' Texts (' + success_rate + '% Success)' )

    failed_urls = failed_mapping_urls + failed_scraping_urls
    failed_messages = failed_mapping_messages + failed_scraping_messages

    failed_dict = { 'Url': failed_urls,
                    'Message': failed_messages
    }

    df_failed = pd.DataFrame.from_dict(failed_dict)
    # os.path.join avoids the hard-coded (and invalidly escaped) backslash separators.
    failed_path = os.path.join(
        directory_path, 'Scraper Results', 'Failed Reports',
        'Failed_Report_' + str(len(failed_mapping_urls)) + '.csv')
    df_failed.to_csv(failed_path, encoding='utf-8-sig')


In [37]:
#Combine All Methods

def parliament_scraper (url1,url2,directory_path, explicit_wait_seconds = 10, poll_frequency = 2):
    """Run the whole pipeline: validate, build URLs, scrape, parse and save.

    Parameters
    ----------
    url1 : str
        URL of the first hearing page.
    url2 : str or None
        URL of the last hearing page (inclusive); None scrapes only ``url1``.
    directory_path : str
        Existing directory in which the ``Scraper Results`` tree is written.
    explicit_wait_seconds : int or float, optional
        Wait timeout forwarded to ``scraper`` (default 10).
    poll_frequency : int or float, optional
        Poll interval forwarded to ``scraper`` (default 2).

    Prints a message and does nothing else when ``directory_path`` is not a
    valid directory.
    """
    if path_exists(directory_path) != 'true':
        print('Invalid folder directory provided. Check directory_path variable')
        return

    urls = url_list(url1,url2)
    scraper_dict = scraper(urls, explicit_wait_seconds, poll_frequency)
    parser(scraper_dict,directory_path)

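`parliament_scraper` combines all of the previous functions into a single pipeline: it validates the output directory, builds the URL list, scrapes the pages, and then parses and saves the results.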

In [None]:
# NOTE(review): machine-specific absolute Windows path; the recorded output of
# this cell shows it was rejected as invalid on the machine that ran it.
directory_path = r'C:\Users\ivank\Desktop'

# First and last hearing pages (inclusive) of the range to scrape.
url1 = 'https://www.parliament.bg/bg/plenaryst/ns/55/ID/10602'
url2 = 'https://www.parliament.bg/bg/plenaryst/ns/55/ID/10729'

# NOTE(review): these two values are not passed to parliament_scraper below,
# so scraper() runs with its own defaults (which happen to be the same values).
explicit_wait_seconds = 10
poll_frequency = 2


parliament_scraper(url1, url2, directory_path)

Invalid folder directory provided. Check directory_path variable
