In [1]:
# Uncomment these lines if you are running the notebook on Google Colab.
# ! git init
# ! git remote add origin https://github.com/jose-lopez/lemmas_finder.git
# ! git pull origin master

In [2]:
! pip install webdriver-manager
! pip install selenium
! pip install pandas



In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from urllib.parse import quote
from os import path
from pathlib import Path
import os
import pandas as pd
from numpy.core.numeric import nan
import re

In [3]:
# If Google Chrome is not installed, this function takes care of installing it.

def install_browser():
    """Make sure Google Chrome is installed, installing the stable .deb package if it is not.

    The check runs ``google-chrome --version``. When that prints a version the
    function only reports it. Otherwise the package is downloaded with ``wget``
    and installed with ``apt-get`` (this assumes a Debian/Ubuntu host and enough
    privileges to install packages -- confirm for your environment).

    Raises:
        RuntimeError: if the download or the installation command cannot be
            started or finishes with a non-zero exit status.
    """
    # Function-scope import: nothing else in the notebook needs subprocess.
    import subprocess

    deb_url = 'https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb'
    deb_file = './google-chrome-stable_current_amd64.deb'

    print('Checking Google Chrome installation....' + "\n")

    try:
        probe = subprocess.run(['google-chrome', '--version'], capture_output=True, text=True)
        version = probe.stdout.strip()
    except OSError:
        # The executable is not on PATH or cannot be launched.
        version = ''

    if version:
        print(f'Google Chrome version: {version}' + "\n")
        return

    print('... Installing Google Chrome' + "\n")

    commands = (
        # -O overwrites a previous download instead of creating "<name>.1".
        ['wget', '-q', '-O', deb_file, deb_url],
        # -y answers the confirmation prompt; there is no terminal to answer it.
        ['apt-get', 'install', '-y', deb_file],
    )

    for command in commands:
        try:
            result = subprocess.run(command, capture_output=True, text=True)
        except OSError as exc:
            raise RuntimeError(f'Could not run {" ".join(command)}: {exc}') from exc

        if result.stdout:
            print(result.stdout)

        # A failed command only reports through its exit status, so check it
        # explicitly instead of assuming success.
        if result.returncode != 0:
            raise RuntimeError(
                f'Command {" ".join(command)} failed with exit status '
                f'{result.returncode}: {result.stderr}'
            )

Google Chrome version: Google Chrome 115.0.5790.98 




In [5]:
# This function returns the Google Chrome browser used below to open
# and scrape an HTML page from a given URL.

def get_browser():
    """Create and return a Chrome WebDriver configured with the notebook's flags.

    The driver binary is obtained through ChromeDriverManager and Chrome is
    started with the --no-sandbox, --disable-dev-shm-usage and --headless=new
    command-line arguments.
    """
    options = webdriver.ChromeOptions()

    # Flags are added in the same order as before.
    for flag in ('--no-sandbox', '--disable-dev-shm-usage', "--headless=new"):
        options.add_argument(flag)

    driver_service = ChromeService(ChromeDriverManager().install())

    return webdriver.Chrome(service=driver_service, options=options)

In [6]:
# This is the main method. Given a token, a possible lemma is searched for it.

def get_lemma(browser, file, line, token, logs):
    """Look up a lemma for ``token`` on the Logeion morphology page.

    Args:
        browser: Selenium WebDriver used to load and query the page.
        file: Name of the file being processed (used only in log messages).
        line: Row index of the token in that file (used only in log messages).
        token: The word form to look up.
        logs: Writable text file object that receives one line per problem.

    Returns:
        The text of the first ``a.ng-binding`` element on the page, or ``nan``
        when the token is not a string, the page does not finish loading, the
        page reports 'Could not find the search term', no candidate element
        exists, or the candidate contains Latin letters or digits.

    Errors other than the ones handled below (for example a failing
    ``browser.get``) propagate to the caller.
    """
    # A missing token arrives as NaN (a float); quote() only accepts text.
    if not isinstance(token, str):

        logs.write(f'Invalid (non-text) token in File: {file} at line: {line}, token {token}' + "\n")

        return nan

    url_base = "https://logeion.uchicago.edu/morpho/"

    url = url_base + quote(token)

    browser.get(url)  # navigate to URL

    try:

        # Wait until the page has rendered its "Short Definition" heading.

        WebDriverWait(browser, 10).until(EC.text_to_be_present_in_element((By.TAG_NAME, "h3"), "Short Definition"))

    except NoSuchElementException:

        logs.write(f'An exception of type NoSuchElementException in File: {file} at line: {line}, token {token}' + "\n")

        return nan

    except TimeoutException:

        logs.write(f'An exception of type TimeoutException in File: {file} at line: {line}, token {token}' + "\n")

        return nan

    # The page shows this message when it has no entry for the token.
    if browser.find_elements(By.XPATH, "//*[contains(text(), 'Could not find the search term')]"):

        return nan

    # find_elements returns an empty list instead of raising, so a page
    # without a candidate link is logged rather than crashing the lookup.
    candidates = browser.find_elements(By.CSS_SELECTOR, 'a.ng-binding')

    if not candidates:

        logs.write(f'No lemma element found in File: {file} at line: {line}, token {token}' + "\n")

        return nan

    possible_lemma = candidates[0].text

    # A candidate containing Latin letters or digits is rejected.
    if re.search(r'[a-zA-Z0-9]+', possible_lemma):

        return nan

    return possible_lemma

In [7]:
# A method to check basic errors in a token and its lemma.

def check_warning(token, lemma):
    """Classify basic problems in a token and its lemma.

    A value is considered invalid when it is a string containing at least one
    Latin letter or digit. Non-string values (for example the NaN pandas uses
    for an empty cell, whatever object represents it) are never invalid.

    Args:
        token: The token read from the input file.
        lemma: The lemma read from the input file, or a missing-value marker.

    Returns:
        None when neither value is invalid, 1 when only the token is invalid,
        2 when only the lemma is invalid, 3 when both are invalid.
    """

    def has_latin_or_digit(value):
        # The isinstance test replaces an identity comparison against one
        # specific NaN object, which let other NaN objects reach re.search.
        return isinstance(value, str) and re.search(r'[a-zA-Z0-9]+', value) is not None

    invalid_token = has_latin_or_digit(token)

    invalid_lemma = has_latin_or_digit(lemma)

    if invalid_token and invalid_lemma:

        return 3

    if invalid_lemma:

        return 2

    if invalid_token:

        return 1

    return None

In [8]:
# Setting the main folders

# Output folders created under the root folder; the corpus folder holds the input files.
folders = ['processed', 'warnings', 'logs']

root = "./text/"

corpus = root + "corpus"

for folder in folders:

  _path = root + folder

  # makedirs also creates a missing root folder and does not fail when the
  # folder already exists, so the cell can be re-run safely.
  os.makedirs(_path, exist_ok=True)

In [9]:
# Checking if we have the corpus's files ready to go.
! ls -l ./text/corpus

total 444
-rw-r--r-- 1 jose-lopez jose-lopez 452366 ago  8 22:08 aeschylus_i.csv


In [10]:
# Make sure Google Chrome is available before a browser instance is created.
# install_browser() reports the installed version or, when none is found,
# tries to install the browser.

install_browser()

In [None]:
# Get a browser instance. It is used below to open URLs and to scrape
# lemmas from the resulting HTML pages.

browser = get_browser()

In [11]:
"""

This cell processes the *.csv files located in the corpus folder, 
producing updated versions of these, stored in a folder called
"processed". Files with warnings and logs are also generated for
each *.csv file.

A "warnings" file reports possible syntactical errors
in the tokens and lemmas of the input files.

A "logs" file, on the other hand, reports problems found when trying to access a URL
while looking up a lemma for a given token.

The "processed" folder also includes files listing all the new lemmas
found for each token in each one of the input files.

"""

# Every *.csv file under the corpus folder is processed in turn.
files = [str(x) for x in Path(corpus).glob("**/*.csv")]

files_to_process = len(files)

processed_files = 0

for file in files:

  # Path handles path separators portably and keeps dots that are part of
  # the base name (only the final extension is removed for the reports).
  file_name = "/" + Path(file).name

  file_root_name = "/" + Path(file).stem

  processed_files += 1

  processed_file = root + folders[0] + file_name

  warnings_file = root + folders[1] + file_root_name + "_warnings" + ".csv"

  logs_file = root + folders[2] + file_root_name + "_logs"  + ".csv"

  # Reset for every file so a report never contains rows from earlier files.
  warnings_in_file = []

  input_df = pd.read_csv(file)

  # A lemma column with no values at all is read as float64; object dtype
  # lets the string lemmas found below be stored in it.
  input_df["lemma"] = input_df["lemma"].astype(object)

  print(f'Getting lemmas for {file} file: {processed_files} | {files_to_process}' + "\n")

  # The with-block closes the log file even when a lookup fails or the run
  # is interrupted.
  with open(logs_file, 'w', encoding="utf8") as logs:

    for x in input_df.index:

      token = input_df.loc[x, "token"]

      lemma = input_df.loc[x, "lemma"]

      warning = check_warning(token, lemma)

      if warning:

        warnings_in_file.append([x, token, lemma, warning])

      # pd.isna recognises every missing-value marker, not only one
      # particular NaN object.
      if pd.isna(lemma):

        lemma = get_lemma(browser, file, x, token, logs)

        print(f'token = {token}   lemma = {lemma}' + "\n")

        input_df.loc[x, "lemma"] = lemma

  # NOTE(review): to_csv also writes the DataFrame index as an extra first
  # column; kept as is -- confirm whether downstream readers expect it.
  input_df.to_csv(processed_file)

  # Build the warnings file for the current file, if there are any warnings.

  if warnings_in_file:

    print(f'Warnings found for {file} file. A report in {warnings_file}')

    warnings_df = pd.DataFrame(warnings_in_file, columns=['line', 'token', 'lemma', 'error_type'])

    warnings_df.to_csv(warnings_file)

print('..... done')

Getting lemmas for text/corpus/aeschylus_i.csv file: 1 | 1

token = ἀρθέντʼ   lemma = ἀραρίσκω

token = προστομίων   lemma = προστόμιον

token = λεπτοψαμάθων   lemma = λεπτοψάμαθος

token = Δίαν   lemma = Δῖος

token = λιποῦσαι   lemma = λείπω



KeyboardInterrupt: 