## Web Scraping to get sentences in Brazilian Sign Language and their translations into Portuguese.

Website: <https://www.ines.gov.br/dicionario-de-libras/>

### Import libraries

In [1]:
import csv
import io
import logging as logger
import os
import random
import string
import sys
import time

import pandas as pd
import requests
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Make sure /usr/bin/ is on PATH. Python does not expand "$PATH" inside a
# string, so the entry is appended to the current value explicitly; the
# membership check keeps the cell idempotent when it is re-run.
_current_path = os.environ.get("PATH", "")
if "/usr/bin/" not in _current_path.split(os.pathsep):
    os.environ["PATH"] = os.pathsep.join(filter(None, [_current_path, "/usr/bin/"]))

# Send timestamped INFO-level log records to stdout so they show up in the
# notebook cell output.
logger.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logger.INFO,
    stream=sys.stdout
)

### Function to get the chrome driver

In [2]:
def get_chrome_driver():
    """
    Initialize a Chrome driver with a custom download dir.

    The chromedriver binary is located (and downloaded when missing) by
    ``ChromeDriverManager``.

    Returns:
        selenium.webdriver.Chrome: a new browser session. The caller owns it
        and is responsible for calling ``quit()`` when done.
    """

    chrome_options = webdriver.ChromeOptions()

    # "prefs" only carries Chrome profile preferences, so it holds the
    # download directory and nothing else.
    prefs = {"download.default_directory": os.path.abspath('/usr/bin/')}
    chrome_options.add_experimental_option("prefs", prefs)

    # These two are Chrome command-line switches, not preferences; they only
    # take effect when passed as arguments.
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--incognito")

    # Selenium 4 takes the driver binary through a Service object and the
    # options through `options=`; the older `executable_path=` and
    # `chrome_options=` keywords are no longer accepted by recent releases.
    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=chrome_options)

### Get the chrome driver

In [None]:
# Access the page once as a smoke test: confirms that Chrome, chromedriver and
# the dictionary site are reachable before the long scraping run starts.
driver = get_chrome_driver()
try:
    wait = WebDriverWait(driver, 10)
    driver.get("https://www.ines.gov.br/dicionario-de-libras/")
    time.sleep(2)  # fixed pause after navigation (presumably to let the page finish loading)
finally:
    # The scraping loop below opens its own browser for every letter, so this
    # session has to be shut down here; otherwise it stays open for the life
    # of the kernel.
    driver.quit()

### Alphabet List
We will use this list to get the data from the website: for each letter, we will extract the set of entries listed under it.

In [97]:
# The 26 uppercase ASCII letters, one list item per letter.
alphabet_list = [*string.ascii_uppercase]

### Running the process to access the website and extract each feature

In [None]:
# Column order of the final table; also gives an empty result a proper header.
LIBRAS_DICTIONARY_COLUMNS = [
    "word",
    "subject",
    "interpretation",
    "example_portuguese_sentence",
    "example_libras_sentence",
    "grammar_class",
    "word_origin",
    "video_link",
    "image_link",
    "hand_image_link",
]


def _element_text(driver, element_id):
    """Return the rendered text of the element whose id is ``element_id``."""
    return driver.find_element(By.XPATH, f'//*[@id="{element_id}"]').text


def _media_src(driver, media_xpath):
    """Return the ``src`` attribute of the element at ``media_xpath``, or "UNAVAILABLE" when it is absent."""
    try:
        return driver.find_element(By.XPATH, media_xpath).get_attribute("src")
    except NoSuchElementException:
        return "UNAVAILABLE"


# One dict per scraped word. Rows are collected in a plain list and turned
# into a DataFrame once, instead of re-copying the whole frame on every row.
libras_records = []

try:
    for letter in alphabet_list:

        # A fresh browser session is used for every letter.
        driver = get_chrome_driver()
        try:
            wait = WebDriverWait(driver, 10)
            driver.get("https://www.ines.gov.br/dicionario-de-libras/")
            time.sleep(2)

            logger.info(f"Processing letter - {letter}")

            driver.find_element(By.XPATH, f'//*[@id="letter-{letter}"]').click()

            words_list_selector_element = driver.find_element(By.XPATH, '//*[@id="input-palavras"]')
            word_options = words_list_selector_element.find_elements(By.TAG_NAME, "option")

            # (label, 1-based position) pairs. Positions count every <option>,
            # including the placeholder, because they are used as XPath
            # indices below. A list is used rather than a dict keyed by label
            # so that two entries sharing the same label are both kept.
            word_positions = []
            for position, option in enumerate(word_options, start=1):
                label = option.text
                if label != "-- SELECIONE --":
                    word_positions.append((label, position))

            for word, word_id in word_positions:

                logger.info(f"Word: {word}- Word ID: {word_id}")

                # Select the word
                driver.find_element(By.XPATH, f'/html/body/div/div[2]/div/div/div[3]/div[2]/div[2]/select/option[{word_id}]').click()

                # Get the subject: text of the first <option> of the subject select
                subject_selector_element = driver.find_element(By.XPATH, '//*[@id="input-assunto"]')
                subject = subject_selector_element.find_element(By.XPATH, '/html/body/div/div[2]/div/div/div[3]/div[2]/div[1]/select/option').text

                libras_records.append({
                    "word": word,
                    "subject": subject,
                    "interpretation": _element_text(driver, "input-acepcao"),
                    "example_portuguese_sentence": _element_text(driver, "input-exemplo"),
                    "example_libras_sentence": _element_text(driver, "input-libras"),
                    "grammar_class": _element_text(driver, "input-classe"),
                    "word_origin": _element_text(driver, "input-origem"),
                    # Media is optional: a missing element yields "UNAVAILABLE".
                    "video_link": _media_src(driver, '//*[@id="input-video"]/video/source'),
                    "image_link": _media_src(driver, '//*[@id="input-image"]/img'),
                    "hand_image_link": _media_src(driver, '//*[@id="input-mao"]/img'),
                })

            logger.info(f"Finished processing letter - {letter}")
        finally:
            # quit() ends the whole browser session and its chromedriver
            # process, also when a lookup above raised.
            driver.quit()
finally:
    # Built in `finally` so that rows scraped before an error or a manual
    # interrupt are still available in the DataFrame.
    libras_dictionary_dataframe = pd.DataFrame(libras_records, columns=LIBRAS_DICTIONARY_COLUMNS)


In [100]:
# Preview the first five scraped entries.
libras_dictionary_dataframe.head(n=5)

Unnamed: 0,word,subject,interpretation,example_portuguese_sentence,example_libras_sentence,grammar_class,word_origin,video_link,image_link,hand_image_link
0,A,NENHUM,Primeira letra do alfabeto da língua portugues...,Invente qualquer palavra que comece com a letr...,VOCÊ INVENTAR QUALQUER PALAVRA COMEÇAR A.,SUBSTANTIVO,Nacional,https://www.ines.gov.br/dicionario-de-libras/p...,https://www.ines.gov.br/dicionario-de-libras/p...,https://www.ines.gov.br/dicionario-de-libras/p...
1,ABACATE,FRUTA,"O fruto do abacateiro. Comestível, tem a polpa...",Você gosta de abacate com leite?,VOCÊ GOSTAR ABACATE LEITE JUNTO?,SUBSTANTIVO,Nacional,https://www.ines.gov.br/dicionario-de-libras/p...,https://www.ines.gov.br/dicionario-de-libras/p...,https://www.ines.gov.br/dicionario-de-libras/p...
2,ABACAXI,FRUTA,Fruta de casca grossa e áspera. Sua polpa pode...,"Hoje tomei suco de abacaxi, ele estava ácido.",HOJE S-U-C-O ABACAXI BEBER ÁCID@.,SUBSTANTIVO,Nacional,https://www.ines.gov.br/dicionario-de-libras/p...,https://www.ines.gov.br/dicionario-de-libras/p...,https://www.ines.gov.br/dicionario-de-libras/p...
3,ABAFAR,NENHUM,"Cobrir ou fechar, para manter o calor.","Se você quer abafar seu quarto, é melhor fecha...",S-I VOCÊ QUERER QUARTO SE@ ABAFAR A-R? MELHOR ...,VERBO,Nacional,https://www.ines.gov.br/dicionario-de-libras/p...,https://www.ines.gov.br/dicionario-de-libras/p...,https://www.ines.gov.br/dicionario-de-libras/p...
4,ABAIXO,NENHUM,"Lugar, posição ou situação inferior, em relaçã...","Não é no primeiro apartamento abaixo, é no seg...",APARTAMENTO PRIMEIR@ NÃO SEGUND@ ABAIXO.,ADV.,Nacional,https://www.ines.gov.br/dicionario-de-libras/p...,https://www.ines.gov.br/dicionario-de-libras/p...,https://www.ines.gov.br/dicionario-de-libras/p...


### Check details about the data

In [144]:
# Print the column names, dtypes and non-null counts of the scraped table.
libras_dictionary_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5776 entries, 0 to 5775
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   word                         5776 non-null   object
 1   subject                      5776 non-null   object
 2   interpretation               5776 non-null   object
 3   example_portuguese_sentence  5776 non-null   object
 4   example_libras_sentence      5776 non-null   object
 5   grammar_class                5776 non-null   object
 6   word_origin                  5776 non-null   object
 7   video_link                   5776 non-null   object
 8   image_link                   5776 non-null   object
 9   hand_image_link              5776 non-null   object
dtypes: object(10)
memory usage: 451.4+ KB


### Save the data as a CSV file

In [None]:
# Write the scraped table to CSV without the index column. The target folder
# is created first so the write does not fail when ../data does not exist yet.
output_csv_path = "../data/libras_dictionary.csv"
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
libras_dictionary_dataframe.to_csv(output_csv_path, index=False)