In [1]:
import pandas as pd
import numpy as np
import os
import csv
from datetime import datetime
from datetime import timedelta
from bs4 import BeautifulSoup

In [2]:
#pip install selenium

In [3]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

In [4]:
# Start a Chrome session with a desktop user agent and open the genealogy site.
options = webdriver.ChromeOptions()
# Chrome switches must be written as name=value with no spaces around '=';
# with the spaces the custom user agent is not recognised as a switch.
options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36')
# `chrome_options=` is the deprecated keyword (it triggers a DeprecationWarning);
# `options=` is the supported way to pass the ChromeOptions object.
driver = webdriver.Chrome(options=options)
driver.get("https://genealogy.math.ndsu.nodak.edu/")

  driver = webdriver.Chrome(chrome_options=options)


In [5]:
# Load the content of the page currently shown by the browser
# (the browser tab could already be closed after this point).
# NOTE(review): the results printed further down are search results, so a
# search was presumably run by hand in the browser before this cell — confirm.
# NOTE(review): no parser is named here, so BeautifulSoup picks whichever one
# is installed; consider pinning one once the scraping logic is validated against it.
soup = BeautifulSoup(driver.page_source)

In [6]:
# Locate the first table inside the element with id 'mainContent'.
results_container = soup.find(id='mainContent')
# Both lookups yield None when the element is missing (for instance when the
# loaded page contains no results table), so guard before navigating further.
main_content = results_container.table if results_container is not None else None
# One <tr> per listed author; an empty list when there is no table to read.
authors_searched = main_content.find_all('tr') if main_content is not None else []
print(authors_searched)

[<tr><td><a href="id.php?id=265747">Nguengue Louvouandou, Apépé Jugeandène</a></td>
<td>Marien Ngouabi University</td>
<td>2019</td></tr>, <tr><td><a href="id.php?id=79520">Pepe, James</a></td>
<td>Massachusetts Institute of Technology</td>
<td>1972</td></tr>, <tr><td><a href="id.php?id=25297">Pepe, Margaret</a></td>
<td>University of Washington</td>
<td>1986</td></tr>, <tr><td><a href="id.php?id=2593">Pepe, Wilfrid</a></td>
<td>Indiana University</td>
<td>1966</td></tr>, <tr><td><a href="id.php?id=137357">Pepe, William</a></td>
<td>University of Connecticut</td>
<td>2007</td></tr>, <tr><td><a href="id.php?id=141256">Peperko, Aljoša</a></td>
<td>University of Ljubljana</td>
<td>2008</td></tr>, <tr><td><a href="id.php?id=197815">Peperstraete, Jan</a></td>
<td></td>
<td></td></tr>, <tr><td><a href="id.php?id=109182">Quintana, Jose (Pepe)</a></td>
<td>University of Warwick</td>
<td>1987</td></tr>]


In [7]:
import re

def authors_ids(authors_searched):
    """
    Return the ids of the authors listed in the search results.

    Each id can then be used to request
    https://genealogy.math.ndsu.nodak.edu/id.php?id=<id> and extract the
    author's information from that page.

    Parameters:
        authors_searched: iterable of result rows; each row must support
        `.find('a')`, and the returned link must support `.get('href')`.

    Returns:
        ids: list of id strings, in the order the rows were given. Rows
        without a link, without an href, or whose href carries no 'id='
        part are skipped instead of raising.
    """
    ids = []
    for author in authors_searched:
        anchor = author.find('a')
        if anchor is None:
            # Rows without a link (e.g. a header row) carry no author id.
            continue
        link = anchor.get('href')
        # Guard against a missing href before handing it to the regex.
        match = re.search('id=(.*)', link) if link else None
        if match is not None:
            ids.append(match.group(1))
    return ids

In [8]:
# Show the ids extracted from the search-result rows.
print(authors_ids(authors_searched))

['265747', '79520', '25297', '2593', '137357', '141256', '197815', '109182']


In [9]:
def authors_information(authors_id):
    """
    Scrape the detail page of every author whose id is in 'authors_id'.

    For each id the author's page is opened with the module-level Selenium
    'driver' and parsed with BeautifulSoup. Using BeautifulSoup navigation
    and regular expressions, the following fields are collected, in this
    order: name, university, year, country, thesis title, advisors and
    students. The values are gathered in a list that is appended to
    'information'.

    Parameters:
        authors_id: list of the author ids obtained from the search.

    Returns:
        information: nested list; each inner list holds one author's
        [name, university, year, country, title, advisors, students],
        where advisors and students are themselves lists of strings.
    """
    information = []
    for identificador in authors_id:
        # PAGE LOAD
        driver.get(f"https://genealogy.math.ndsu.nodak.edu/id.php?id={identificador}")
        # NOTE(review): the parser is left unspecified on purpose; the
        # positional lookups below depend on how the page is parsed.
        soup = BeautifulSoup(driver.page_source)

        # WEB SCRAPING
        main_content2 = soup.find(id='mainContent')

        # Author name; strip() removes leading and trailing whitespace.
        nombre = main_content2.find('h2').get_text().strip()

        # University
        universidad = main_content2.find_next('span').find_next('span').get_text()

        # Thesis year: the four digits that end the text of the first span
        # (empty string when there are none).
        año = main_content2.find('span').get_text()
        año = ''.join(re.findall(r"\d{4}$", año))

        # Country, taken from the title attribute of the first image, if any.
        flag = main_content2.find('img')
        pais = flag.get('title') if flag is not None else ''

        # Thesis title: third child of the fourth <div>.
        # Reset on every iteration so a page with fewer than four <div>s
        # cannot reuse the previous author's title (or raise on the first one).
        titulo = ''
        divs = main_content2.find_all('div')
        if len(divs) > 3:
            # get_text() also copes with nested markup inside the title,
            # where joining the raw children would raise a TypeError.
            titulo = divs[3].contents[2].get_text().strip()

        # Advisor(s): everything from the first "Advisor" up to the
        # "No students known." marker, split on the "Advisor:" labels.
        advisors = []
        advisor_match = re.search(r'Advisor.*', main_content2.get_text())
        if advisor_match is not None:
            advisors_pre = re.split(r"No students known.", advisor_match.group(0))[0]
            advisors = re.split(r"Advisor: |Advisor \d: ", advisors_pre)[1:]

        # Students: text of every link in the first table, when there is one.
        students = []
        if main_content2.table is not None:
            students = [stud.get_text() for stud in main_content2.table.find_all('a')]

        elemento = [nombre, universidad, año, pais, titulo, advisors, students]
        information.append(elemento)
    return information

In [12]:
# Scrape the detail page of every author found by the search
# (the browser is driven to one page per id), then display the result.
info = authors_information(authors_ids(authors_searched))
info

[['Apépé Jugeandène  Nguengue Louvouandou',
  'Marien Ngouabi University',
  '2019',
  'Democratic Republic of the Congo',
  'Poisson structure on Weil bundle and generalization of the Weil bundle',
  ['Basile Guy Richard Bossoto'],
  []],
 ['James T. Pepe',
  'Massachusetts Institute of Technology',
  '1972',
  'UnitedStates',
  'Studies in Serial Learning and Saccadic Eye Movement',
  ['Stephen  Grossberg'],
  []],
 ['Margaret Sullivan Pepe',
  'University of Washington',
  '1986',
  'UnitedStates',
  '',
  ['Thomas Richard Fleming'],
  ['Alonzo, Todd',
   'Couper, David',
   'Dodd, Lori',
   'Huang, Ying',
   'Janes, Holly',
   'Moskowitz, Chaya',
   'Nelson, Jennifer',
   'Reilly, Marie',
   'Zhou, Haibo']],
 ['Wilfrid Dennis Pepe',
  'Indiana University',
  '1966',
  'UnitedStates',
  'Lebesgue Area and the Differential Geometry of C1 Surfaces',
  ['William P. Ziemer'],
  []],
 ['William  Pepe',
  'University of Connecticut',
  '2007',
  'UnitedStates',
  'On Some Bounded Risk Seq

In [13]:
import csv
headers = ['Nombre', 'Universidad', 'Año', 'País', 'Título', 'Tutores', 'Estudiantes']

# OPTION 1
# Write the header and one line per author, separated by ';'.
# csv.writer quotes any field that itself contains ';' or a quote character
# (the hand-written version did not, which corrupts such rows) and does not
# leave a trailing delimiter that would show up as an extra empty column.
# Lines still end with "\n", as before.
with open('mathematicians_dataset.csv', 'w', encoding='UTF8', newline='') as csvfile:
    option1_writer = csv.writer(csvfile, delimiter=';', lineterminator='\n')
    option1_writer.writerow(headers)
    for author_row in info:
        # str() keeps every cell's text the same as before (lists are written as their repr).
        option1_writer.writerow([str(value) for value in author_row])

In [15]:
# OPTION 2: write the same dataset with csv.writer, header first and then one row per author.
with open('mathematicians_dataset2.csv', 'w', encoding='UTF8', newline='') as file:
    writer = csv.writer(file, delimiter=';')
    writer.writerows([headers, *info])

In [16]:
# Read the semicolon-delimited file written by option 2 back into a DataFrame
# and print it as a check of what was stored.
data = pd.read_csv("mathematicians_dataset2.csv", sep=";")
print(data)

                                   Nombre  \
0  Apépé Jugeandène  Nguengue Louvouandou   
1                           James T. Pepe   
2                  Margaret Sullivan Pepe   
3                     Wilfrid Dennis Pepe   
4                           William  Pepe   
5                         Aljoša  Peperko   
6                     Jan A. Peperstraete   
7                 Jose (Pepe) M. Quintana   

                             Universidad     Año  \
0              Marien Ngouabi University  2019.0   
1  Massachusetts Institute of Technology  1972.0   
2               University of Washington  1986.0   
3                     Indiana University  1966.0   
4              University of Connecticut  2007.0   
5                University of Ljubljana  2008.0   
6                                    NaN     NaN   
7                  University of Warwick  1987.0   

                               País  \
0  Democratic Republic of the Congo   
1                      UnitedStates   
2       