In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm

pd.set_option('display.max_columns',100)
pd.set_option('display.max_rows',100)

plt.style.use('dark_background')

# Scrape data

### Read first webpage with list of computer scientists

In [None]:
# Read webpage and parse with bs4
base_url = 'https://en.wikipedia.org/'
list_url = 'wiki/List_of_computer_scientists'
url = base_url + list_url
response = requests.get(url)
html = response.text
soup = BeautifulSoup(html, 'html.parser')

# Get names of scientists and links to their webpages
links = {}
div = soup.find('div', class_="mw-content-ltr mw-parser-output")
for ul in div.find_all('ul')[:-3]:
    for li in ul.find_all("li"):
        a = li.find("a")
        href = a['href']
        links[a.text] = href
    # end
# end

### Scrape individual pages

In [20]:
# Number of pages to scrape
count = 0

# Get data from all pages
cs_data = {}
for k, v in tqdm(links.items(), ncols=50):
    # Read individual webpage
    url = base_url + v
    response = requests.get(url)
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')
    
    # Scrape data from individual page
    info = {}
    try:
        table = soup.find('table', class_="infobox biography vcard")
        for tr in table.find_all("tr")[2:]:
            try:
                # Read table header (will become column in dataframe)
                a = tr.find('th').text
                
                # Read table data
                b = []
                sp = tr.find('td').find_all("span")
                if len(sp)>0:
                    b += [ item.text for item in sp ]
                
                li = tr.find('td').find_all("li")
                if len(li)>0:
                    b += [ item.text for item in li ]
                
                href = tr.find('td').find_all("a")
                if len(href)>0:
                    b += [ item.text for item in href ]
                
                b = ",".join(b)
                
                # Compile data into dictionary
                info[a] = b
            except:
                1
            # end
        # end

        cs_data[k] = info
        count += 1
    except:
        1
    # end
    
    # Only read several pages
    if count >= 25:
        break
    # end
# end

print("done")

  4%|▍           | 26/695 [00:06<02:58,  3.76it/s]

done





In [21]:
df = pd.DataFrame(cs_data).T

df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 25 entries, Wil van der Aalst to John Backus
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Born               20 non-null     object
 1   Nationality        8 non-null      object
 2   Known for          17 non-null     object
 3   Fields             22 non-null     object
 4   Institutions       20 non-null     object
 5   Thesis             12 non-null     object
 6   Doctoral advisor   17 non-null     object
 7   Website            10 non-null     object
 8   Alma mater         18 non-null     object
 9   Spouse             6 non-null      object
 10  Awards             17 non-null     object
 11  Doctoral students  4 non-null      object
 12  Citizenship        3 non-null      object
 13  Education          4 non-null      object
 14  Died               9 non-null      object
 15  Children           4 non-null      object
 16  Father             1 non-n

Unnamed: 0,Born,Nationality,Known for,Fields,Institutions,Thesis,Doctoral advisor,Website,Alma mater,Spouse,Awards,Doctoral students,Citizenship,Education,Died,Children,Father,Relatives,Occupation,Occupation(s),Spouses
Wil van der Aalst,"(1966-01-29) ,1966-01-29, (age 58),Eersel,Nor...",,"Workflow patterns,YAWL","Information systems,Workflow management,Petri ...",RWTH Aachen University,(1992),"Jaap Wessels,Kees van Hee",http://www.padsweb.rwth-aachen.de/wvdaalst/,,,,,,,,,,,,,
Scott Aaronson,"(1981-05-21) ,1981-05-21, (age 43),[,],[,],[1...",American,"Quantum Turing machine with postselection,Alge...","Computational complexity theory,quantum computing","University of Texas at Austin,Massachusetts In...",,Umesh Vazirani,"scottaaronson.blog,www.scottaaronson.com,scott...","Cornell University,University of California, B...",Dana Moshkovitz,"Alan T. Waterman Award,PECASE,Tomassoni–Chises...",,,,,,,,,,
Rediet Abebe,,,,"artificial intelligence,algorithms,computer sc...","University of California, Berkeley,Harvard Uni...",(2019),Jon M. Kleinberg,"www.cs.cornell.edu/~red/,md4sg.com,www.cs.corn...","Cornell University,University of Cambridge,Har...",,"Andrew Carnegie Fellow (2022),Harvard Society ...",,,,,,,,,,
Hal Abelson,"(1947-04-26) ,1947-04-26, (age 77),[,],[2]",,"[,],[,],Creative Commons,Public Knowledge,Free...","Computer science education,Amorphous computing",Massachusetts Institute of Technology,"(1973),Topologically Distinct Conjugate-Variet...","[,],Dennis Sullivan,[1]","www.csail.mit.edu/person/hal-abelson,,www.csai...","Princeton University (BA),Massachusetts Instit...",,SIGCSE Award for Outstanding Contribution to C...,"[,],[,],[,],Elizabeth Bradley[1],Mitchel Resni...",,,,,,,,,
Serge Abiteboul,"(1953-08-25) ,1953-08-25, (age 71),[,],[4],Paris",French,"[,],[,],[,],Abiteboul-Vianu Theorem[5][6],Data...","[,],Data bases,[2]",INRIA,"(1982),Matching Functions and Disaggregations ...","[,],Seymour Ginsburg,[3]","abiteboul.com,abiteboul.com",University of Southern California,,"[,],citation needed,when?,SIGMOD Edgar F. Codd...",,French,,,,,,,,


# Clean data

### Remove Unicode characters

In [22]:
df.columns = pd.Series(df.columns).apply(
    lambda x: x.replace('\xa0', ' ').strip() if type(x)==str else x
)

In [23]:
for c in df.columns:
    df[c] = df[c].apply(
        lambda x: x.replace('\xa0', ' ').strip() if type(x)==str else np.nan
    )
# end

### Drop sparse columns and rows

In [24]:
# Check original shape
df.shape

(25, 21)

In [25]:
# Remove columns with less than half of their rows non-null
df = df.dropna(thresh=df.shape[0]//2, axis=1)
df.shape

(25, 8)

In [26]:
# Remove rows with less than half of their columns non-null
df = df.dropna(thresh=df.shape[1]//2, axis=0)
df.shape

(21, 8)

### Extract dates from Born column

In [11]:
def fix_born( x ):
    if type(x) == str:
        matches = re.findall(r"\d{4}-\d{2}-\d{2}", x)
        if len(matches) > 0:
            return matches[0]
        # end
    # end

    return np.nan
# end

df.Born.apply( fix_born )

Wil van der Aalst              1966-01-29
Scott Aaronson                 1981-05-21
Rediet Abebe                          NaN
Hal Abelson                    1947-04-26
Serge Abiteboul                1953-08-25
Samson Abramsky                1953-03-12
Leonard Adleman                1945-12-31
Manindra Agrawal               1966-05-20
Luis von Ahn                   1978-08-19
Alfred Aho                            NaN
Frances E. Allen               1932-08-04
Gene Amdahl                    1922-11-16
David P. Anderson                     NaN
Andrew Appel                          NaN
Cecilia R. Aragon                     NaN
Bruce Arden                    1927-05-29
Sanjeev Arora                         NaN
Winifred "Tim" Alice Asprey    1917-04-08
John Vincent Atanasoff         1903-10-04
Shakuntala Atre                       NaN
Lennart Augustsson                    NaN
Charles Babbage                1791-12-26
Charles Bachman                1924-12-11
Roland Carl Backhouse          194

### Get Fields dummies

In [13]:
df.Fields.str.lower().str.get_dummies(",").sum(axis=0).sort_values(ascending=False)

computer science                       12
[                                       2
theoretical computer science            2
[2]                                     2
]                                       2
specification languages                 1
simulation                              1
quantum information and foundations     1
quantum information                     1
quantum computing                       1
inequality                              1
physics                                 1
petri nets                              1
parallel computing                      1
optimizing compilers                    1
mathematics                             1
information systems                     1
process mining                          1
high-performance computing              1
human-based computation games           1
game semantics                          1
functional programming                  1
foundations                             1
data bases                        