In [1]:
from urllib.parse import urljoin

import pandas as pd
import regex as re
import requests
from bs4 import BeautifulSoup

In [2]:
# Landing page of the MLDS alumni directory (the starting point for the scrape)
url = (
    'https://www.mccormick.northwestern.edu'
    '/machine-learning-data-science/people/alumni/'
)

In [3]:
# Download the alumni landing page and parse it into a BeautifulSoup tree.
response = requests.get(url, timeout=30)  # a timeout keeps a stalled server from hanging the kernel
response.raise_for_status()  # stop on 4xx/5xx instead of silently parsing an error page
# Name the parser explicitly; otherwise bs4 picks whichever one happens to be installed
# (and warns), which can make the parsed tree differ between machines.
soup = BeautifulSoup(response.text, 'html.parser')

In [4]:
# Collect the absolute URL of every "Class of ..." page linked from the landing page.
# urljoin resolves each href against `url`, so relative and absolute hrefs both work
# and the base address is not repeated as a second string literal.
# The `[href]` filter skips any matching element that carries a title but no link.
class_links = [
    urljoin(url, anchor['href'])
    for anchor in soup.select('[title*="Class of"][href]')
]

class_links


['https://www.mccormick.northwestern.edu/machine-learning-data-science/people/alumni/class-of-2022.html',
 'https://www.mccormick.northwestern.edu/machine-learning-data-science/people/alumni/class-of-2021.html',
 'https://www.mccormick.northwestern.edu/machine-learning-data-science/people/alumni/class-of-2020.html',
 'https://www.mccormick.northwestern.edu/machine-learning-data-science/people/alumni/class-of-2019.html',
 'https://www.mccormick.northwestern.edu/machine-learning-data-science/people/alumni/class-of-2018.html',
 'https://www.mccormick.northwestern.edu/machine-learning-data-science/people/alumni/class-of-2017.html',
 'https://www.mccormick.northwestern.edu/machine-learning-data-science/people/alumni/class-of-2016.html',
 'https://www.mccormick.northwestern.edu/machine-learning-data-science/people/alumni/class-of-2015.html',
 'https://www.mccormick.northwestern.edu/machine-learning-data-science/people/alumni/class-of-2014.html',
 'https://www.mccormick.northwestern.edu/machi

In [5]:
# Build `info`: alum name -> (current role, current company, graduation year).
# NOTE: the dict is keyed by name, so alumni who share an identical name overwrite
# one another; the last one scraped wins.
info = {}

# "<role> at <company>". BeautifulSoup's .text decodes `&nbsp;` to U+00A0, which `\s`
# already matches, so the pattern needs no literal-entity branch. Whitespace is
# required on both sides of "at" so that a role word that merely begins with "at"
# is not mistaken for the separator.
_JOB_PATTERN = re.compile(r'(.+?)\s+at\s+(.+)')


def _parse_role_company(details_tag):
    """Return (role, company) from a job-details tag, or (None, None) if absent or unparseable."""
    if details_tag is None:
        return None, None
    match = _JOB_PATTERN.search(details_tag.text)
    if match is None:
        return None, None
    return match.group(1), match.group(2)


def _parse_year(year_tag):
    """Return the first run of digits in the tag's text as an int, or None if there is none."""
    if year_tag is None:
        return None
    digits = re.search(r'\d+', year_tag.text)
    return int(digits.group()) if digits else None


# Loop through all the class pages
for link in class_links:
    # Page-local names: the landing page's `response`/`soup` stay intact, so the
    # link-collecting cell still gives the right answer if it is re-run after this one.
    page_response = requests.get(link, timeout=30)
    page_response.raise_for_status()  # fail loudly rather than scrape an error page
    page_soup = BeautifulSoup(page_response.text, 'html.parser')

    for card in page_soup.find_all(class_='faculty cf'):
        name_tag = card.find('h3')  # student name
        if name_tag is None:
            continue  # nothing to key the record on

        current_role, company = _parse_role_company(
            card.find(class_='student-alumni-details')  # job details, may be missing
        )
        graduation_year = _parse_year(card.find('em'))  # graduation year
        info[name_tag.text] = current_role, company, graduation_year




In [6]:
# Turn the scraped mapping into a table: one row per alum, with the name as a regular column
value_columns = ['current Role', 'current Company', 'Graduation Year']
df = (
    pd.DataFrame(list(info.values()), index=list(info.keys()), columns=value_columns)
    .rename_axis('Name')
    .reset_index()
)

# Strip the whitespace left around the parsed company and role text
for text_column in ('current Company', 'current Role'):
    df[text_column] = df[text_column].str.strip()

df


Unnamed: 0,Name,current Role,current Company,Graduation Year
0,Alisher Akhatov,Data Scientist,The Trade Desk,2022
1,Haoyang (Bill) Cai,"Senior Analyst, Data Science & Analytics",TransUnion,2022
2,Qianyin (Charlotte) Cao,"Senior Analyst, Data Science & Analytics",TransUnion,2022
3,Bairui (Barry) Chen,Data Scientist,Walmart,2022
4,Narin Dhatwalia,"Senior Analyst, Data Science & Analytics",TransUnion,2022
...,...,...,...,...
381,Colin Watts-Fitzgerald,Lead Data Scientist,HERE Technologies,2013
382,Qifan Wu,"Director, Strategy Data Science",ByteDance,2013
383,Robert Yan Xue,Data Science Manager,Outreach,2013
384,Qi Yang,CEO,深圳酒香天下酒业有限公司,2013


What are the top 5 companies that alumni are working at?


In [7]:
# Facebook rebranded as Meta, so fold both spellings into a single employer
df['current Company'] = df['current Company'].str.replace('Facebook', 'Meta')

# Head-count per employer, largest first; keep the five biggest
alumni_per_company = df.groupby(['current Company'])['Name'].count()
alumni_per_company.sort_values(ascending=False).head(5).reset_index()

Unnamed: 0,current Company,Name
0,Meta,26
1,TransUnion,15
2,Amazon,11
3,Coupa Software,8
4,LinkedIn,6


What are two other interesting insights you can gather from the data?

1.
Which graduating class has alumni working at the largest number of unique companies?

In [8]:
# Distinct employers per graduating class, then keep only the class with the most
employers_per_class = df.groupby(['Graduation Year'])['current Company'].nunique()
employers_per_class.sort_values(ascending=False).iloc[:1].reset_index()

Unnamed: 0,Graduation Year,current Company
0,2019,37


2. 
What is the most common current role among MLDS alumni in each graduating class?

In [9]:
# Count alumni per (graduation year, role) pair
role_counts = df.groupby(['Graduation Year', 'current Role'])['current Role'].count()

# Keep the single most frequent role within each graduation year
top_role_per_class = role_counts.groupby('Graduation Year').nlargest(1)

# Drop index level 1 so each row is labelled by the remaining year level and the role
top_role_per_class.droplevel(1)

Graduation Year  current Role            
2013             Principal Data Scientist     2
2014             Data Scientist               6
2015             Senior Data Scientist        5
2016             Data Scientist               6
2017             Data Scientist              10
2018             Data Scientist              16
2019             Data Scientist              14
2020             Data Scientist              24
2021             Data Scientist              14
2022             Data Scientist              14
Name: current Role, dtype: int64