## Scrape raw data from MastersPortal, BachelorsPortal, PhdPortal & DistancelearningPortal
### (college degrees)

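The scraping cells for the three large portals (masters, bachelors, PhD) autosave their results every 100 courses as CSV files in a per-website directory (`mastersportal/`, `bachelorsportal/`, `phdportal/`), so progress is recoverable if an error occurs during scraping.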

In [None]:
import os
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from tqdm.notebook import tqdm
def flatten(l):
    """Return one flat list with the items of every sub-iterable of *l*, in order."""
    flat = []
    for sub in l:
        flat.extend(sub)
    return flat

In [None]:
%%capture
# Register tqdm's progress-bar hooks on pandas objects (e.g. `progress_apply`).
# The `%%capture` cell magic on the first line suppresses this cell's output.
tqdm.pandas()

---

# Scrape

## masters

In [None]:
# Scrape the MastersPortal search results for "data science".
#
# The result list is walked 10 hits at a time via the `start` offset in the
# URL fragment. Every 100 hits the collected rows are autosaved to
# `mastersportal/scrape_<n>.csv` and the browser is restarted.
#
# NOTE(review): 5094 is presumably the number of search hits the site showed
# when this was written (the offset advances by 10 per page), not a page
# count — confirm.
total_results = 5094
page_size = 10
pages = -(-total_results // page_size)  # ceiling division
out_dir = 'mastersportal'


def _parse_course(card):
    """Turn one search-result card (a WebElement) into a flat dict of strings."""
    record = {'course_name': card.find_element('class name', 'StudyTitle').text}
    loc_dets = card.find_element('class name', 'Location').find_elements('tag name', 'span')
    record['institution'] = loc_dets[0].text
    record['location'] = loc_dets[1].text
    record['description'] = card.find_element('class name', 'Description').text
    right_info = card.find_element('class name', 'InformationRight').find_elements('tag name', 'span')
    record['cost'] = right_info[0].text
    dur = right_info[1]
    record['dur_period'] = dur.get_attribute('data-period')
    record['dur_number'] = dur.get_attribute('data-duration')
    # extra_n is a value for an unknown feature; it is sorted out when cleaning
    extras = card.find_element('class name', 'ExtraFacts').find_elements('tag name', 'span')
    for i, e in enumerate(extras):
        record[f'extra_{i}'] = e.text
    return record


# `to_csv` does not create missing directories, so make sure it exists.
os.makedirs(out_dir, exist_ok=True)

driver = webdriver.Chrome()
base_url = "https://www.mastersportal.com/search/#q=kw-data%20science|lv-master|tc-USD&start="
start = 0
autosave_n = 0
n_courses = 0  # running total over all chunks, for the final report
data = []
try:
    for start in tqdm(range(0, total_results, page_size)):
        if start > 0 and start % 100 == 0:
            # autosave the last 100 hits, then start over with a fresh browser
            autosave_n += 1
            print(f'\r{start // page_size}/{pages} pages scraped (saving courses {start-100} – {start})', end='          ')
            pd.DataFrame(data).to_csv(f'{out_dir}/scrape_{autosave_n}.csv', index=False)
            data = []
            driver.quit()
            driver = webdriver.Chrome()
        driver.get(base_url + str(start))
        page_data = []
        for attempt in range(3):
            time.sleep(2)  # give the page time to render the results
            page_data = []  # drop rows of a failed attempt so retries never duplicate
            try:
                # Look the elements up again on every attempt: references
                # obtained before a re-render are stale.
                container = driver.find_element('id', 'StudySearchResultsStudies')
                for card in container.find_elements('class name', 'InformationContainer'):
                    page_data.append(_parse_course(card))
                break
            except (WebDriverException, IndexError):
                # Not a bare `except`, so Ctrl-C / kernel interrupt still works.
                continue
        else:
            # All attempts failed: keep what the last attempt managed to parse.
            print(f'\nResults from offset {start} only partially scraped '
                  f'({len(page_data)} courses) after 3 attempts')
        data.extend(page_data)
        n_courses += len(page_data)
finally:
    # never leave a Chrome process behind, even when the loop raises
    driver.quit()
data = pd.DataFrame(data)
autosave_n += 1
print(f'\rCompleted!\nScraped {pages} pages ({n_courses} courses)')
data.to_csv(f'{out_dir}/scrape_{autosave_n}.csv', index=False)
print('File saved to mastersportal/')

### bachelors

In [None]:
# Scrape the BachelorsPortal search results for "data science".
#
# The result list is walked 10 hits at a time via the `start` offset in the
# URL fragment. Every 100 hits the collected rows are autosaved to
# `bachelorsportal/scrape_<n>.csv` and the browser is restarted.
#
# NOTE(review): 5713 is presumably the number of search hits the site showed
# when this was written — confirm.
total_results = 5713
page_size = 10
out_dir = 'bachelorsportal'


def _parse_course(card):
    """Turn one search-result card (a WebElement) into a flat dict of strings."""
    record = {'course_name': card.find_element('class name', 'StudyTitle').text}
    loc_dets = card.find_element('class name', 'Location').find_elements('tag name', 'span')
    record['institution'] = loc_dets[0].text
    record['location'] = loc_dets[1].text
    record['description'] = card.find_element('class name', 'Description').text
    right_info = card.find_element('class name', 'InformationRight').find_elements('tag name', 'span')
    record['cost'] = right_info[0].text
    dur = right_info[1]
    record['dur_period'] = dur.get_attribute('data-period')
    record['dur_number'] = dur.get_attribute('data-duration')
    # extra_n is a value for an unknown feature; it is sorted out when cleaning
    extras = card.find_element('class name', 'ExtraFacts').find_elements('tag name', 'span')
    for i, e in enumerate(extras):
        record[f'extra_{i}'] = e.text
    return record


# `to_csv` does not create missing directories, so make sure it exists.
os.makedirs(out_dir, exist_ok=True)

driver = webdriver.Chrome()
base_url = "https://www.bachelorsportal.com/search/#q=kw-data%20science|lv-bachelor|tc-EUR&start="
start = 0
autosave_n = 0

data = []
try:
    for start in tqdm(range(0, total_results, page_size)):
        if start > 0 and start % 100 == 0:
            # autosave the last 100 hits, then start over with a fresh browser
            autosave_n += 1
            print(f'Saving {start-100} – {start}')
            pd.DataFrame(data).to_csv(f'{out_dir}/scrape_{autosave_n}.csv', index=False)
            data = []
            driver.quit()
            driver = webdriver.Chrome()
        driver.get(base_url + str(start))
        page_data = []
        for attempt in range(3):
            time.sleep(1)  # give the page time to render the results
            page_data = []  # drop rows of a failed attempt so retries never duplicate
            try:
                # Look the elements up again on every attempt: references
                # obtained before a re-render are stale.
                container = driver.find_element('id', 'StudySearchResultsStudies')
                for card in container.find_elements('class name', 'InformationContainer'):
                    page_data.append(_parse_course(card))
                break
            except (WebDriverException, IndexError):
                # Not a bare `except`, so Ctrl-C / kernel interrupt still works.
                continue
        else:
            # All attempts failed: keep what the last attempt managed to parse.
            print(f'Results from offset {start} only partially scraped '
                  f'({len(page_data)} courses) after 3 attempts')
        data.extend(page_data)
finally:
    # never leave a Chrome process behind, even when the loop raises
    driver.quit()
data = pd.DataFrame(data)
autosave_n += 1
# the last chunk starts at the most recent multiple of 100
print(f'Saving {(start // 100) * 100} – {total_results}')
data.to_csv(f'{out_dir}/scrape_{autosave_n}.csv', index=False)
print('Done!')

### phd portal

In [None]:
# Scrape the PhdPortal search results for "data science".
#
# The result list is walked 10 hits at a time via the `start` offset in the
# URL fragment. Every 100 hits the collected rows are autosaved to
# `phdportal/scrape_<n>.csv` and the browser is restarted.
#
# NOTE(review): 358 is presumably the number of search hits the site showed
# when this was written — confirm.
total_results = 358
page_size = 10
out_dir = 'phdportal'


def _parse_course(card):
    """Turn one search-result card (a WebElement) into a flat dict of strings."""
    record = {'course_name': card.find_element('class name', 'StudyTitle').text}
    loc_dets = card.find_element('class name', 'Location').find_elements('tag name', 'span')
    record['institution'] = loc_dets[0].text
    record['location'] = loc_dets[1].text
    record['description'] = card.find_element('class name', 'Description').text
    right_info = card.find_element('class name', 'InformationRight').find_elements('tag name', 'span')
    record['cost'] = right_info[0].text
    dur = right_info[1]
    record['dur_period'] = dur.get_attribute('data-period')
    record['dur_number'] = dur.get_attribute('data-duration')
    # extra_n is a value for an unknown feature; it is sorted out when cleaning
    extras = card.find_element('class name', 'ExtraFacts').find_elements('tag name', 'span')
    for i, e in enumerate(extras):
        record[f'extra_{i}'] = e.text
    return record


# `to_csv` does not create missing directories, so make sure it exists.
os.makedirs(out_dir, exist_ok=True)

driver = webdriver.Chrome()
base_url = "https://www.phdportal.com/search/#q=kw-data%20science|lv-phd|tc-USD&start="
start = 0
autosave_n = 0

data = []
try:
    for start in tqdm(range(0, total_results, page_size)):
        if start > 0 and start % 100 == 0:
            # autosave the last 100 hits, then start over with a fresh browser
            autosave_n += 1
            print(f'Saving {start-100} – {start}')
            pd.DataFrame(data).to_csv(f'{out_dir}/scrape_{autosave_n}.csv', index=False)
            data = []
            driver.quit()
            driver = webdriver.Chrome()
        driver.get(base_url + str(start))
        page_data = []
        for attempt in range(3):
            time.sleep(1)  # give the page time to render the results
            page_data = []  # drop rows of a failed attempt so retries never duplicate
            try:
                # Look the elements up again on every attempt: references
                # obtained before a re-render are stale.
                container = driver.find_element('id', 'StudySearchResultsStudies')
                for card in container.find_elements('class name', 'InformationContainer'):
                    page_data.append(_parse_course(card))
                break
            except (WebDriverException, IndexError):
                # Not a bare `except`, so Ctrl-C / kernel interrupt still works.
                continue
        else:
            # All attempts failed: keep what the last attempt managed to parse.
            print(f'Results from offset {start} only partially scraped '
                  f'({len(page_data)} courses) after 3 attempts')
        data.extend(page_data)
finally:
    # never leave a Chrome process behind, even when the loop raises
    driver.quit()
data = pd.DataFrame(data)
autosave_n += 1
# the last chunk starts at the most recent multiple of 100
print(f'Saving {(start // 100) * 100} – {total_results}')
data.to_csv(f'{out_dir}/scrape_{autosave_n}.csv', index=False)
print('Done!')

## distancelearning

In [None]:
# Scrape the DistancelearningPortal search results for "data science"
# (blended and online programmes). The result list is short, so it is walked
# in one go without autosaves and written to a single CSV at the end.
#
# NOTE(review): 21 is presumably the number of search hits the site showed
# when this was written — confirm.
total_results = 21
page_size = 10


def _parse_course(card):
    """Turn one search-result card (a WebElement) into a flat dict of strings."""
    record = {'course_name': card.find_element('class name', 'StudyTitle').text}
    loc_dets = card.find_element('class name', 'Location').find_elements('tag name', 'span')
    record['institution'] = loc_dets[0].text
    record['location'] = loc_dets[1].text
    record['description'] = card.find_element('class name', 'Description').text
    right_info = card.find_element('class name', 'InformationRight').find_elements('tag name', 'span')
    record['cost'] = right_info[0].text
    dur = right_info[1]
    record['dur_period'] = dur.get_attribute('data-period')
    record['dur_number'] = dur.get_attribute('data-duration')
    # extra_n is a value for an unknown feature; it is sorted out when cleaning
    extras = card.find_element('class name', 'ExtraFacts').find_elements('tag name', 'span')
    for i, e in enumerate(extras):
        record[f'extra_{i}'] = e.text
    return record


driver = webdriver.Chrome()
base_url = "https://www.distancelearningportal.com/search/#q=kw-data%20science|mh-blended,online|tc-EUR&start="
start = 0

data = []
try:
    for start in tqdm(range(0, total_results, page_size)):
        driver.get(base_url + str(start))
        page_data = []
        for attempt in range(3):
            time.sleep(1)  # give the page time to render the results
            page_data = []  # drop rows of a failed attempt so retries never duplicate
            try:
                # Look the elements up again on every attempt: references
                # obtained before a re-render are stale.
                container = driver.find_element('id', 'StudySearchResultsStudies')
                for card in container.find_elements('class name', 'InformationContainer'):
                    page_data.append(_parse_course(card))
                break
            except (WebDriverException, IndexError):
                # Not a bare `except`, so Ctrl-C / kernel interrupt still works.
                continue
        else:
            # All attempts failed: keep what the last attempt managed to parse.
            print(f'Results from offset {start} only partially scraped '
                  f'({len(page_data)} courses) after 3 attempts')
        data.extend(page_data)
finally:
    # never leave a Chrome process behind, even when the loop raises
    driver.quit()
degrees = pd.DataFrame(data)
# NOTE(review): the file name says "mastersportal" although this cell scrapes
# distancelearningportal. The name is kept so existing consumers of the file
# keep working — confirm and rename if it is a copy-paste slip.
degrees.to_csv('scraped_mastersportal.csv', index=False)
'Done... saved.'

---
# Clean & Save

### combine autosave data from all websites into one dataframe

In [None]:
# Read every autosaved chunk of the three portals and stack them into `out`,
# tagging each row with the degree level of the portal it came from.
# (degree label, autosave directory)
dirs = [("Bachelor's", 'bachelorsportal'),
        ("Master's", 'mastersportal'),
        ("Ph.D.", 'phdportal')]

per_portal = []
for n, d in dirs:
    # Only the CSV chunks: a directory listing can also contain entries such
    # as `.DS_Store` or `.ipynb_checkpoints`, which `read_csv` cannot parse.
    chunks = [pd.read_csv(os.path.join(d, f))
              for f in sorted(os.listdir(d))
              if f.endswith('.csv')]
    # one concat per portal instead of growing a frame file by file
    d_data = pd.concat(chunks, sort=False) if chunks else pd.DataFrame()
    d_data['class'] = n
    per_portal.append(d_data)

out = pd.concat(per_portal, sort=False)

### drop probable false positives (non-data-science programs)

In [None]:
# Keep only courses whose name contains at least one data-science related
# word stem (case-insensitive substring match).
hot_words = ['informat', 'proba', 'data', 'machine', 'geospat',
             'analy', 'stat', 'artificial', 'intelligence', 'language']

out = out.reset_index(drop=True)
# A missing course name (NaN) is treated as empty text, so the row is dropped
# instead of crashing on `.lower()`.
names = out['course_name'].fillna('').astype(str).str.lower()
keep = [i for i, cn in enumerate(names) if any(w in cn for w in hot_words)]

out = out.loc[keep]

## clean & save

In [None]:
# Work on a real copy so the assignments below never write into `out`
# (plain `df = out` only gives the same frame a second name).
df = out.copy()

# Shift columns if the degree is missing: extra_0 then already holds the
# enrollment value (it contains "-time"), so everything moves one column to
# the right. `na=False` keeps rows with an empty extra_0 out of the mask
# instead of breaking the boolean indexing.
missing_degree = df['extra_0'].str.contains('-time', na=False)
df.loc[missing_degree, 'extra_2'] = df.loc[missing_degree, 'extra_1'].values
df.loc[missing_degree, 'extra_1'] = df.loc[missing_degree, 'extra_0'].values
df.loc[missing_degree, 'extra_0'] = '(?)'  # unknown degree

# list all unknown columns (scattered values)
e_cols = [e for e in df.columns if 'extra' in e]

# The first extra column is the degree; set once for the whole frame rather
# than once per row.
df['degree'] = df['extra_0']

enrollment_vals = []
online_vals = []
in_person_vals = []
blended_vals = []
for values in df[e_cols].itertuples(index=False, name=None):  # one course per tuple
    texts = [str(v) for v in values]
    lowered = [t.lower() for t in texts]
    has_online = any('online' in t for t in lowered)
    has_blended = any('blended' in t for t in lowered)

    # `online` keeps the three states it had before: 'No' / 'Yes' / 'Blended'.
    if has_blended:
        online = 'Blended'
    elif has_online:
        online = 'Yes'
    else:
        online = 'No'

    enrollment_vals.append(' & '.join(t for t in texts if 'time' in t))
    online_vals.append(online)
    # NOTE(review): `in_person` used to be written as a copy of `online`, and
    # `blended_learning` was always 'No' because the flag was never set.
    # Assumed intent: only fully online programmes are not in person —
    # confirm.
    in_person_vals.append('No' if online == 'Yes' else 'Yes')
    blended_vals.append('Yes' if has_blended else 'No')

# set column names
df['enrollment'] = enrollment_vals
df['online'] = online_vals
df['in_person'] = in_person_vals
df['blended_learning'] = blended_vals

mp = df.drop(columns=e_cols).drop_duplicates()
mp.to_csv('portal_data.csv', index=False)
print('file saved.')

print(mp.shape)
mp.head()

In [None]:
import pandas as pd