In [None]:
# !pip install autoscraper python-dotenv pydash yarl

In [1]:
from dotenv import load_dotenv
import os
import requests
import json
from pydash import py_
from yarl import URL
import pandas as pd
import plotly.express as px

In [2]:
# Load variables from a .env file into the process environment, then read the
# Diffbot API token from it (None when the variable is not set).
load_dotenv()
DIFFBOT_TOKEN = os.environ.get("DIFFBOT_TOKEN")

# Top Universities

## Universities by Role

In [3]:
def download_file(
    query: str, size: int = 25, extract_data: bool = True, timeout: float = 300
):
    """Run a DQL query against the Diffbot Knowledge Graph endpoint.

    Args:
        query: DQL query text. It is sent as a single URL parameter, so it may
            contain any characters (spaces, quotes, ``&``, ``=`` ...).
        size: Value sent as the ``size`` request parameter.
        extract_data: When True, return only the ``"data"`` entry of the JSON
            response; when False, return the whole decoded response.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON response, or its ``"data"`` entry.

    Raises:
        RuntimeError: If ``DIFFBOT_TOKEN`` is not set.
        requests.HTTPError: If the endpoint answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    if not DIFFBOT_TOKEN:
        # Without this guard the literal text "None" would be sent as the token.
        raise RuntimeError(
            "DIFFBOT_TOKEN is not set; add it to the environment or a .env file."
        )

    # A mapping lets yarl percent-encode every value on its own, so characters
    # such as '&' or '=' inside the query cannot split or corrupt the parameters.
    url = URL.build(
        scheme="https",
        host="kg.diffbot.com",
        path="/kg/dql_endpoint",
        query={
            "type": "query",
            "token": DIFFBOT_TOKEN,
            "query": query,
            "size": str(size),
        },
    )
    r = requests.get(str(url), timeout=timeout)
    # Surface HTTP errors here rather than as a confusing KeyError on "data".
    r.raise_for_status()
    response = r.json()
    return response["data"] if extract_data else response

In [4]:
# For each job title, facet the matching people by the institution they attended.
top_universities = {}
positions = ['data scientist', 'machine learning engineer', 'data engineer']
for name in positions:
    # The title must be interpolated into the query: with a hard-coded
    # "data scientist" every position would receive the same result.
    top_universities[name] = download_file(
        f'type:Person employments.title:or("{name}") facet:educations.institution.name',
    )


In [5]:
def get_dataframe(data: dict, name: str) -> pd.DataFrame:
    """Turn one facet result into a tidy DataFrame.

    Args:
        data: Mapping from a result name to a list of facet entries, where each
            entry is a dict read through its ``"value"`` and ``"count"`` keys.
        name: Key of the result to convert; also written to the ``position``
            column of every row.

    Returns:
        DataFrame with columns ``education`` (the entry's ``"value"``),
        ``value`` (the entry's ``"count"``) and ``position`` (``name``).
        A key missing from an entry yields None in the corresponding cell.

    Raises:
        KeyError: If ``name`` is not a key of ``data``.
    """
    # Use a separate local so the ``data`` argument is not rebound.
    entries = data[name]
    df = pd.DataFrame(
        {
            "education": [entry.get("value") for entry in entries],
            "value": [entry.get("count") for entry in entries],
        }
    )
    df["position"] = name
    return df


In [7]:
# Build one frame per position and stack them; no ignore_index is passed, so
# each frame's own row labels are carried over unchanged.
universities = pd.concat(
    get_dataframe(top_universities, position) for position in positions
)


In [8]:
# Persist the combined per-position university counts to disk.
universities.to_pickle(path='data/universities.pkl')

In [9]:
# Bar chart of the facet counts per institution, coloured by position.
px.bar(
    data_frame=universities,
    x='education',
    y='value',
    color='position',
    title='Top 25 Universities for Data Professionals',
)

## Universities by Gender

In [10]:
# Distinct institution names that appear in the facet results above.
universities_names = universities['education'].unique()
# ds = download_file(
#     f'type:Person educations.institution.name:or{str(tuple(universities_names))} employments.title:or("data scientist")',
#     size = 26761
# )

In [6]:
# with open('data/data_scientist_by_uni.json', 'w') as f:
#     json.dump(ds, f)

# Read the person records back from the JSON file on disk.
with open('data/data_scientist_by_uni.json', 'r') as f:
    ds = json.loads(f.read())

In [11]:
# For every person keep only the institution names whose lowercase form is one
# of the institutions found above; entries without a name are skipped.
educations = [
    [
        institution
        for institution in py_.map(py_.get(person, 'educations'), 'institution.name')
        if institution is not None and institution.lower() in universities_names
    ]
    for person in ds
]

# Normalized gender value per person (None when the record has none).
genders = [py_.get(person, 'gender.normalizedValue') for person in ds]

In [12]:
# One row per (person, university) pair; rows lacking a university or a gender
# are dropped before counting.
university_gender = (
    pd.DataFrame({'university': educations, 'gender': genders})
    .explode('university')
    .dropna()
    .reset_index(drop=True)
)

# Number of rows for every (university, gender) combination.
gender_count = (
    university_gender
    .groupby(['university', 'gender'])
    .size()
    .to_frame(name='count')
    .reset_index()
)

In [13]:
# Persist the per-university gender counts to disk.
gender_count.to_pickle(path='data/gender_by_universities.pkl')

In [14]:
# Bars coloured by gender, with universities ordered by their total count.
px.bar(
    data_frame=gender_count,
    x='university',
    y='count',
    color='gender',
).update_xaxes(categoryorder='total descending')

# Top Bootcamps

## Bootcamps by Role

In [1]:
from autoscraper import AutoScraper

url = "https://www.discoverdatascience.org/programs/data-science-bootcamps/"

# One known item on the page; the scraper collects the elements similar to it.
wanted_list = ["Big Data Bootcamp"]

scraper = AutoScraper()
# Filter out the stray "Iris dataset" match instead of calling list.remove():
# remove() raises ValueError as soon as the page stops yielding that item.
bootcamps1 = [
    item for item in scraper.build(url, wanted_list) if item != "Iris dataset"
]

In [3]:
# Second source: scrape the SwitchUp ranking page with the same scraper object.
url = "https://www.switchup.org/rankings/best-data-science-bootcamps"
wanted_list = ["WeCloudData"]
bootcamps2 = scraper.build(url=url, wanted_list=wanted_list)

In [4]:
# Merge both scraped lists without duplicates, then lowercase every name.
bootcamps = [
    title.lower() for title in set(bootcamps1).union(set(bootcamps2))
]

In [49]:
# ds_bootcamps = download_file(
#     f'type:Person educations.institution.name:or{str(tuple(bootcamps))} employments.title:or("data scientist") facet:educations.institution.name',
#     size=3423,
# )

# ml_bootcamps = download_file(
#     f'type:Person educations.institution.name:or{str(tuple(bootcamps))} employments.title:or("machine learning engineer") facet:educations.institution.name',
#     size=661,
# )

# de_bootcamps = download_file(
#     f'type:Person educations.institution.name:or{str(tuple(bootcamps))} employments.title:or("data engineer") facet:educations.institution.name',
#     size=976,
# )

In [18]:
names = ['ds_bootcamps', 'ml_bootcamps', 'de_bootcamps']
# data = [ds_bootcamps, ml_bootcamps, de_bootcamps]

# for name, d in zip(names, data):
#     with open('data/' + name + '.json', "w") as f:
#         json.dump(d, f)

# Read each JSON file from disk and key its content by the file's base name.
data = {}
for name in names:
    with open(f'data/{name}.json', "r") as f:
        data[name] = json.loads(f.read())


In [19]:
def get_bootcamp_dataframe(data: dict, name: str, bootcamps: list) -> pd.DataFrame:
    """Build the frame for one result and keep only the listed bootcamps.

    Args:
        data: Mapping from a result name to its entries; passed straight to
            ``get_dataframe``.
        name: Key of the result to convert.
        bootcamps: Institution names to keep; compared for exact equality with
            the ``education`` column.

    Returns:
        The rows of ``get_dataframe(data, name)`` whose ``education`` value is
        in ``bootcamps``. The original row labels are kept.
    """
    df = get_dataframe(data, name)
    is_bootcamp = df["education"].isin(bootcamps)
    return df[is_bootcamp]

In [20]:
# Stack the bootcamp-only frames in the order: data engineer, data scientist,
# machine learning engineer.
df = pd.concat(
    [
        get_bootcamp_dataframe(data, key, bootcamps)
        for key in ("de_bootcamps", "ds_bootcamps", "ml_bootcamps")
    ]
)


In [121]:
# Persist the bootcamp counts to disk.
# NOTE(review): in file order this runs before the cell that maps `position`
# to readable titles, so a top-to-bottom run saves the raw keys — confirm
# which form the consumers of this pickle expect.
df.to_pickle(path='data/bootcamps.pkl')

In [21]:
# Translate the file-based keys into readable position titles.
names_map = {'de_bootcamps': 'data engineer',
             'ds_bootcamps': 'data scientist',
             'ml_bootcamps': 'machine learning engineer'}
# Values without an entry in names_map are kept as they are. A plain
# Series.map(dict) would turn them into NaN, so running this cell twice
# would wipe the whole column.
df['position'] = df.position.map(lambda key: names_map.get(key, key))


In [22]:
# Bars coloured by position, with bootcamps ordered by their total count.
px.bar(
    data_frame=df.reset_index(),
    x='education',
    y='value',
    color='position',
).update_xaxes(categoryorder='total descending')


## Bootcamps by Gender

In [None]:
# ds = download_file(
#     f'type:Person educations.institution.name:or{str(tuple(bootcamps))} employments.title:or("data scientist")',
#     size=3423,
# )

In [23]:
# with open('data/data_scientist.json', 'w') as f:
#     json.dump(ds, f)

# Read the person records back from the JSON file on disk.
with open('data/data_scientist.json', 'r') as f:
    ds = json.loads(f.read())

In [24]:
# For every person keep only the institution names whose lowercase form is in
# the scraped bootcamp list; entries without a name are skipped.
educations = [
    [
        institution
        for institution in py_.map(py_.get(person, 'educations'), 'institution.name')
        if institution is not None and institution.lower() in bootcamps
    ]
    for person in ds
]

# Normalized gender value per person (None when the record has none).
genders = [py_.get(person, 'gender.normalizedValue') for person in ds]

In [25]:
# One row per (person, bootcamp) pair; rows lacking a bootcamp or a gender are
# dropped before counting.
bootcamp_gender = (
    pd.DataFrame({'bootcamp': educations, 'gender': genders})
    .explode('bootcamp')
    .dropna()
    .reset_index(drop=True)
)

# Number of rows for every (bootcamp, gender) combination.
gender_count = (
    bootcamp_gender
    .groupby(['bootcamp', 'gender'])
    .size()
    .to_frame(name='count')
    .reset_index()
)

In [26]:
# Persist the per-bootcamp gender counts to disk.
gender_count.to_pickle(path='data/gender_by_bootcamps.pkl')

In [27]:
# Bars coloured by gender, with bootcamps ordered by their total count.
px.bar(
    data_frame=gender_count,
    x='bootcamp',
    y='count',
    color='gender',
).update_xaxes(categoryorder='total descending')