[![View on GitHub](https://img.shields.io/badge/GitHub-View_on_GitHub-blue?logo=GitHub)](https://github.com/khuyentran1401/Data-science/blob/master/visualization/analyze_data_science_market/analyze_data_science_market_diffbot.ipynb)

# Analyze Skills

In [None]:
# uncomment this cell to install all dependencies
# !pip install python-dotenv yarl observable_jupyter folium 

This article uses data extracted from [Diffbot](https://www.diffbot.com). You can get a free API token by signing up for the free 2-week trial. Find more instructions on how to use Diffbot [here](https://towardsdatascience.com/build-and-analyze-knowledge-graphs-with-diffbot-2af83065ade0).

In [None]:
from dotenv import load_dotenv
import os

load_dotenv()
TOKEN = os.getenv("DIFFBOT_TOKEN")

In [None]:
import requests
from yarl import URL
from observable_jupyter import embed

import warnings

import pandas as pd
from pandas.core.common import SettingWithCopyWarning

import json

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [None]:
def download_file(
    query: str, size: int = 25, extract_data: bool = True, timeout: float = 30.0
):
    """Run a query against the Diffbot ``/kg/dql_endpoint`` and decode the JSON reply.

    Args:
        query: Query text placed in the ``query`` URL parameter.
        size: Value sent as the ``size`` URL parameter.
        extract_data: If true, return only the ``"data"`` entry of the decoded
            response; otherwise return the whole decoded response.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``response["data"]`` when ``extract_data`` is true, else the full
        decoded JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    query_string = f"type=query&token={TOKEN}&query={query}&size={size}"
    url = URL.build(
        scheme="https",
        host="kg.diffbot.com",
        path="/kg/dql_endpoint",
        query_string=query_string,
    )
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    r = requests.get(str(url), timeout=timeout)
    # Surface HTTP failures here rather than as a KeyError or JSON decoding
    # error further down.
    r.raise_for_status()
    response = r.json()
    return response["data"] if extract_data else response

In [None]:
def create_skill_df(job_title: str):
    """Return the skill facet counts for ``job_title`` as a DataFrame.

    The query uses the lower-cased title with a result size of 100. The
    ``callbackQuery`` column is dropped, a ``Title`` column holding
    ``job_title`` as given is added, and ``value`` is renamed to ``skill``.
    """
    dql = f'type:Person employments.title:"{job_title.lower()}" facet:skills.name'
    facets = download_file(dql, size=100)
    frame = pd.DataFrame.from_dict(facets)
    frame = frame.drop(columns=["callbackQuery"])
    frame["Title"] = job_title
    return frame.rename(columns={"value": "skill"})


# One skill DataFrame per job title, in the order listed.
skills = [
    create_skill_df(role)
    for role in (
        "Data Scientist",
        "Data Engineer",
        "Data Analyst",
        "Machine Learning Engineer",
    )
]

In [None]:
skill_count = pd.concat(skills)

In [None]:
skill_count.to_csv("all_skills.csv")

In [None]:
# Keep only the rows whose skill is one of the programming languages of
# interest. .copy() makes `languages` an independent frame, so the column added
# below is not written into a slice of `skill_count`.
languages = skill_count[
    skill_count["skill"].isin(
        ["python", "r", "sql", "c", "c++", "matlab", "java", "javascript"]
    )
].copy()

# Largest skill count observed for each job title across all skills.
# NOTE(review): the original cell had an unbalanced ")" and referenced a
# `max_count_per_title` that no visible cell defines; dividing by the per-title
# maximum over all skills is the assumed intent -- confirm.
max_count_per_title = skill_count.groupby("Title")["count"].max()

# Look up each row's title maximum so the division is element-wise.
languages["Ratio to All Skills"] = languages["count"] / languages["Title"].map(
    max_count_per_title
)

In [None]:
languages.to_csv("languages.csv")

In [None]:
embed("@khuyentran1401/languages-between-jobs", cells=["chart", "viewof options"])

<IPython.core.display.Javascript object>

# Analyze Education

## Analyze Major

In [None]:
from pipe import map, where


def get_top_major_per_position(title: str, num_top_majors: int = 10):
    """Return the first ``num_top_majors`` major facets for ``title``.

    Each entry is a dict whose ``name`` is the facet's ``value`` and whose
    ``value`` is the facet's ``count``, in the order the query returned them.
    """
    dql = f'type:Person employments.title:"{title}" facet:educations.major.name'
    facets = download_file(dql)
    totals = [facet["count"] for facet in facets][:num_top_majors]
    majors = [facet["value"] for facet in facets][:num_top_majors]
    return [{"name": major, "value": total} for major, total in zip(majors, totals)]

<IPython.core.display.Javascript object>

In [None]:
titles = [
    "data scientist",
    "data engineer",
    "data analyst",
    "machine learning engineer",
    "statistician",
    "data entry",
]

<IPython.core.display.Javascript object>

In [None]:
majors_df = {title: pd.DataFrame(get_top_major_per_position(title)) for title in titles}

<IPython.core.display.Javascript object>

In [None]:
import pickle

# Use a context manager so the pickle file is closed as soon as the dump
# finishes, instead of leaving the handle open until garbage collection.
with open("majors_df.pkl", "wb") as file:
    pickle.dump(majors_df, file)

<IPython.core.display.Javascript object>

In [None]:
import plotly.express as px


def plot_majors(title: str, majors_df: dict):
    """Return a bar chart of the frame stored under ``title`` in ``majors_df``.

    The frame's ``name`` column is plotted on the x-axis and ``value`` on the
    y-axis.
    """
    return px.bar(data_frame=majors_df[title], x="name", y="value")

<IPython.core.display.Javascript object>

In [None]:
# Call plot_majors with a "Role:" dropdown over `titles`, starting on
# "data scientist"; `majors_df` is passed through fixed().
# NOTE(review): `interact`, `fixed` and `widgets` are not imported in any
# visible cell -- presumably they come from ipywidgets; confirm and import them.
interact(
    plot_majors,
    majors_df=fixed(majors_df),
    title=widgets.Dropdown(
        options=titles,
        value="data scientist",
        description="Role:",
        disabled=False,
    ),
)

interactive(children=(Dropdown(description='Role:', options=('data scientist', 'data engineer', 'data analyst'…

<function __main__.plot_majors(title: str, majors_df: dict)>

<IPython.core.display.Javascript object>

In [None]:
# Nested {"name", "children"} hierarchy: the root "major" node holds one child
# per title, and each title's children are the records of its majors frame.
title_major_all = {
    "name": "major",
    "children": [
        {"name": role, "children": majors_df[role].to_dict("records")}
        for role in titles
    ],
}

title_major = title_major_all["children"]

<IPython.core.display.Javascript object>

In [None]:
with open("majors.json", "w") as file:
    json.dump(title_major_all, file, indent=4)

<IPython.core.display.Javascript object>

In [None]:
embed("@khuyentran1401/majors-of-different-data-related-roles", cells=["chart"])

<IPython.core.display.Javascript object>

## Analyze Degree

In [None]:
def get_degree_count_per_position(title: str):
    """Return the degree facet counts for ``title`` as a DataFrame.

    Args:
        title: Job title inserted into the ``employments.title`` filter.

    Returns:
        A DataFrame with columns ``Title``, ``degree`` and ``count``, one row
        per returned facet, excluding rows whose degree is ``"-"`` or
        ``"diploma"``. The frame is empty (but keeps its columns) when the
        query returns no facets.
    """
    data = download_file(
        f'type:Person employments.title:"{title}" facet:educations.degree.name', size=7
    )
    # Explicit columns keep the frame well-formed when `data` is empty;
    # otherwise the degree filter below fails on a column-less frame.
    degrees_df = pd.DataFrame(
        [
            {"Title": title, "degree": res["value"], "count": res["count"]}
            for res in data
        ],
        columns=["Title", "degree", "count"],
    )
    return degrees_df[~degrees_df["degree"].isin(["-", "diploma"])]

<IPython.core.display.Javascript object>

In [None]:
degree_df = pd.concat(
    [get_degree_count_per_position(title) for title in titles]
).reset_index(drop=True)

<IPython.core.display.Javascript object>

In [None]:
degree_df.to_csv("raw_degrees.csv", index=False)

<IPython.core.display.Javascript object>

In [None]:
degree_df["Ratio to All Degrees"] = degree_df["count"] / max(degree_df["count"])

<IPython.core.display.Javascript object>

In [None]:
# Shorten the verbose degree labels; labels not listed here are left as they are.
degree_df["degree"] = degree_df["degree"].replace(
    to_replace={
        "bachelor's (4 year program)": "bachelor's",
        "master's (6 year program)": "master's",
        "phd or other doctorate": "phd",
        "associate's (2 year program)": "associate's",
        "high school or equivalent": "high school",
        "certificate/license": "certificate",
    }
)

<IPython.core.display.Javascript object>

In [None]:
degree_df.head(5)

Unnamed: 0,Title,degree,count,Ratio to All Degrees
0,data scientist,bachelor's,130696,0.302865
1,data scientist,master's,115193,0.26694
2,data scientist,phd,30825,0.071432
3,data scientist,high school,28612,0.066303
4,data scientist,certificate,8375,0.019408


<IPython.core.display.Javascript object>

In [None]:
degree_df.to_csv("degrees.csv", index=False)

<IPython.core.display.Javascript object>

In [None]:
embed(
    "@khuyentran1401/degrees-between-4-different-jobs",
    cells=["chart", "viewof options"],
)

<IPython.core.display.Javascript object>

In [None]:
embed(
    "@khuyentran1401/grouped-bar-chart-number-of-degrees-of-different-positions",
    cells=["chart"],
)

<IPython.core.display.Javascript object>

# Analyze Gender

In [None]:
def get_gender_count_per_position(title: str):
    """Return the gender facet counts for ``title`` as a DataFrame.

    The query asks for two facets; each becomes a row with columns ``Title``,
    ``gender`` (the facet's ``value``) and ``count``.
    """
    dql = f'type:Person employments.title:"{title}" facet:gender.normalizedValue'
    facets = download_file(dql, size=2)
    totals = [facet["count"] for facet in facets]
    labels = [facet["value"] for facet in facets]
    rows = []
    for label, total in zip(labels, totals):
        rows.append({"Title": title, "gender": label, "count": total})
    return pd.DataFrame(rows)


<IPython.core.display.Javascript object>

In [None]:
genders = [get_gender_count_per_position(title) for title in titles]
gender_df = pd.concat(genders)

<IPython.core.display.Javascript object>

In [None]:
gender_df.to_csv("genders.csv", index=False)

<IPython.core.display.Javascript object>

In [None]:
embed(
    "@khuyentran1401/grouped-bar-chart-gender-of-different-positions",
    cells=["chart"],
)

<IPython.core.display.Javascript object>

# Analyze Locations

## Plot Top US States

In [None]:
# Download the state-name mapping used by get_jobs_US_states.
# NOTE(review): presumably maps full US state names to short codes -- confirm
# against the gist.
r = requests.get(
    "https://gist.githubusercontent.com/khuyentran1401/bbfb8105227bb9a9c2bb21aa53a999c1/raw/efc76b261daa54b96f18299b039b224587dcf38e/state_name_map.json",
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    timeout=30,
)
# Fail here on an HTTP error rather than on a JSON decoding error below.
r.raise_for_status()
# Lower-case the keys so they can be matched against lower-case names.
states_short = {k.lower(): v for k, v in r.json().items()}

<IPython.core.display.Javascript object>

In [None]:
def get_jobs_US_states(title: str):
    """Return per-state counts of people holding ``title`` in the United States.

    Args:
        title: Job title inserted into the ``employments.title`` filter.

    Returns:
        A DataFrame with columns ``state`` (the ``states_short`` entry for the
        facet's region name) and ``count``. Facets whose region name is not a
        key of ``states_short`` are skipped. The frame keeps its columns even
        when no facet qualifies.
    """
    data = download_file(
        f'type:Person employments.title:"{title}" '
        + 'facet:locations.{region.name country.name:"United States of America"}',
        size=100,
    )
    # Filter whole facet records so each count stays attached to its own
    # region. Filtering the names alone and then zipping with the unfiltered
    # counts shifts counts onto the wrong states once any region is dropped.
    return pd.DataFrame(
        [
            {"state": states_short[res["value"]], "count": res["count"]}
            for res in data
            if res["value"] in states_short
        ],
        columns=["state", "count"],
    )


<IPython.core.display.Javascript object>

In [None]:
state_jobs = get_jobs_US_states("data scientist")
state_jobs.to_csv("state_jobs.csv", index=False)

<IPython.core.display.Javascript object>

In [None]:
# Draw a choropleth of the `count` column of `state_jobs` over the US state
# outlines from folium's example data.
# NOTE(review): `folium` is not imported in any visible cell -- add
# `import folium` before running this cell.
url = (
    "https://raw.githubusercontent.com/python-visualization/folium/master/examples/data"
)
state_geo = f"{url}/us-states.json"

m = folium.Map(location=[48, -102], zoom_start=3)

# NOTE(review): key_on="feature.id" presumably joins the `state` column to each
# GeoJSON feature's id -- confirm the ids are the same short codes.
folium.Choropleth(
    geo_data=state_geo,
    name="choropleth",
    data=state_jobs,
    columns=["state", "count"],
    key_on="feature.id",
    fill_color="YlGn",
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name="Number of Data Science Jobs",
).add_to(m)

folium.LayerControl().add_to(m)

# Leave the map as the cell's last expression so the notebook displays it.
m

<IPython.core.display.Javascript object>

## Top 25 Cities

In [None]:
def get_locations(title: str, groupby: str):
    """Return location facet counts for ``title`` as a DataFrame.

    ``groupby`` is appended verbatim to the query after the title filter. Each
    returned facet becomes a row with columns ``location`` (the facet's
    ``value``) and ``count``.
    """
    dql = f'type:Person employments.title:"{title}" {groupby}'
    facets = download_file(dql)
    totals = [facet["count"] for facet in facets]
    places = [facet["value"] for facet in facets]
    rows = []
    for place, total in zip(places, totals):
        rows.append({"location": place, "count": total})
    return pd.DataFrame(rows)


def plot_top_25_locations(title: str, locations: dict):
    """Return a bar chart of the frame stored under ``title`` in ``locations``.

    The frame's ``location`` column is plotted on the x-axis and ``count`` on
    the y-axis.
    """
    return px.bar(data_frame=locations[title], x="location", y="count")

<IPython.core.display.Javascript object>

In [None]:
cities = {
    title: pd.DataFrame(get_locations(title, "facet:locations.city.name"))
    for title in titles
}

<IPython.core.display.Javascript object>

In [None]:
# Use a context manager so the pickle file is closed as soon as the dump
# finishes, instead of leaving the handle open until garbage collection.
with open("cities.pkl", "wb") as file:
    pickle.dump(cities, file)

<IPython.core.display.Javascript object>

In [None]:
plot_top_25_locations('data scientist', cities)

interactive(children=(Dropdown(description='Role:', options=('data scientist', 'data engineer', 'data analyst'…

<function __main__.plot_top_25_locations(title: str, locations: dict)>

<IPython.core.display.Javascript object>

## Top 25 Countries

In [None]:
countries = {
    title: pd.DataFrame(get_locations(title, "facet:locations.country.name"))
    for title in titles
}

<IPython.core.display.Javascript object>

In [None]:
# Use a context manager so the pickle file is closed as soon as the dump
# finishes, instead of leaving the handle open until garbage collection.
with open("countries.pkl", "wb") as file:
    pickle.dump(countries, file)

<IPython.core.display.Javascript object>

In [None]:
plot_top_25_locations('data scientist', countries)

interactive(children=(Dropdown(description='Role:', options=('data scientist', 'data engineer', 'data analyst'…

<function __main__.plot_top_25_locations(title: str, locations: dict)>

<IPython.core.display.Javascript object>

# Analyze Titles

In [None]:
def get_titles_per_position(title: str):
    """Return the title facets for ``title`` whose value contains ``title``.

    The query asks for 200 facets. Each kept facet becomes a dict whose
    ``name`` is the facet's ``value`` and whose ``value`` is its ``count``.
    """
    dql = f'type:Person employments.title:"{title}" facet:employments.title'
    facets = download_file(dql, size=200)

    # Keep only the facets whose title text contains the searched title.
    matching = [facet for facet in facets if title in facet["value"]]
    totals = [facet["count"] for facet in matching]
    labels = [facet["value"] for facet in matching]
    return [{"name": label, "value": total} for label, total in zip(labels, totals)]


def plot_top_titles(title: str, top_titles: dict):
    """Return a bar chart of the frame stored under ``title`` in ``top_titles``.

    The frame's ``name`` column is plotted on the x-axis and ``value`` on the
    y-axis.
    """
    return px.bar(data_frame=top_titles[title], x="name", y="value")

<IPython.core.display.Javascript object>

In [None]:
top_titles = {title: pd.DataFrame(get_titles_per_position(title)) for title in titles}

<IPython.core.display.Javascript object>

In [None]:
# Use a context manager so the pickle file is closed as soon as the dump
# finishes, instead of leaving the handle open until garbage collection.
with open("top_titles.pkl", "wb") as file:
    pickle.dump(top_titles, file)

<IPython.core.display.Javascript object>

In [None]:
plot_top_titles('data scientist', top_titles)

interactive(children=(Dropdown(description='Role:', options=('data scientist', 'data engineer', 'data analyst'…

<function __main__.plot_top_titles(title: str, top_titles: dict)>

<IPython.core.display.Javascript object>

In [None]:
# Nested {"name", "children"} hierarchy: the root "title types" node holds one
# child per title, and each title's children are the records of its frame.
title_types_all = {
    "name": "title types",
    "children": [
        {"name": role, "children": top_titles[role].to_dict("records")}
        for role in titles
    ],
}

title_types = title_types_all["children"]

<IPython.core.display.Javascript object>

In [None]:
with open("titles.json", "w") as file:
    json.dump(title_types_all, file, indent=4)

<IPython.core.display.Javascript object>

In [None]:
embed("@khuyentran1401/titles-of-different-data-related-roles", cells=["chart"])

<IPython.core.display.Javascript object>

# Analyze Trend

## Count of Positions Over Time

In [None]:
from datetime import datetime


def get_num_position_per_year(title: str, year: int):
    """Return the hit count for people holding ``title`` during ``year``.

    The employment filter requires a start before ``year``-12-31 and either an
    end after ``year``-01-01 or a current position. The result is a dict with
    ``date`` (January 1st of ``year``), ``name`` (``title``) and ``value``
    (the response's ``hits`` entry).
    """
    employment_filter = (
        f'title:"{title}"from<"{year}-12-31" or(to>"{year}-01-01", isCurrent:true)'
    )
    dql = "type:Person employments.{" + employment_filter + "}"
    # size=0 with extract_data=False: only the hit total of the full response is read.
    response = download_file(dql, size=0, extract_data=False)
    hits = response["hits"]
    return {"date": datetime(year, 1, 1), "name": title, "value": hits}

<IPython.core.display.Javascript object>

In [None]:
# Job titles to track over time; this re-declares the same six titles as the
# `titles` list defined earlier in the notebook.
titles = [
    "data scientist",
    "data engineer",
    "data analyst",
    "machine learning engineer",
    "statistician",
    "data entry",
]
# Years 1988 through 2021 inclusive (range's upper bound is exclusive).
years = list(range(1988, 2022))

<IPython.core.display.Javascript object>

In [None]:
dates = [get_num_position_per_year(title, year) for title in titles for year in years]

<IPython.core.display.Javascript object>

In [None]:
dates_df = pd.DataFrame(dates)

<IPython.core.display.Javascript object>

In [None]:
dates_df.sample(10)

Unnamed: 0,date,name,value
58,2012-01-01,data engineer,9941
128,2014-01-01,machine learning engineer,675
94,2014-01-01,data analyst,98123
21,2009-01-01,data scientist,2156
13,2001-01-01,data scientist,509
67,2021-01-01,data engineer,40559
135,2021-01-01,machine learning engineer,10422
96,2016-01-01,data analyst,113320
202,2020-01-01,data entry,41668
0,1988-01-01,data scientist,40


<IPython.core.display.Javascript object>

In [None]:
dates_df.to_csv("dates.csv")

<IPython.core.display.Javascript object>

In [None]:
embed(
    "@khuyentran1401/number-of-data-related-positions-over-time",
    cells=["viewof replay", "chart"],
)

<IPython.core.display.Javascript object>

In [None]:
px.line(dates_df, x="date", y="value", color="name")

<IPython.core.display.Javascript object>

## Count of Skills Over Time

In [None]:
def get_num_skill_per_year(title: str, year: int):
    """Return skill facet counts for people holding ``title`` during ``year``.

    The employment filter requires a start before ``year``-12-31 and either an
    end after ``year``-01-01 or a current position. The result is a DataFrame
    with columns ``date`` (January 1st of ``year`` on every row), ``name``
    (the facet's ``value``) and ``value`` (the facet's ``count``).
    """
    employment_filter = (
        f'title:"{title}"from<"{year}-12-31" or(to>"{year}-01-01", isCurrent:true)'
    )
    dql = "type:Person employments.{" + employment_filter + "} facet:skills.name"
    facets = download_file(dql)
    totals = [facet["count"] for facet in facets]
    labels = [facet["value"] for facet in facets]
    stamp = datetime(year, 1, 1)
    rows = [(stamp, label, total) for label, total in zip(labels, totals)]
    return pd.DataFrame(rows, columns=["date", "name", "value"])

<IPython.core.display.Javascript object>

In [None]:
dfs = [get_num_skill_per_year("data scientist", year) for year in years]

<IPython.core.display.Javascript object>

In [None]:
skills_df = pd.concat(dfs)

<IPython.core.display.Javascript object>

In [None]:
# Fold the counts recorded under the misspelled skill "phython" into "python":
# left-join the "python" rows with the "phython" rows on date, fill the gaps
# left by dates without a "phython" row with 0, and add the two value columns.
# The result is an array with one total per "python" row.
python_values = (
    skills_df[skills_df.name == "python"]
    .merge(skills_df[skills_df.name == "phython"], on="date", how="left")
    .fillna(0)
    .assign(total_value=lambda df_: df_.value_x + df_.value_y)["total_value"]
    .values
)

<IPython.core.display.Javascript object>

In [None]:
skills_df.loc[skills_df.name == "python", "value"] = python_values

<IPython.core.display.Javascript object>

In [None]:
skills_df = skills_df[skills_df.name != "phython"]

<IPython.core.display.Javascript object>

In [None]:
skills_df.head(10)

Unnamed: 0,date,name,value
0,1988-01-01,economics,34.0
1,1988-01-01,teaching,32.0
2,1988-01-01,management,30.0
3,1988-01-01,software development,16.0
4,1988-01-01,mathematics,12.0
5,1988-01-01,programming language,11.0
6,1988-01-01,data analysis,9.0
7,1988-01-01,program management,9.0
8,1988-01-01,project management,9.0
9,1988-01-01,business intelligence,8.0


<IPython.core.display.Javascript object>

In [None]:
skills_df.to_csv("skills_over_time.csv")

<IPython.core.display.Javascript object>

In [None]:
embed(
    "@khuyentran1401/number-of-data-scientists-skills-over-time",
    cells=["viewof replay", "chart"],
)

<IPython.core.display.Javascript object>

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=87197226-98be-42b2-8527-389082831299' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>