In [None]:
!pip install python-dotenv pydash yarl

Package versions used in this notebook:

* graphistry: 0.20.5
* cugraph   : 21.10.0+0.g84617024.dirty
* cudf      : 21.10.0
* requests  : 2.26.0
* pandas    : 1.3.3

# Download Data

In [12]:
from dotenv import load_dotenv
import os
import graphistry
import cudf
import requests
from pydash import py_
import cudf
import cugraph
from yarl import URL

In [4]:
# Let python-dotenv populate the process environment, then read the Diffbot
# API token from it (the value is None when the variable is not set).
load_dotenv()
DIFFBOT_TOKEN = os.environ.get("DIFFBOT_TOKEN")

In [13]:
def download_file(query: str, size: int = 25, extract_data: bool = True):
    """Run a DQL query against the Diffbot Knowledge Graph endpoint.

    Args:
        query: DQL query text. It is passed as a URL query parameter and is
            encoded by the URL builder, so it may safely contain spaces,
            quotes and reserved characters such as ``&``, ``#`` or ``+``.
        size: Value sent as the ``size`` request parameter.
        extract_data: When True, return only the ``"data"`` entry of the
            decoded JSON response; when False, return the whole response.

    Returns:
        ``response["data"]`` if ``extract_data`` is true, otherwise the full
        decoded JSON body.

    Raises:
        RuntimeError: If ``DIFFBOT_TOKEN`` is empty or unset.
        requests.HTTPError: If the endpoint answers with an error status.
    """
    if not DIFFBOT_TOKEN:
        # Fail early instead of sending the literal text "None" as the token.
        raise RuntimeError(
            "DIFFBOT_TOKEN is not set; define it in the environment or a .env file"
        )

    # Pass the parameters as a mapping so each value is encoded individually;
    # hand-assembling the query string breaks as soon as a value contains
    # characters like "&" or "#".
    url = URL.build(
        scheme="https",
        host="kg.diffbot.com",
        path="/kg/dql_endpoint",
        query={
            "type": "query",
            "token": DIFFBOT_TOKEN,
            "query": query,
            "size": str(size),
        },
    )
    r = requests.get(str(url))
    # Surface HTTP errors explicitly rather than failing later on the payload.
    r.raise_for_status()
    response = r.json()
    return response["data"] if extract_data else response

In [14]:
# Fetch people currently employed with the title "data scientist" who are
# located in the United States of America.
data = download_file(
    "type:Person "
    'employments.{title:"data scientist" isCurrent:true} '
    'locations.country.name:"United States of America"',
    size=28667,
)

In [17]:
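# Optional: keep a local JSON copy of the downloaded records so the Diffbot
# query does not have to be re-run. Uncomment the first `with` block to save
# `data`, or the second one to load it back.

# import json

# with open("data_scientist.json", "w") as f:
#     json.dump(data, f)

# with open("data_scientist.json", "r") as f:
#     data = json.load(f)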

# Process Data

In [19]:
# One name per downloaded person record (None when the record has no name).
names = [py_.get(record, "name") for record in data]

In [188]:
# For every person keep the first current employment that is tagged with the
# "Data Science" category; the entry is None when no employment qualifies.
employments = [
    py_.head(
        [
            job
            for job in py_.filter_(py_.get(person, "employments"), {"isCurrent": True})
            if "Data Science" in py_.map_(py_.get(job, "categories"), "name")
        ]
    )
    for person in data
]

In [200]:
# Category names attached to each selected employment (one list per person).
categories = [
    py_.map_(py_.get(job, "categories"), "name") for job in employments
]

In [202]:
# Peek at the first person's category names to sanity-check the extraction.
categories[0]

['Engineering, IT and Software Development', 'Data Science']

In [203]:
# Show the job title of the first selected employment.
py_.get(employments[0], "title")

'Chief Data Scientist'

In [241]:
# Lower-cased job title per person; missing/empty titles are passed through
# unchanged.
titles = [
    raw_title.lower() if raw_title else raw_title
    for raw_title in (py_.get(job, "title") for job in employments)
]

In [225]:
# Every distinct category name seen across the selected employments.
job_categories = set(py_.flatten(categories))

# Category names that describe a seniority level.
levels = ["Freelance", "Intern", "Senior", "Junior", "Student"]

# Category names that describe a role. Each entry needs its own trailing
# comma: adjacent string literals are silently concatenated, which previously
# merged "President" and "Vice President" into one bogus entry.
roles = [
    "Backend Developer",
    "Board Member",
    "Chief Officer",
    "DevOps Developer",
    "Fullstack Developer or Architect",
    "Director",
    "Executive",
    "Frontend Developer",
    "Games Developer",
    "Leadership",
    "Manager",
    "Mobile Developer",
    "President",
    "Vice President",
    "Management",
    "Shareholder",
    "Founder",
    "CAO",
    "CEO",
    "CFO",
    "CIO",
    "CMO",
    "COO",
    "CTO",
    "Chairman",
]

In [226]:
# Skill names listed on each person record (one list per person).
skills = [py_.map_(py_.get(record, "skills"), "name") for record in data]

In [230]:
# Whatever category is neither a level nor a role is treated as a domain;
# falsy names (e.g. None or "") are discarded.
domain = [
    name
    for name in job_categories
    if name and not (name in levels or name in roles)
]

In [231]:
import pandas as pd

In [244]:
# Assemble the per-person columns into a single table.
df = pd.DataFrame(
    dict(
        person=names,
        skills=skills,
        job_categories=categories,
        titles=titles,
    )
)

In [245]:
# Split each person's category list into level / role / domain columns, then
# discard the combined "job_categories" column and renumber the index.
df = (
    df.assign(
        level=df["job_categories"].apply(
            lambda names_in_row: [name for name in names_in_row if name in levels]
        ),
        role=df["job_categories"].apply(
            lambda names_in_row: [name for name in names_in_row if name in roles]
        ),
        domain=df["job_categories"].apply(
            lambda names_in_row: [name for name in names_in_row if name in domain]
        ),
    )
    .drop(columns=["job_categories"])
    .reset_index(drop=True)
)

In [246]:
# Persist the processed table to "data_scientist.pkl" in the working directory.
df.to_pickle("data_scientist.pkl")