# Notebook 4: Download Occupations and Career Outlook Data from the O*NET API and Produce Nodes and Relations:

```
(OCCUPATION) NODE						occupation__node.csv
occupation_id:ID
occupation_title
onet_code
occupation_synonyms[]
occupation_description
occupation_salary
:LABEL = "OCCUPATION"

[HAS_FUTURE] RELATION					has_future__relation.csv
:START_ID = occupation_id
:END_ID = career_outlook_id
:TYPE = "HAS_FUTURE"

(CAREER_OUTLOOK) NODE					career_outlook__node.csv
career_outlook_id:ID
career_outlook
:LABEL = "CAREER_OUTLOOK"
```

## Imports

In [84]:
%pip install xmltodict

import json
import os

import pandas as pd
import requests
import xmltodict

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [85]:
# Resolve the dataset folders for both supported environments:
#   * local run   -> folders are relative to the current working directory
#   * Google Colab -> folders live inside the mounted Google Drive project

mydrive = ""  # this is when we run locally

try:
    # google.colab can only be imported inside a Colab runtime; importing it
    # unconditionally made this cell fail on every local run.
    from google.colab import drive
except ImportError:
    pass  # local run: keep the relative prefix set above
else:
    drive.mount('/content/drive')
    # Enable the line matching the Drive layout of whoever runs the notebook.
    # mydrive = "/content/drive/MyDrive/DSE 203 — etl/DSE203_Project/"  # this is when we run on COLAB Leslie
    mydrive = "/content/drive/MyDrive/DSE203/DSE203_Project/"  # this is when we run on COLAB Jessica
    # mydrive = "/content/drive/MyDrive/DSE203_Project/"  # this is when we run on COLAB Sergey

input_dir = mydrive+"input_datasets/"
output_dir = mydrive+"output_datasets/"
temp_dir = mydrive+"temp_datasets/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Download Data from ONET API

In [86]:
# Download the list of occupations (code + title) from the O*NET web service
# and load it into `occupation_df` with columns `job_code` and `occupation_title`.

# FIXME(security): credentials should not live in the notebook. They can now be
# supplied through ONET_USERNAME / ONET_PASSWORD; the literal defaults are kept
# only so existing runs keep working and should be removed once rotated.
onet_auth = (
    os.environ.get('ONET_USERNAME', 'ucsd'),
    os.environ.get('ONET_PASSWORD', '2835jxp'),
)

# The request asks for entries 1..1020; occupations past that window would not
# be returned by this call.
response = requests.get(
    'https://services.onetcenter.org/ws/online/occupations?start=1&end=1020',
    auth=onet_auth,
    timeout=60,  # do not hang the notebook forever on a stalled connection
)
# Fail here with a clear HTTP error instead of a confusing KeyError further down
# when the service answers with an error document.
response.raise_for_status()

# force_list guarantees a list even if the service returns a single <occupation>
# element (xmltodict would otherwise hand back a bare dict).
job_dict = xmltodict.parse(response.content, force_list=('occupation',))
occupations = job_dict['occupations']['occupation']

job_codes = [occupation['code'] for occupation in occupations]
job_titles = [occupation['title'] for occupation in occupations]

#dataframe of jobs
occupation_df = pd.DataFrame({'job_code': job_codes,
                              'occupation_title': job_titles})


In [87]:
# Show the last five occupations as a quick check that the download completed.
occupation_df.iloc[-5:]

Unnamed: 0,job_code,occupation_title
1011,51-7099.00,"Woodworkers, All Other"
1012,51-7042.00,"Woodworking Machine Setters, Operators, and Te..."
1013,43-9022.00,Word Processors and Typists
1014,27-3043.00,Writers and Authors
1015,19-1023.00,Zoologists and Wildlife Biologists


In [88]:
# For every occupation code, query two O*NET endpoints and collect, in the same
# order as `occupation_df['job_code']`:
#   * occupation_synonyms    - sample of reported job titles (list) or ''
#   * occupation_description - description text or ''
#   * occupation_growth      - job outlook category or ''
#   * occupation_salary      - annual median salary or ''
# A missing element anywhere in a response yields '' for that field, so one
# incomplete record can no longer abort the whole download with a KeyError.


def _dig(node, *keys, default=''):
    """Follow `keys` through nested dicts.

    Returns the value found at the end of the path, or `default` as soon as a
    level is not a dict or does not contain the next key.
    """
    for key in keys:
        if not isinstance(node, dict) or key not in node:
            return default
        node = node[key]
    return node


# FIXME(security): credentials should not live in the notebook. They can now be
# supplied through ONET_USERNAME / ONET_PASSWORD; the literal defaults are kept
# only so existing runs keep working and should be removed once rotated.
onet_auth = (
    os.environ.get('ONET_USERNAME', 'ucsd'),
    os.environ.get('ONET_PASSWORD', '2835jxp'),
)

occupation_synonyms = []
occupation_description = []
occupation_growth = []
occupation_salary = []

# One session for all requests: two calls per occupation, so reusing the
# connection avoids a fresh handshake for each of them.
with requests.Session() as onet_session:
    onet_session.auth = onet_auth

    for job_code in occupation_df['job_code']:

        career_response = onet_session.get(
            f'https://services.onetcenter.org/ws/online/occupations/{job_code}/summary',
            timeout=60,
        )
        career_dict = xmltodict.parse(career_response.content)

        titles = _dig(career_dict, 'summary_report', 'occupation',
                      'sample_of_reported_job_titles', 'title')
        # xmltodict yields a bare string when there is exactly one <title>;
        # wrap it so the column holds lists whenever titles are present.
        if isinstance(titles, str) and titles:
            titles = [titles]
        occupation_synonyms.append(titles)

        occupation_description.append(
            _dig(career_dict, 'summary_report', 'occupation', 'description')
        )

        report_response = onet_session.get(
            f'https://services.onetcenter.org/ws/mnm/careers/{job_code}/report',
            timeout=60,
        )
        report_dict = xmltodict.parse(report_response.content)

        occupation_growth.append(
            _dig(report_dict, 'report', 'job_outlook', 'outlook', 'category')
        )
        occupation_salary.append(
            _dig(report_dict, 'report', 'job_outlook', 'salary', 'annual_median')
        )


In [89]:
# Attach the downloaded detail columns and add a positional id column
# `occupation_id:ID` (0..n-1) as the first column.
#
# The frame is rebuilt from its two base columns instead of being mutated with
# reset_index(inplace=True): re-running the original cell inserted a second
# `occupation_id:ID` column, whereas this version gives the same result each time.
occupation_df = (
    occupation_df[['job_code', 'occupation_title']]
    .assign(
        occupation_synonyms=occupation_synonyms,
        occupation_description=occupation_description,
        occupation_growth=occupation_growth,
        occupation_salary=occupation_salary,
    )
    .reset_index(drop=True)
    .rename_axis('occupation_id:ID')
    .reset_index()
)
occupation_df.head(5)

Unnamed: 0,occupation_id:ID,job_code,occupation_title,occupation_synonyms,occupation_description,occupation_growth,occupation_salary
0,0,13-2011.00,Accountants and Auditors,"[Accountant, Accounting Officer, Audit Partner...","Examine, analyze, and interpret accounting rec...",Bright,77250.0
1,1,27-2011.00,Actors,"[Actor, Actress, Comedian, Comic, Community Th...","Play parts in stage, television, radio, video,...",Bright,
2,2,15-2011.00,Actuaries,"[Actuarial Analyst, Actuarial Associate, Actua...","Analyze statistical data, such as mortality, a...",Bright,105900.0
3,3,29-1291.00,Acupuncturists,"[Acupuncture Physician, Acupuncture Provider, ...","Diagnose, treat, and prevent disorders by stim...",Average,60570.0
4,4,29-1141.01,Acute Care Nurses,"[Cardiac Interventional Care Nurse, Charge Nur...",Provide advanced nursing care for patients wit...,Bright,77600.0


## Save to Files

In [90]:
## Create job outlook nodes
# One node per distinct, non-empty outlook category found in the occupations.
# The id is the position the category had among the unique values (before the
# empty entry is filtered out), stored in `career_outlook_id:ID`.
# NOTE(review): these ids start at 0 just like the occupation ids - confirm the
# graph import keeps the two id ranges apart.
career_outlook = occupation_df['occupation_growth'].unique()
career_outlook_df = pd.DataFrame({'career_outlook': career_outlook}).dropna()
has_category = career_outlook_df['career_outlook'] != ''
career_outlook_df = (
    career_outlook_df[has_category]
    .reset_index()
    .rename(columns={'index': 'career_outlook_id:ID'})
    .assign(**{':LABEL': 'CAREER_OUTLOOK'})
)

In [91]:
# Write the CAREER_OUTLOOK nodes to the output folder, then preview the last rows.
career_outlook_path = output_dir + 'career_outlook__node.csv'
career_outlook_df.to_csv(career_outlook_path, index=False)
career_outlook_df.tail(3)

Unnamed: 0,career_outlook_id:ID,career_outlook,:LABEL
0,0,Bright,CAREER_OUTLOOK
1,1,Average,CAREER_OUTLOOK
2,2,Below Average,CAREER_OUTLOOK


In [92]:
## create has_future relation
career_outlook_df = career_outlook_df.rename(columns={ "career_outlook": "occupation_growth"})
has_future_df = pd.merge(occupation_df, career_outlook_df, on='occupation_growth', how='inner')
has_future_df[':TYPE'] = 'HAS_FUTURE'
has_future_df = has_future_df[['occupation_id:ID', 'career_outlook_id:ID', ':TYPE']]
has_future_df = has_future_df.rename(columns={ 'occupation_id:ID': ":START_ID", 'career_outlook_id:ID': ":END_ID"})
# has_future_df.drop(has_future_df.columns[1], axis=1, inplace=True)
has_future_df.head(5)

Unnamed: 0,:START_ID,:END_ID,:TYPE
0,0,0,HAS_FUTURE
1,1,0,HAS_FUTURE
2,2,0,HAS_FUTURE
3,4,0,HAS_FUTURE
4,10,0,HAS_FUTURE


In [93]:
# Write the HAS_FUTURE relations to the output folder, then preview two rows.
has_future_path = output_dir + 'has_future__relation.csv'
has_future_df.to_csv(has_future_path, index=False)
has_future_df.head(2)

Unnamed: 0,:START_ID,:END_ID,:TYPE
0,0,0,HAS_FUTURE
1,1,0,HAS_FUTURE


In [94]:
# Shape the occupation frame into its node form: drop the growth column (it is
# carried by the HAS_FUTURE relation), rename `job_code` to `onet_code`, and
# tag every row with the OCCUPATION label.
occupation_df = (
    occupation_df
    .drop(columns=['occupation_growth'], errors='ignore')
    .rename(columns={'job_code': 'onet_code'})
    .assign(**{':LABEL': 'OCCUPATION'})
)
occupation_df.tail(5)

Unnamed: 0,occupation_id:ID,onet_code,occupation_title,occupation_synonyms,occupation_description,occupation_salary,:LABEL
1011,1011,51-7099.00,"Woodworkers, All Other",,All woodworkers not listed separately.,,OCCUPATION
1012,1012,51-7042.00,"Woodworking Machine Setters, Operators, and Te...","[Boring Machine Operator, Cabinet Maker, Knot ...","Set up, operate, or tend woodworking machines,...",36090.0,OCCUPATION
1013,1013,43-9022.00,Word Processors and Typists,"[Clerk Specialist, Clerk Typist, Keyboard Spec...","Use word processor, computer, or typewriter to...",44030.0,OCCUPATION
1014,1014,27-3043.00,Writers and Authors,[Advertisement Agency Copywriter (Ad Agency Co...,"Originate and prepare written material, such a...",69510.0,OCCUPATION
1015,1015,19-1023.00,Zoologists and Wildlife Biologists,"[Aquatic Biologist, Conservation Resources Man...","Study the origins, behavior, diseases, genetic...",64650.0,OCCUPATION


In [95]:
# Write the OCCUPATION nodes to the output folder.
occupation_node_path = output_dir + 'occupation__node.csv'
occupation_df.to_csv(occupation_node_path, index=False)