# Notebook 5: Match Occupations and Listings and Produce Relations between them:

```
[BELONGS_TO] RELATION					belongs_to__relation.csv
:START_ID = listing_id
:END_ID =  occupation_id
:TYPE = "BELONGS_TO"
```

## Imports

In [1]:
# %pip install py_stringmatching
# %pip install xmltodict

import pandas as pd
import numpy as np
import xmltodict
import py_stringmatching as sm
import requests

In [2]:
# this cell is to support running the notebook in Google Colab 

mydrive = ""  # this is when we run locally

# Google Colab:
# from google.colab import drive
# drive.mount('/content/drive')
# mydrive = "/content/drive/MyDrive/DSE 203 — etl/DSE203_Project/"  # this is when we run on COLAB Leslie
# mydrive = "/content/drive/MyDrive/DSE203/DSE203_Project/"  # this is when we run on COLAB Jessica
# mydrive = "/content/drive/MyDrive/DSE203_Project/"  # this is when we run on COLAB Sergey

input_dir = mydrive+"input_datasets/"
output_dir = mydrive+"output_datasets/"
temp_dir = mydrive+"temp_datasets/"

## Read Data

In [3]:
## Create dataframes for skills and listings
needs_df = pd.read_csv(output_dir+'needs__relation.csv')
needs_df = needs_df.rename(columns={":START_ID": "listing_id", ":END_ID": "skill_id"})
skill_df = pd.read_csv(output_dir+'skill__node.csv')
skill_df = skill_df.rename(columns={"skill_id:ID": "skill_id"})
skill_df = skill_df.merge(needs_df, on='skill_id', how='left')
skill_df = skill_df[['listing_id', 'skill_name']]
listing_df = pd.read_csv(output_dir+'listing__node.csv')
listing_df = listing_df.rename(columns={"listing_id:ID": "listing_id"})

occupation_df =  pd.read_csv(output_dir+'occupation__node.csv')
occupation_df = occupation_df.rename(columns={"occupation_id:ID": 'occupation_id'})
occupation_df.tail(3)
print(skill_df.tail(2))
print(listing_df.tail(2))
occupation_df.tail(3)

     listing_id                               skill_name
180          13                         pdf resume pdfwe
181          13  skills, knowledge and abilities handson
    listing_id             listing_title  \
12          12  LMS (Saba) Administrator   
13          13       Front End Developer   

                                          description   :LABEL  
12  LMS SABA ADMINISTRATOR  Waltham, MAKelly Servi...  LISTING  
13  Job Title Front End DeveloperPosition SummaryT...  LISTING  


Unnamed: 0,occupation_id,onet_code,occupation_title,occupation_synonyms,occupation_description,occupation_salary,:LABEL
1013,1013,43-9022.00,Word Processors and Typists,"['Clerk Specialist', 'Clerk Typist', 'Keyboard...","Use word processor, computer, or typewriter to...",44030.0,OCCUPATION
1014,1014,27-3043.00,Writers and Authors,['Advertisement Agency Copywriter (Ad Agency C...,"Originate and prepare written material, such a...",69510.0,OCCUPATION
1015,1015,19-1023.00,Zoologists and Wildlife Biologists,"['Aquatic Biologist', 'Conservation Resources ...","Study the origins, behavior, diseases, genetic...",64650.0,OCCUPATION


## Preprocess before Matching

In [4]:
#explode the occupation dataframe in a way that all synonyms and titles are used as occupation_titles

# Grab all main job titles and job codes
# Some titles contain multiple titles so they are split by the comma
occupation_part_one = occupation_df[['occupation_id', 'occupation_title']]
occupation_part_one['occupation_title'] = occupation_part_one['occupation_title'].str.replace("and", ',')
occupation_part_one['occupation_title'] = occupation_part_one['occupation_title'].apply(lambda x: str(x).split(','))
occupation_part_one = occupation_part_one.explode('occupation_title')
occupation_part_one['occupation_title'] = occupation_part_one['occupation_title'].str.lower()
occupation_part_one.tail(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,occupation_id,occupation_title
1014,1014,authors
1015,1015,zoologists
1015,1015,wildlife biologists


## Matching and Saving to Output Files

In [5]:
# Match each synonym to respective job code
occupation_part_two = occupation_df[['occupation_id', 'occupation_synonyms']]
occupation_part_two['occupation_synonyms'] = occupation_part_two['occupation_synonyms'].str.replace("[", '')
occupation_part_two['occupation_synonyms'] = occupation_part_two['occupation_synonyms'].str.replace("]", '')
occupation_part_two['occupation_synonyms'] = occupation_part_two['occupation_synonyms'].str.replace("'", '')
occupation_part_two['occupation_synonyms'] = occupation_part_two['occupation_synonyms'].str.lower()
occupation_part_two['occupation_synonyms'] = occupation_part_two['occupation_synonyms'].apply(lambda x: str(x).split(','))
occupation_part_two = occupation_part_two.explode('occupation_synonyms')
occupation_part_two = occupation_part_two.rename(columns={"occupation_synonyms": "occupation_title"})

## Combine for df used for matching 
all_occupation_names_df = pd.concat([occupation_part_one, occupation_part_two], ignore_index=True)
all_occupation_names_df = all_occupation_names_df[all_occupation_names_df['occupation_title'] != ' ']
all_occupation_names_df.tail(3)

  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.htm

Unnamed: 0,occupation_id,occupation_title
9748,1015,migratory game bird biologist
9749,1015,wildlife biologist
9750,1015,zoologist


In [6]:
listing_clean = listing_df[['listing_id', 'listing_title']]
listing_clean['listing_title'] = listing_clean['listing_title'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [7]:
def jaccard_n_gram(df, test_string):
    """ function for calculating the jaccard similarity between the column 
    labeled 3_gram_string in the dataframe and the string passed as an argument"""
    jaccard_test_string = tok.tokenize(test_string.lower())
    jac = sm.Jaccard()
    df['jaccard_3_gram_score'] = df.apply(lambda row: jac.get_sim_score(row['3_gram_string'], jaccard_test_string), axis=1)
    max_id = df['jaccard_3_gram_score'].idxmax()
    return df.iloc[[max_id]]


all_occupation_names_df = pd.DataFrame(data = all_occupation_names_df[['occupation_id', 'occupation_title']]).reset_index()
tok = sm.QgramTokenizer(qval=3)
all_occupation_names_df['3_gram_string'] = all_occupation_names_df.apply(lambda row: tok.tokenize(row['occupation_title']), axis=1)


occupation_best_guess_df = pd.DataFrame( data = {'occupation_id': [],
                                        'occupation_title': [], 
                                        'listing_title': [],
                                        'listing_id': [],
                                        'jaccard_3_gram_score': []})


#Loop through every listing title and best match it to the exploded list of occupation titles
for i in range(len(listing_clean)):
    listing_title = listing_clean['listing_title'].iloc[i]
    listing_id = listing_clean['listing_id'].iloc[i]   
    highest_jaccard_score = jaccard_n_gram(all_occupation_names_df, listing_title)
    append_df = pd.DataFrame( data = {'occupation_id': [highest_jaccard_score['occupation_id'].iloc[0]],
                                          'occupation_title': [highest_jaccard_score['occupation_title'].iloc[0]], 
                                          'listing_title': [listing_title],
                                          'listing_id': [listing_id],
                                          'jaccard_3_gram_score': [highest_jaccard_score['jaccard_3_gram_score'].iloc[0]]})
    occupation_best_guess_df = pd.concat([occupation_best_guess_df, append_df], ignore_index=True)

occupation_best_guess_df.tail(3)

Unnamed: 0,occupation_id,occupation_title,listing_title,listing_id,jaccard_3_gram_score
11,117.0,data analyst,mdm data analyst,11.0,0.65
12,8.0,administrator,lms (saba) administrator,12.0,0.5
13,889.0,developer,front end developer,13.0,0.434783


In [8]:
##for the occupation_ids with a jaccard_3_gram_score <= .1, use the cleaned skills to request from ONET API the
##most matching the occupation title

good_guesses_df = occupation_best_guess_df[occupation_best_guess_df.jaccard_3_gram_score > .3]
bad_guesses_df = occupation_best_guess_df[occupation_best_guess_df.jaccard_3_gram_score <= .3]

for guess in range(len(bad_guesses_df)):
  listing_skills = skill_df[skill_df.listing_id == bad_guesses_df['listing_id'].iloc[guess]]
  listing_skills_string = ''
  for skill in listing_skills['skill_name']:
    listing_skills_string += ' '
    listing_skills_string += skill
  response = requests.get(f'https://services.onetcenter.org/ws/online/search?keyword={listing_skills_string}', auth=('ucsd', '2835jxp'))
  new_guess = xmltodict.parse(response.content)

  if 'occupation' in new_guess['occupations']:
    for index in new_guess['occupations']['occupation']:
      if isinstance(index, dict):
        onet_code = new_guess['occupations']['occupation'][0]['code']
        if len(occupation_df[occupation_df.onet_code == onet_code]['occupation_title']) > 0:
          new_occupation_title_guess = occupation_df[occupation_df.onet_code == onet_code]['occupation_title'].iloc[0]
          new_occupation_id_guess = occupation_df[occupation_df.onet_code == onet_code]['occupation_id'].iloc[0]
          bad_guesses_df['occupation_title'].iloc[guess] = new_occupation_title_guess
          bad_guesses_df['occupation_id'].iloc[guess] = new_occupation_id_guess

bad_guesses_df.tail(10)

belongs_to_df = pd.concat([good_guesses_df, bad_guesses_df]).reset_index()
belongs_to_df = belongs_to_df.rename(columns={"listing_id":"START_ID", "occupation_id": ":END_ID"})
belongs_to_df = belongs_to_df[["START_ID", ":END_ID"]]
belongs_to_df[":TYPE"] = "BELONGS_TO"
belongs_to_df.tail(3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,START_ID,:END_ID,:TYPE
11,13.0,889.0,BELONGS_TO
12,4.0,889.0,BELONGS_TO
13,6.0,660.0,BELONGS_TO


In [9]:
belongs_to_df.to_csv(output_dir+'belongs_to__relation.csv', index=False)