# Notebook 1: Process Listings Dataset 
Input dataset: `dice_com-job_us_sample.csv`  
Source: https://www.kaggle.com/datasets/PromptCloudHQ/us-technology-jobs-on-dicecom
#### This notebook produces the following data into the _output_datasets_ folder:
```
(LISTING) NODE						listing__node.csv
listing_id:ID
listing_title
description
:LABEL = "LISTING"

[NEEDS] RELATION					needs__relation.csv
:START_ID = listing_id
:END_ID = skill_id
:TYPE = "NEEDS"

[LOCATED_IN] RELATION					located_in__relation.csv
:START_ID = listing_id
:END_ID = location_id
:TYPE = "LOCATED_IN"

(LOCATION) NODE						location__node.csv
location_id:ID
location_name
:LABEL = "LOCATION"

[POSTED] RELATION					posted__relation.csv
:START_ID = company_id
:END_ID = listing_id
:TYPE = "POSTED"

(COMPANY) NODE						company__node.csv
company_id:ID
company_name
:LABEL = "COMPANY"
```
#### It also writes intermediate datasets, used by the later Skill Matching steps, into the _temp_datasets_ folder:
```
(LISTING_SKILL) NODE					listing_skills_TEMP.csv
listing_skill_id
listing_skill_name
```



# Setup

In [1]:
# %pip install stanza
# %pip install spacy
# %pip install nltk
# !python -m spacy download en_core_web_sm

from pathlib import Path
import pandas as pd
import numpy as np
import stanza
import spacy
import re

# Fetch the default English stanza resources (the call reports "File exists" when already cached).
stanza.download('en') 
# spaCy small English pipeline; used by extract_entities_spacy below.
nlp_spacy = spacy.load("en_core_web_sm")
# stanza pipeline restricted to tokenization + NER, forced onto CPU; used by extract_entities_stanza below.
# NOTE(review): pos_batch_size is passed although no POS processor is requested — presumably unused; confirm.
nlp_stanza = stanza.Pipeline('en', processors='tokenize, ner', use_gpu=False, pos_batch_size=3000, download_method=None)

  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, 24.2MB/s]
2022-12-10 11:30:30 INFO: Downloading default packages for language: en (English) ...
2022-12-10 11:30:31 INFO: File exists: /Users/sergeygurvich/stanza_resources/en/default.zip
2022-12-10 11:30:35 INFO: Finished downloading models and saved to /Users/sergeygurvich/stanza_resources.
2022-12-10 11:30:36 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2022-12-10 11:30:36 INFO: Use device: cpu
2022-12-10 11:30:36 INFO: Loading: tokenize
2022-12-10 11:30:36 INFO: Loading: ner
2022-12-10 11:30:37 INFO: Done loading processors!


In [2]:
# this cell is to support running the notebook in Google Colab

mydrive = ""  # this is when we run locally

# Google Colab:
# from google.colab import drive
# drive.mount('/content/drive')
# mydrive = "/content/drive/MyDrive/DSE 203 — etl/DSE203_Project/"  # this is when we run on COLAB Leslie
# mydrive = "/content/drive/MyDrive/DSE203_Project/"  # this is when we run on COLAB Sergey


# All dataset folders live under `mydrive` (empty prefix == current working directory).
input_dir = mydrive+"input_datasets/"
output_dir = mydrive+"output_datasets/"
temp_dir = mydrive+"temp_datasets/"

# Create the folders this notebook writes to. They are built from the resolved
# paths above (not bare relative names) so that a non-empty `mydrive` prefix,
# e.g. on Colab, gets its folders created in the right place.
Path(output_dir).mkdir(parents=True, exist_ok=True)
Path(temp_dir).mkdir(parents=True, exist_ok=True)

### Import data

In [3]:
# Load the raw listings CSV from input_dir. Swap which read_csv line is
# commented out to switch between the small test sample and the full dataset.

# SMALL, SAMPLED DATASET FOR TESTING:
# df = pd.read_csv(input_dir+"dice_small.csv")

# FULL DATASET:
df = pd.read_csv(input_dir+"dice_com-job_us_sample.csv")

# Sanity check: column dtypes / non-null counts, then a preview of the first rows.
df.info()
df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   advertiserurl             15 non-null     object
 1   company                   15 non-null     object
 2   employmenttype_jobstatus  15 non-null     object
 3   jobdescription            15 non-null     object
 4   jobid                     15 non-null     object
 5   joblocation_address       15 non-null     object
 6   jobtitle                  15 non-null     object
 7   postdate                  15 non-null     object
 8   shift                     15 non-null     object
 9   site_name                 4 non-null      object
 10  skills                    15 non-null     object
 11  uniq_id                   15 non-null     object
dtypes: object(12)
memory usage: 1.5+ KB


Unnamed: 0,advertiserurl,company,employmenttype_jobstatus,jobdescription,jobid,joblocation_address,jobtitle,postdate,shift,site_name,skills,uniq_id
0,https://www.dice.com/jobs/detail/Salesforce-De...,Naztec International Group LLC,"Full Time, Contract Independent, Contract W2, ...","Location: Indianapolis, INDo not send fake pro...",Dice Id : 10119817,"Indianapolis, IN",Salesforce Developer,4 days ago,Telecommuting not available|Travel not required,,"Salesforce ,Apex, VisualForce, MobileSDK, Sale...",2df1ed967aa23d49a2828ddabe063a0d
1,https://www.dice.com/jobs/detail/Jr.-Support-E...,"Hatstand US, Inc.","Full Time, C2H W2, FTE","Hatstand, a Capital Markets Consultancy, is se...",Dice Id : 10368642,"Jersey City, NJ","Jr. Support Engineer - UNIX/Linux, Shell, Sql",3 weeks ago,Telecommuting not available|Travel not required,,UNIX/ Linux (Bash/Perl/Python/Shell) Scripting...,967c6445a147aca11fbb69db62af5d23
2,https://www.dice.com/jobs/detail/Applications-...,MACRO.CCS,Full Time,We're conducting a search for an Applications ...,Dice Id : macrosea,"Bellevue, WA",Applications Manager,1 week ago,Telecommuting not available|Travel not required,,Ability to analyze existing applications effec...,8c50e3a3fc62089bafa877a2d5f22136


### Set up NER to extract skills

In [4]:
def extract_entities_stanza(series):
    '''
    Run the stanza NER pipeline over `series` and collect the text of every
    entity typed ORG or PRODUCT.

    `series` is handed straight to `nlp_stanza`; callers in this notebook
    pass a single string. Returns a list of unique entity texts (built from
    a set, so the order is not guaranteed).
    '''
    wanted_types = ('ORG', 'PRODUCT')

    unique_texts = set()
    for entity in nlp_stanza(series).entities:
        if entity.type in wanted_types:
            unique_texts.add(entity.text)

    return list(unique_texts)

In [5]:
def extract_entities_spacy(series):
    '''
    Run the spaCy pipeline over `series` (with the "tok2vec" and "parser"
    components disabled for this call) and collect the text of every entity
    labelled ORG or PRODUCT.

    `series` is handed straight to `nlp_spacy`; callers in this notebook
    pass a single string. Returns a list of unique entity texts (built from
    a set, so the order is not guaranteed).
    '''
    wanted_labels = ('ORG', 'PRODUCT')

    parsed = nlp_spacy(series, disable=["tok2vec", "parser"])

    unique_texts = set()
    for span in parsed.ents:
        if span.label_ in wanted_labels:
            unique_texts.add(span.text)

    return list(unique_texts)

# Prep

### Only save columns we want

In [6]:
def basic_cleanup(df: pd.DataFrame):
    '''
    Trim the raw listings frame down to the columns used downstream.

    Steps, in order:
      1. drop the columns we never use,
      2. keep only the first row for each (company, location, title) triple,
      3. rename the surviving columns to the names used for the graph export.

    Returns a new DataFrame; the input is not modified.
    '''
    unused_columns = ['advertiserurl', 'employmenttype_jobstatus', 'jobid', 'uniq_id', 'postdate', 'shift', 'site_name']
    dedup_key = ['company', 'joblocation_address', 'jobtitle']
    new_names = {
        'company': 'company_name',
        'jobdescription': 'description',
        'joblocation_address': 'location_name',
        'jobtitle': 'listing_title',
    }

    trimmed = df.drop(columns=unused_columns)
    unique_rows = trimmed.drop_duplicates(subset=dedup_key)
    return unique_rows.rename(columns=new_names)

In [7]:
# Drop unused columns, de-duplicate listings and rename columns (see basic_cleanup).
df = basic_cleanup(df)

# Inspect the remaining columns and preview the first rows.
df.info()
df.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15 entries, 0 to 14
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   company_name   15 non-null     object
 1   description    15 non-null     object
 2   location_name  15 non-null     object
 3   listing_title  15 non-null     object
 4   skills         15 non-null     object
dtypes: object(5)
memory usage: 720.0+ bytes


Unnamed: 0,company_name,description,location_name,listing_title,skills
0,Naztec International Group LLC,"Location: Indianapolis, INDo not send fake pro...","Indianapolis, IN",Salesforce Developer,"Salesforce ,Apex, VisualForce, MobileSDK, Sale..."
1,"Hatstand US, Inc.","Hatstand, a Capital Markets Consultancy, is se...","Jersey City, NJ","Jr. Support Engineer - UNIX/Linux, Shell, Sql",UNIX/ Linux (Bash/Perl/Python/Shell) Scripting...
2,MACRO.CCS,We're conducting a search for an Applications ...,"Bellevue, WA",Applications Manager,Ability to analyze existing applications effec...


### Clean `skills` as strings

In [8]:
def clean_skills(string):
    '''
    Normalise a raw `skills` string.

    Slashes and ellipses are turned into separators, double spaces become
    ", ", and then every character that is not a letter, digit, comma,
    '?', '!', '+', '#' or space is removed.

    Returns np.nan when the input is not a plain `str`.
    '''
    if type(string) is not str:
        return np.nan

    # Order matters: the spaced variants must be handled before the bare ones.
    substitutions = (
        (' / ', ' '),
        ('/', ' '),
        ('... ', ', '),
        ('...', ', '),
        ('  ', ', '),
    )
    cleaned = string
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)

    return re.sub('[^a-zA-Z0-9,?!+# ]+', '', cleaned)

In [9]:
def process_non_skills(skillcol):
    '''
    Blank out placeholder values in the skills column.

    Entries such as "null", "see job description" or "full time" carry no
    skill information, so they are replaced with an empty string (to be
    filled later with skills extracted from the job description).

    Parameters
    ----------
    skillcol : pd.Series of str (missing values allowed)

    Returns
    -------
    pd.Series
        A new Series with placeholders replaced by ''. Missing values are
        left untouched and the input Series is not modified.
    '''
    # Values that, compared case-insensitively, mean "no skills given".
    exact_placeholders = [
        "null",
        "please see job description",
        "see job description",
        "see job overview",
        "full time",
        "please refer to job description",
    ]

    lowered = skillcol.str.lower()

    # na=False keeps the mask strictly boolean: without it a missing value
    # yields NaN in the mask and boolean indexing raises.
    mask = (
        lowered.isin(exact_placeholders)
        | lowered.str.contains("see below", regex=False, na=False)
        | skillcol.str.startswith("TAD PGS, INC specializes in", na=False)  # known offender of 60+ rows
    )

    # Series.mask returns a new object instead of writing into the caller's column.
    return skillcol.mask(mask, '')

In [10]:
# Normalise each raw skills string, then blank out placeholder entries
# (e.g. "see job description") so they can be back-filled from the description later.
df['skills'] = df['skills'].apply(clean_skills)
df['skills'] = process_non_skills(df['skills'])

# Inspect the result.
df.info()
df.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15 entries, 0 to 14
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   company_name   15 non-null     object
 1   description    15 non-null     object
 2   location_name  15 non-null     object
 3   listing_title  15 non-null     object
 4   skills         15 non-null     object
dtypes: object(5)
memory usage: 720.0+ bytes


Unnamed: 0,company_name,description,location_name,listing_title,skills
0,Naztec International Group LLC,"Location: Indianapolis, INDo not send fake pro...","Indianapolis, IN",Salesforce Developer,"Salesforce ,Apex, VisualForce, MobileSDK, Sale..."
1,"Hatstand US, Inc.","Hatstand, a Capital Markets Consultancy, is se...","Jersey City, NJ","Jr. Support Engineer - UNIX/Linux, Shell, Sql","UNIX, Linux Bash Perl Python Shell Scripting SQL"
2,MACRO.CCS,We're conducting a search for an Applications ...,"Bellevue, WA",Applications Manager,Ability to analyze existing applications effec...


### Clean `Job Descriptions` only for rows that don't have any valid skills

In [11]:
def clean_job_description(string):
    '''
    Normalise a raw job-description string.

    Bullets, newlines, ellipses and non-breaking spaces become sentence
    breaks (". "), tabs become spaces, double spaces are collapsed once,
    and then every character that is not a letter, digit, comma, period,
    '?', '!', '+', '#' or space is removed.

    Returns np.nan when the input is not a plain `str`.
    '''
    if type(string) is not str:
        return np.nan

    substitutions = (
        ('•', '. '),
        ('\n', '. '),
        ('...', '. '),
        ('\xa0', '. '),
        ('\t', ' '),
        ('  ', ' '),
    )
    cleaned = string
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)

    return re.sub('[^a-zA-Z0-9,.?!+# ]+', '', cleaned)

In [12]:
# Only rows whose skills were blanked out (skills == '') will need NER on the
# description later, so only those descriptions are cleaned; the rest are kept as-is.
'''clean only job descriptions that don't have valid skills'''
df.description = df.apply(lambda row: clean_job_description(row.description) if row['skills']=='' else row.description, axis=1)

# Inspect the result.
df.info()
df.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15 entries, 0 to 14
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   company_name   15 non-null     object
 1   description    15 non-null     object
 2   location_name  15 non-null     object
 3   listing_title  15 non-null     object
 4   skills         15 non-null     object
dtypes: object(5)
memory usage: 720.0+ bytes


Unnamed: 0,company_name,description,location_name,listing_title,skills
0,Naztec International Group LLC,"Location: Indianapolis, INDo not send fake pro...","Indianapolis, IN",Salesforce Developer,"Salesforce ,Apex, VisualForce, MobileSDK, Sale..."
1,"Hatstand US, Inc.","Hatstand, a Capital Markets Consultancy, is se...","Jersey City, NJ","Jr. Support Engineer - UNIX/Linux, Shell, Sql","UNIX, Linux Bash Perl Python Shell Scripting SQL"
2,MACRO.CCS,We're conducting a search for an Applications ...,"Bellevue, WA",Applications Manager,Ability to analyze existing applications effec...


# Format the `skills` column how we want it

### Convert `skills` string to list

In [13]:
def skills_to_list(df: pd.DataFrame):
    '''
    Convert the comma-separated `skills` strings into Python lists.

    Rows containing a missing value in any column are dropped first.
    An empty skills string becomes an empty list (not `['']`).

    Returns a new DataFrame; the input is not modified.
    '''
    complete_rows = df.dropna()

    skill_lists = complete_rows['skills'].apply(
        lambda text: text.split(',') if text != '' else []
    )

    # assign() builds a fresh frame, avoiding item assignment on the result
    # of dropna() (the pattern behind pandas' SettingWithCopyWarning).
    return complete_rows.assign(skills=skill_lists)

In [14]:
# Drop incomplete rows and split each skills string on commas into a list.
df = skills_to_list(df)

# Inspect the result.
df.info()
df.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15 entries, 0 to 14
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   company_name   15 non-null     object
 1   description    15 non-null     object
 2   location_name  15 non-null     object
 3   listing_title  15 non-null     object
 4   skills         15 non-null     object
dtypes: object(5)
memory usage: 720.0+ bytes


Unnamed: 0,company_name,description,location_name,listing_title,skills
0,Naztec International Group LLC,"Location: Indianapolis, INDo not send fake pro...","Indianapolis, IN",Salesforce Developer,"[Salesforce , Apex, VisualForce, MobileSDK, ..."
1,"Hatstand US, Inc.","Hatstand, a Capital Markets Consultancy, is se...","Jersey City, NJ","Jr. Support Engineer - UNIX/Linux, Shell, Sql","[UNIX, Linux Bash Perl Python Shell Scripting..."
2,MACRO.CCS,We're conducting a search for an Applications ...,"Bellevue, WA",Applications Manager,[Ability to analyze existing applications effe...


### Clean `skills` as lists

Use NER to extract skills from longer sentences that were incorrectly saved within the `skills` lists

In [15]:
def clean_list(row, threshold: int=6):
    '''
    Tidy one listing's list of skill strings.

    - strips surrounding whitespace and drops empty / whitespace-only entries
    - treats entries of `threshold` or more words as sentences rather than
      skills: ORG and PRODUCT entities are extracted from them (stanza and
      spaCy), then the sentences themselves are discarded
    - removes duplicates, keeping first-seen order so output is reproducible

    Parameters
    ----------
    row : list of str
    threshold : int
        Word count at or above which an entry counts as a sentence.

    Returns
    -------
    list of str
    '''
    # Strip first, then filter: filtering before stripping lets entries
    # such as ' ' through, which then become '' and pollute the skills.
    stripped = [item.strip() for item in row if item]
    stripped = [item for item in stripped if item]

    long_entries = [item for item in stripped if len(item.split()) >= threshold]

    # Extract ORG and PRODUCT entities from the sentence-like entries.
    extracted = []
    for sentence in long_entries:
        extracted.extend(extract_entities_stanza(sentence))
    for sentence in long_entries:
        extracted.extend(extract_entities_spacy(sentence))

    # dict.fromkeys de-duplicates while preserving insertion order.
    combined = dict.fromkeys(stripped + extracted)

    # Drop the sentences (and any over-long or empty extracted entity).
    return [skill for skill in combined if skill and len(skill.split()) < threshold]

In [16]:
def post_ner_fix(row):
    '''
    Remove NER leftovers from a list of skills.

    Any entry containing "travel" or "us government" (case-insensitive) is
    dropped, e.g. "50 travel" or "US Government Secret".
    '''
    banned_fragments = ("travel", "us government")

    kept = []
    for skill in row:
        lowered = skill.lower()
        if any(fragment in lowered for fragment in banned_fragments):
            continue
        kept.append(skill)
    return kept

In [17]:
%%time
# Tidy every skills list (strip, de-duplicate, replace sentence-like entries by
# the entities extracted from them), then drop known NER false positives.
df['skills'] = df['skills'].apply(clean_list)
df['skills'] = df['skills'].apply(post_ner_fix)

# Inspect the result.
df.info()
df.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15 entries, 0 to 14
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   company_name   15 non-null     object
 1   description    15 non-null     object
 2   location_name  15 non-null     object
 3   listing_title  15 non-null     object
 4   skills         15 non-null     object
dtypes: object(5)
memory usage: 720.0+ bytes
CPU times: user 3.39 s, sys: 773 ms, total: 4.16 s
Wall time: 652 ms


Unnamed: 0,company_name,description,location_name,listing_title,skills
0,Naztec International Group LLC,"Location: Indianapolis, INDo not send fake pro...","Indianapolis, IN",Salesforce Developer,[salesforce1 platform and Lightning components...
1,"Hatstand US, Inc.","Hatstand, a Capital Markets Consultancy, is se...","Jersey City, NJ","Jr. Support Engineer - UNIX/Linux, Shell, Sql",[UNIX]
2,MACRO.CCS,We're conducting a search for an Applications ...,"Bellevue, WA",Applications Manager,"[Security, EDI, MS stack]"


### Apply NER to extract skills from Job Description column, but only if we are missing them in Skills column

In [18]:
def extend_lists(df):
    '''
    Build the final skill list for one row.

    Despite the parameter name, this is applied row-wise (axis=1): `df` is a
    single row exposing the list-valued fields 'description_stanza',
    'description_spacy' and 'skills'. The three lists are concatenated,
    lower-cased and de-duplicated. The result comes from a set, so its
    order is not guaranteed.
    '''
    merged = df['description_stanza'] + df['description_spacy'] + df['skills']

    # lowercase all skills, dropping duplicates along the way
    return list({skill.lower() for skill in merged})

In [19]:
%%time

# Run NER on the job description only for rows whose cleaned skills list is
# empty; every other row gets its own fresh empty list.

# extract with stanza
df['description_stanza'] = df.apply(lambda row: extract_entities_stanza(row.description) if row['skills']==[] else [], axis=1)

# extract with spacy
# (the fallback used to return row.description_stanza -- a copy-paste slip that
# also made both columns share the same list object)
df['description_spacy'] = df.apply(lambda row: extract_entities_spacy(row.description) if row['skills']==[] else [], axis=1)

# combine everything together and remove duplicate skills
df['listing_skill_name'] = df.apply(extend_lists, axis=1)

df

CPU times: user 1min 41s, sys: 18.2 s, total: 1min 59s
Wall time: 17.2 s


Unnamed: 0,company_name,description,location_name,listing_title,skills,description_stanza,description_spacy,listing_skill_name
0,Naztec International Group LLC,"Location: Indianapolis, INDo not send fake pro...","Indianapolis, IN",Salesforce Developer,[salesforce1 platform and Lightning components...,[],[],"[salesforce, mobilesdk, salesforce1 platform a..."
1,"Hatstand US, Inc.","Hatstand, a Capital Markets Consultancy, is se...","Jersey City, NJ","Jr. Support Engineer - UNIX/Linux, Shell, Sql",[UNIX],[],[],[unix]
2,MACRO.CCS,We're conducting a search for an Applications ...,"Bellevue, WA",Applications Manager,"[Security, EDI, MS stack]",[],[],"[ms stack, security, edi]"
3,PROTECH,OverviewWe are seeking a Senior Java Developer...,"Coconut Creek, FL",Sr. Java Developer,"[OO, SQL Server, architecture, Spring Framewor...",[],[],"[sql server, experience gathering requirements..."
4,IT People Corporation,Experience building scalable mobile applicatio...,"Hartford, CT",Cloud Developer,"[AWS, Azure]",[],[],"[azure, aws]"
5,Analytic Recruiting Inc,Description:Investment Management - Web Develo...,"New York, NY","Buy Side -Web Developer - Javascript, HTML","[python, Web development, JavaScript, HTML]",[],[],"[python, html, javascript, web development]"
6,"24 Seven, Inc.",A well-known e-commerce company is in need of ...,"Seattle, WA",SQL / MySQL Developer,"[tableau, Python, BigQuery, Java, MySQL, analy...",[],[],"[python, tableau, java, sql, mysql, analyst, b..."
7,"Lodestar Consulting, LLC",PeopleSoft Test Manager opportunity!! Start: ...,"San Francisco, CA","PeopleSoft Testing Manager//San Francisco, CA.","[PeopleSoft Test Manager, testing]",[],[],"[peoplesoft test manager, testing]"
8,Progressive Technology Solutions,Please submit resume to vishakha AT PTSOL dot...,"Woodland Hills, CA",IT System Administrator,"[dns, Administrator]",[],[],"[administrator, dns]"
9,"firstPRO, Inc.",Business AnalystThis is an experience business...,"Mount Laurel, NJ","Business Analyst, IT","[analytics, SDLC, IT business analyst, documen...",[],[],"[analytics, it ba, it business analyst, sdlc, ..."


In [20]:
############### Sergey: this removes some good stuff too, like [unix] ##################################

def remove_empty_skills(df: pd.DataFrame, min_skills: int = 2):
    '''
    Keep only rows whose `listing_skill_name` list has at least `min_skills` items.

    With the default of 2 this drops empty lists and single-item lists.
    Pass `min_skills=1` to keep single-skill listings and drop only the
    empty ones.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a list-valued 'listing_skill_name' column.
    min_skills : int
        Minimum number of skills a row needs to be kept.

    Returns
    -------
    pd.DataFrame
        A filtered copy; the input frame is not modified (no helper column
        is added to it).
    '''
    skill_counts = df['listing_skill_name'].apply(len)
    return df[skill_counts >= min_skills].copy()

In [21]:
# last bit of pruning empty skillsets (and skillsets of size 1)
df = remove_empty_skills(df)

# drop rows with empty listing_skill_name (empty list or null), then number the
# surviving rows 0..n-1 and expose that numbering as the listing id column
has_skills = df['listing_skill_name'].astype(bool)
df = (
    df[has_skills]
    .dropna(subset=['listing_skill_name'])
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index': 'listing_id:ID'})
)

df

Unnamed: 0,listing_id:ID,company_name,description,location_name,listing_title,skills,description_stanza,description_spacy,listing_skill_name
0,0,Naztec International Group LLC,"Location: Indianapolis, INDo not send fake pro...","Indianapolis, IN",Salesforce Developer,[salesforce1 platform and Lightning components...,[],[],"[salesforce, mobilesdk, salesforce1 platform a..."
1,1,MACRO.CCS,We're conducting a search for an Applications ...,"Bellevue, WA",Applications Manager,"[Security, EDI, MS stack]",[],[],"[ms stack, security, edi]"
2,2,PROTECH,OverviewWe are seeking a Senior Java Developer...,"Coconut Creek, FL",Sr. Java Developer,"[OO, SQL Server, architecture, Spring Framewor...",[],[],"[sql server, experience gathering requirements..."
3,3,IT People Corporation,Experience building scalable mobile applicatio...,"Hartford, CT",Cloud Developer,"[AWS, Azure]",[],[],"[azure, aws]"
4,4,Analytic Recruiting Inc,Description:Investment Management - Web Develo...,"New York, NY","Buy Side -Web Developer - Javascript, HTML","[python, Web development, JavaScript, HTML]",[],[],"[python, html, javascript, web development]"
5,5,"24 Seven, Inc.",A well-known e-commerce company is in need of ...,"Seattle, WA",SQL / MySQL Developer,"[tableau, Python, BigQuery, Java, MySQL, analy...",[],[],"[python, tableau, java, sql, mysql, analyst, b..."
6,6,"Lodestar Consulting, LLC",PeopleSoft Test Manager opportunity!! Start: ...,"San Francisco, CA","PeopleSoft Testing Manager//San Francisco, CA.","[PeopleSoft Test Manager, testing]",[],[],"[peoplesoft test manager, testing]"
7,7,Progressive Technology Solutions,Please submit resume to vishakha AT PTSOL dot...,"Woodland Hills, CA",IT System Administrator,"[dns, Administrator]",[],[],"[administrator, dns]"
8,8,"firstPRO, Inc.",Business AnalystThis is an experience business...,"Mount Laurel, NJ","Business Analyst, IT","[analytics, SDLC, IT business analyst, documen...",[],[],"[analytics, it ba, it business analyst, sdlc, ..."
9,9,"Digital Intelligence Systems, LLC",Looking for Selenium engineers. must have soli...,"Atlanta, GA",AUTOMATION TEST ENGINEER,[],"[Groovy, eCommerceRetail QA, Lisa, SVN, Networ...","[Groovy, SVN, Networking Voice, Websphere MQ, ...","[selenium, lan, sap, websphere application ser..."


In [22]:
# Checkpoint the processed listings (ids, text fields and final skill lists)
# into temp_dir, without the DataFrame index.
'''save a copy just in case'''
df[['listing_id:ID', 'company_name', 'description', 'location_name', 'listing_title', 'listing_skill_name']] \
  .to_csv(temp_dir+"templistingdataframe.csv", index=False)

# Saving nodes and relations as CSVs for Neo4j

### (node) LISTING

In [23]:
# LISTING node table: id, title and description, plus a constant :LABEL column.
listing_df = (
    df[['listing_id:ID', 'listing_title', 'description']]
    .assign(**{':LABEL': "LISTING"})
)
listing_df

Unnamed: 0,listing_id:ID,listing_title,description,:LABEL
0,0,Salesforce Developer,"Location: Indianapolis, INDo not send fake pro...",LISTING
1,1,Applications Manager,We're conducting a search for an Applications ...,LISTING
2,2,Sr. Java Developer,OverviewWe are seeking a Senior Java Developer...,LISTING
3,3,Cloud Developer,Experience building scalable mobile applicatio...,LISTING
4,4,"Buy Side -Web Developer - Javascript, HTML",Description:Investment Management - Web Develo...,LISTING
5,5,SQL / MySQL Developer,A well-known e-commerce company is in need of ...,LISTING
6,6,"PeopleSoft Testing Manager//San Francisco, CA.",PeopleSoft Test Manager opportunity!! Start: ...,LISTING
7,7,IT System Administrator,Please submit resume to vishakha AT PTSOL dot...,LISTING
8,8,"Business Analyst, IT",Business AnalystThis is an experience business...,LISTING
9,9,AUTOMATION TEST ENGINEER,Looking for Selenium engineers. must have soli...,LISTING


In [24]:
# Write the LISTING node table to output_dir (index column omitted).
listing_df.to_csv(output_dir+"listing__node.csv", index=False)

### (temporary node) LISTING_SKILL

In [25]:
# Temporary LISTING_SKILL table: one row per distinct skill string across all
# listings, numbered 0..n-1 in `listing_skill_id`.
listing_skill_df = (
    df[['listing_skill_name']]
    .explode('listing_skill_name')
    .drop_duplicates()
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index': 'listing_skill_id'})
)

listing_skill_df

Unnamed: 0,listing_skill_id,listing_skill_name
0,0,salesforce
1,1,mobilesdk
2,2,salesforce1 platform and lightning components
3,3,lightning connect
4,4,apex
...,...,...
164,164,reactjs
165,165,tdd testdriven
166,166,mvcdesign
167,167,pdf resume pdfwe


In [26]:
# Write the temporary LISTING_SKILL table to temp_dir (index column omitted).
listing_skill_df.to_csv(temp_dir+"listing_skills_TEMP.csv", index=False)

### (node) LOCATION

In [27]:
# LOCATION node table: one row per distinct location name, numbered 0..n-1,
# plus a constant :LABEL column.
location_df = (
    df[['location_name']]
    .drop_duplicates()
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index': 'location_id:ID'})
    .assign(**{':LABEL': "LOCATION"})
)
location_df

Unnamed: 0,location_id:ID,location_name,:LABEL
0,0,"Indianapolis, IN",LOCATION
1,1,"Bellevue, WA",LOCATION
2,2,"Coconut Creek, FL",LOCATION
3,3,"Hartford, CT",LOCATION
4,4,"New York, NY",LOCATION
5,5,"Seattle, WA",LOCATION
6,6,"San Francisco, CA",LOCATION
7,7,"Woodland Hills, CA",LOCATION
8,8,"Mount Laurel, NJ",LOCATION
9,9,"Atlanta, GA",LOCATION


In [28]:
# Write the LOCATION node table to output_dir (index column omitted).
location_df.to_csv(output_dir+"location__node.csv", index=False)

### (node) COMPANY

In [29]:
# COMPANY node table: one row per distinct company name, numbered 0..n-1,
# plus a constant :LABEL column.
company_df = (
    df[['company_name']]
    .drop_duplicates()
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index': 'company_id:ID'})
    .assign(**{':LABEL': "COMPANY"})
)
company_df

Unnamed: 0,company_id:ID,company_name,:LABEL
0,0,Naztec International Group LLC,COMPANY
1,1,MACRO.CCS,COMPANY
2,2,PROTECH,COMPANY
3,3,IT People Corporation,COMPANY
4,4,Analytic Recruiting Inc,COMPANY
5,5,"24 Seven, Inc.",COMPANY
6,6,"Lodestar Consulting, LLC",COMPANY
7,7,Progressive Technology Solutions,COMPANY
8,8,"firstPRO, Inc.",COMPANY
9,9,"Digital Intelligence Systems, LLC",COMPANY


In [30]:
# Write the COMPANY node table to output_dir (index column omitted).
company_df.to_csv(output_dir+"company__node.csv", index=False)

### [relation] NEEDS

In [31]:
# NEEDS relation: one edge per (listing, skill) pair. Each listing's skill list
# is exploded to one row per skill, the skill name is swapped for its id from
# listing_skill_df, and the columns are renamed to the :START_ID / :END_ID headers.
needs_df = (
    df[['listing_id:ID', 'listing_skill_name']]
    .explode('listing_skill_name')
    .dropna()
    .merge(listing_skill_df, on='listing_skill_name')
    .drop(columns=['listing_skill_name'])
    .rename(columns={'listing_id:ID': ':START_ID', 'listing_skill_id': ':END_ID'})
    .assign(**{':TYPE': "NEEDS"})
)
needs_df

Unnamed: 0,:START_ID,:END_ID,:TYPE
0,0,0,NEEDS
1,0,1,NEEDS
2,0,2,NEEDS
3,0,3,NEEDS
4,0,4,NEEDS
...,...,...,...
177,13,164,NEEDS
178,13,165,NEEDS
179,13,166,NEEDS
180,13,167,NEEDS


In [32]:
# Write the NEEDS relation table to output_dir (index column omitted).
needs_df.to_csv(output_dir+"needs__relation.csv", index=False)

### [relation] LOCATED_IN

In [33]:
# LOCATED_IN relation: listing -> location. The location name is swapped for
# its id from location_df, then the name and label columns are dropped.
located_in_df = (
    df[['listing_id:ID', 'location_name']]
    .dropna()
    .merge(location_df, on='location_name')
    .drop(columns=['location_name', ':LABEL'])
    .rename(columns={'listing_id:ID': ':START_ID', 'location_id:ID': ':END_ID'})
    .assign(**{':TYPE': "LOCATED_IN"})
)
located_in_df

Unnamed: 0,:START_ID,:END_ID,:TYPE
0,0,0,LOCATED_IN
1,1,1,LOCATED_IN
2,2,2,LOCATED_IN
3,3,3,LOCATED_IN
4,4,4,LOCATED_IN
5,13,4,LOCATED_IN
6,5,5,LOCATED_IN
7,6,6,LOCATED_IN
8,7,7,LOCATED_IN
9,8,8,LOCATED_IN


In [34]:
# Write the LOCATED_IN relation table to output_dir (index column omitted).
located_in_df.to_csv(output_dir+"located_in__relation.csv", index=False)

### [relation] POSTED

In [35]:
# POSTED relation: company -> listing. The company name is swapped for its id
# from company_df; the company id becomes :START_ID and the listing id :END_ID.
posted_df = (
    df[['listing_id:ID', 'company_name']]
    .dropna()
    .merge(company_df, on='company_name')
    .drop(columns=['company_name', ':LABEL'])
    .rename(columns={'company_id:ID': ':START_ID', 'listing_id:ID': ':END_ID'})
    .assign(**{':TYPE': "POSTED"})
)
posted_df

Unnamed: 0,:END_ID,:START_ID,:TYPE
0,0,0,POSTED
1,1,1,POSTED
2,2,2,POSTED
3,3,3,POSTED
4,4,4,POSTED
5,5,5,POSTED
6,6,6,POSTED
7,7,7,POSTED
8,8,8,POSTED
9,9,9,POSTED


In [36]:
# Write the POSTED relation table to output_dir (index column omitted).
posted_df.to_csv(output_dir+"posted__relation.csv", index=False)