# Notebook 1: Process Listings Dataset 
dice_com-job_us_sample.csv  
https://www.kaggle.com/datasets/PromptCloudHQ/us-technology-jobs-on-dicecom
#### This notebook writes the following files to the _output_datasets_ folder:
```
(LISTING) NODE						listing__node.csv
listing_id:ID
listing_title
description
:LABEL = "LISTING"

[NEEDS] RELATION					needs__relation.csv
:START_ID = listing_id
:END_ID = skill_id
:TYPE = "NEEDS"

[LOCATED_IN] RELATION					located_in__relation.csv
:START_ID = listing_id
:END_ID = location_id
:TYPE = "LOCATED_IN"

(LOCATION) NODE						location__node.csv
location_id:ID
location_name
:LABEL = "LOCATION"

[POSTED] RELATION					posted__relation.csv
:START_ID = company_id
:END_ID = listing_id
:TYPE = "POSTED"

(COMPANY) NODE						company__node.csv
company_id:ID
company_name
:LABEL = "COMPANY"
```
#### It also writes intermediate datasets, used by the later Skill Matching steps, to the _temp_datasets_ folder:
```
(LISTING_SKILL) NODE					listing_skills_TEMP.csv
listing_skill_id
listing_skill_name
```



# Setup

In [None]:
%pip install stanza
%pip install spacy
%pip install nltk
!python -m spacy download en_core_web_sm

import pandas as pd
import numpy as np
import stanza
import spacy
import re

# Fetch the default English stanza models (the log below shows the archive is reused when it already exists).
stanza.download('en') 
# Small English spaCy model installed by the `spacy download` line at the top of this cell.
nlp_spacy = spacy.load("en_core_web_sm")
# Only the tokenizer and NER processors are loaded, and they run on CPU.
# download_method=None presumably makes the pipeline reuse the models fetched above instead of re-downloading.
# NOTE(review): pos_batch_size is passed although no POS processor is requested — presumably it has no effect; confirm.
nlp_stanza = stanza.Pipeline('en', processors='tokenize, ner', use_gpu=False, pos_batch_size=3000, download_method=None)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
2022-12-06 19:06:20.815038: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 5.2 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

INFO:stanza:Downloading default packages for language: en (English) ...
INFO:stanza:File exists: /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources.
INFO:stanza:Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

INFO:stanza:Use device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


In [None]:
# This cell resolves the project folders both for local runs and for Google Colab.
#
# On Colab the project lives on Google Drive, which has to be mounted first.
# Anywhere else `google.colab` cannot be imported, so we fall back to paths
# relative to the current working directory instead of crashing on the import.

try:
    from google.colab import drive
except ImportError:
    mydrive = ""  # this is when we run locally
else:
    drive.mount('/content/drive')
    # Alternative project location (COLAB Leslie):
    #   "/content/drive/MyDrive/DSE 203 — etl/DSE203_Project/"
    mydrive = "/content/drive/MyDrive/DSE203_Project/"  # this is when we run on COLAB Sergey

# All three folders hang off the same project root.
input_dir = mydrive+"input_datasets/"
output_dir = mydrive+"output_datasets/"
temp_dir = mydrive+"temp_datasets/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Import data

In [None]:
'''import dataset from Google Drive'''

# Load the raw Dice.com listings export from the input folder set up in the path cell.
df = pd.read_csv(input_dir+"dice_com-job_us_sample.csv")
# Alternative input, kept for reference (presumably a reduced sample for quick runs):
# df = pd.read_csv(input_dir+"dice_small.csv")

# Sanity check: column completeness plus a peek at the first rows.
df.info()
df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22000 entries, 0 to 21999
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   advertiserurl             22000 non-null  object
 1   company                   21950 non-null  object
 2   employmenttype_jobstatus  21770 non-null  object
 3   jobdescription            22000 non-null  object
 4   jobid                     22000 non-null  object
 5   joblocation_address       21997 non-null  object
 6   jobtitle                  22000 non-null  object
 7   postdate                  22000 non-null  object
 8   shift                     21643 non-null  object
 9   site_name                 3490 non-null   object
 10  skills                    21957 non-null  object
 11  uniq_id                   22000 non-null  object
dtypes: object(12)
memory usage: 2.0+ MB


Unnamed: 0,advertiserurl,company,employmenttype_jobstatus,jobdescription,jobid,joblocation_address,jobtitle,postdate,shift,site_name,skills,uniq_id
0,https://www.dice.com/jobs/detail/AUTOMATION-TE...,"Digital Intelligence Systems, LLC","C2H Corp-To-Corp, C2H Independent, C2H W2, 3 M...",Looking for Selenium engineers...must have sol...,Dice Id : 10110693,"Atlanta, GA",AUTOMATION TEST ENGINEER,1 hour ago,Telecommuting not available|Travel not required,,SEE BELOW,418ff92580b270ef4e7c14f0ddfc36b4
1,https://www.dice.com/jobs/detail/Information-S...,University of Chicago/IT Services,Full Time,The University of Chicago has a rapidly growin...,Dice Id : 10114469,"Chicago, IL",Information Security Engineer,1 week ago,Telecommuting not available|Travel not required,,"linux/unix, network monitoring, incident respo...",8aec88cba08d53da65ab99cf20f6f9d9
2,https://www.dice.com/jobs/detail/Business-Solu...,"Galaxy Systems, Inc.",Full Time,"GalaxE.SolutionsEvery day, our solutions affec...",Dice Id : CXGALXYS,"Schaumburg, IL",Business Solutions Architect,2 weeks ago,Telecommuting not available|Travel not required,,"Enterprise Solutions Architecture, business in...",46baa1f69ac07779274bcd90b85d9a72


### Set up NER to extract skills

In [None]:
def extract_entities_stanza(series):
    '''
    Run the stanza pipeline over one text and collect its ORG / PRODUCT entities.

    series: the text handed to `nlp_stanza` (callers in this notebook pass one string per call)
    returns: list of the distinct entity texts, in no particular order
    '''

    wanted_types = ('ORG', 'PRODUCT')

    # A set removes entities that stanza reports more than once in the same text.
    unique_texts = set()
    for entity in nlp_stanza(series).entities:
        if entity.type in wanted_types:
            unique_texts.add(entity.text)

    return list(unique_texts)

In [None]:
def extract_entities_spacy(series):
    '''
    Run the spaCy pipeline over one text and collect its ORG / PRODUCT entities.

    series: the text handed to `nlp_spacy` (callers in this notebook pass one string per call)
    returns: list of the distinct entity texts, in no particular order
    '''

    wanted_labels = ('ORG', 'PRODUCT')

    # The tok2vec and parser components are switched off for this call.
    parsed = nlp_spacy(series, disable=["tok2vec", "parser"])

    # A set removes entities that spaCy reports more than once in the same text.
    unique_texts = set()
    for entity in parsed.ents:
        if entity.label_ in wanted_labels:
            unique_texts.add(entity.text)

    return list(unique_texts)

# Prep

### Only save columns we want

In [None]:
def basic_cleanup(df: pd.DataFrame):
    '''
    Reduce the raw listings frame to the columns this notebook works with.

    Steps, in order:
      1. drop the columns that are not used downstream
      2. keep one row per (company, location, job title) combination
      3. rename the remaining columns to the names used by the graph model

    returns: a new DataFrame; the input frame is left untouched
    '''
    unused_columns = [
        'advertiserurl', 'employmenttype_jobstatus', 'jobid',
        'uniq_id', 'postdate', 'shift', 'site_name',
    ]
    dedupe_key = ['company', 'joblocation_address', 'jobtitle']
    new_names = {
        'company': 'company_name',
        'jobdescription': 'description',
        'joblocation_address': 'location_name',
        'jobtitle': 'listing_title',
    }

    trimmed = df.drop(columns=unused_columns)
    # De-duplication happens before the rename, hence the raw column names in the key.
    deduped = trimmed.drop_duplicates(subset=dedupe_key)
    return deduped.rename(columns=new_names)

In [None]:
# Replace the raw frame with its trimmed, de-duplicated and renamed version.
# NOTE: not re-runnable on its own — once the raw columns are gone, basic_cleanup
# can no longer find the columns it drops; reload the data first.
df = basic_cleanup(df)

df.info()
df.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20372 entries, 0 to 21999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   company_name   20323 non-null  object
 1   description    20372 non-null  object
 2   location_name  20369 non-null  object
 3   listing_title  20372 non-null  object
 4   skills         20338 non-null  object
dtypes: object(5)
memory usage: 954.9+ KB


Unnamed: 0,company_name,description,location_name,listing_title,skills
0,"Digital Intelligence Systems, LLC",Looking for Selenium engineers...must have sol...,"Atlanta, GA",AUTOMATION TEST ENGINEER,SEE BELOW
1,University of Chicago/IT Services,The University of Chicago has a rapidly growin...,"Chicago, IL",Information Security Engineer,"linux/unix, network monitoring, incident respo..."
2,"Galaxy Systems, Inc.","GalaxE.SolutionsEvery day, our solutions affec...","Schaumburg, IL",Business Solutions Architect,"Enterprise Solutions Architecture, business in..."


### Clean `skills` as strings

In [None]:
def clean_skills(string):
    '''
    Normalise one raw `skills` cell into a comma-separated string.

    Slashes become spaces, ellipses and double spaces become ", ", and afterwards
    every character other than letters, digits, comma, ?, !, +, # and space is removed
    (note that periods are removed as well).

    returns: the cleaned string, or NaN when the input is not a plain str
    '''

    if type(string) is not str:
        return np.nan

    # Order matters: the longer patterns must be handled before their shorter prefixes.
    substitutions = (
        (' / ', ' '),
        ('/', ' '),
        ('... ', ', '),
        ('...', ', '),
        ('  ', ', '),
    )
    for old, new in substitutions:
        string = string.replace(old, new)

    return re.sub('[^a-zA-Z0-9,?!+# ]+', '', string)

In [None]:
def process_non_skills(skillcol):
    '''
    Blank out `skills` cells that are placeholders rather than real skills.

    A cell is replaced with the empty string (to be filled later with skills
    extracted from the job description) when, ignoring case, it
      - equals one of the known placeholder phrases, or
      - contains "see below",
    or when it starts with the "TAD PGS, INC specializes in" boilerplate.

    skillcol: Series of cleaned skill strings; missing values are allowed
    returns:  a new Series with the same index; missing values stay missing and
              the Series passed in is not modified
    '''

    placeholder_phrases = [
        "null",
        "please see job description",
        "see job description",
        "see job overview",
        "full time",
        "please refer to job description",
    ]

    lowered = skillcol.str.lower()

    # na=False keeps missing cells out of the mask instead of putting NaN into a
    # boolean expression; isin() already treats missing cells as "no match".
    mask = (
        lowered.isin(placeholder_phrases)
        | lowered.str.contains("see below", regex=False, na=False)
        | skillcol.str.startswith("TAD PGS, INC specializes in", na=False)  # known offender of 60+ rows
    )

    # Series.mask builds a new Series, so the caller's column is never written through.
    return skillcol.mask(mask, '')

In [None]:
# Clean every skills string first, then blank out the placeholder entries.
df['skills'] = process_non_skills(df['skills'].apply(clean_skills))

df.info()
df.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20372 entries, 0 to 21999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   company_name   20323 non-null  object
 1   description    20372 non-null  object
 2   location_name  20369 non-null  object
 3   listing_title  20372 non-null  object
 4   skills         20338 non-null  object
dtypes: object(5)
memory usage: 954.9+ KB


Unnamed: 0,company_name,description,location_name,listing_title,skills
0,"Digital Intelligence Systems, LLC",Looking for Selenium engineers...must have sol...,"Atlanta, GA",AUTOMATION TEST ENGINEER,
1,University of Chicago/IT Services,The University of Chicago has a rapidly growin...,"Chicago, IL",Information Security Engineer,"linux unix, network monitoring, incident respo..."
2,"Galaxy Systems, Inc.","GalaxE.SolutionsEvery day, our solutions affec...","Schaumburg, IL",Business Solutions Architect,"Enterprise Solutions Architecture, business in..."


### Clean `Job Descriptions` only for rows that don't have any valid skills

In [None]:
def clean_job_description(string):
    '''
    Normalise one job description into plain sentences.

    Bullets, newlines, ellipses and non-breaking spaces become ". ", tabs and
    double spaces become a single space, and afterwards every character other than
    letters, digits, comma, period, ?, !, +, # and space is removed.

    returns: the cleaned string, or NaN when the input is not a plain str
    '''

    if type(string) is not str:
        return np.nan

    # Applied strictly in this order; the double-space collapse runs last.
    substitutions = (
        ('•', '. '),
        ('\n', '. '),
        ('...', '. '),
        ('\xa0', '. '),
        ('\t', ' '),
        ('  ', ' '),
    )
    for old, new in substitutions:
        string = string.replace(old, new)

    return re.sub('[^a-zA-Z0-9,.?!+# ]+', '', string)

In [None]:
# clean only job descriptions that don't have valid skills
# (rows that already have skills keep their description exactly as it is)
df['description'] = [
    clean_job_description(text) if skills == '' else text
    for text, skills in zip(df['description'], df['skills'])
]

df.info()
df.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20372 entries, 0 to 21999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   company_name   20323 non-null  object
 1   description    20372 non-null  object
 2   location_name  20369 non-null  object
 3   listing_title  20372 non-null  object
 4   skills         20338 non-null  object
dtypes: object(5)
memory usage: 954.9+ KB


Unnamed: 0,company_name,description,location_name,listing_title,skills
0,"Digital Intelligence Systems, LLC",Looking for Selenium engineers. must have soli...,"Atlanta, GA",AUTOMATION TEST ENGINEER,
1,University of Chicago/IT Services,The University of Chicago has a rapidly growin...,"Chicago, IL",Information Security Engineer,"linux unix, network monitoring, incident respo..."
2,"Galaxy Systems, Inc.","GalaxE.SolutionsEvery day, our solutions affec...","Schaumburg, IL",Business Solutions Architect,"Enterprise Solutions Architecture, business in..."


# Format the `skills` column how we want it

### Convert `skills` string to list

In [None]:
def skills_to_list(df: pd.DataFrame):
    '''
    Turn the comma-separated `skills` strings into lists.

    Rows with a missing value in ANY column are dropped first. For the remaining
    rows a non-empty string is split on commas (the pieces are not stripped here)
    and an empty string becomes an empty list.

    returns: a new DataFrame; the input frame is left untouched
    '''

    complete_rows = df.dropna()
    split_skills = complete_rows['skills'].apply(
        lambda text: text.split(',') if text != '' else []
    )
    # assign() builds a fresh frame, so nothing is written into the dropna() result
    # and pandas has no reason to raise SettingWithCopyWarning.
    return complete_rows.assign(skills=split_skills)

In [None]:
# Convert the skills strings to lists. skills_to_list also drops every row with a
# missing value in any column, which is why the row count shrinks in the output below.
df = skills_to_list(df)

df.info()
df.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20286 entries, 0 to 21999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   company_name   20286 non-null  object
 1   description    20286 non-null  object
 2   location_name  20286 non-null  object
 3   listing_title  20286 non-null  object
 4   skills         20286 non-null  object
dtypes: object(5)
memory usage: 950.9+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['skills'] = df['skills'].apply(lambda x: x.split(',') if x!='' else [])


Unnamed: 0,company_name,description,location_name,listing_title,skills
0,"Digital Intelligence Systems, LLC",Looking for Selenium engineers. must have soli...,"Atlanta, GA",AUTOMATION TEST ENGINEER,[]
1,University of Chicago/IT Services,The University of Chicago has a rapidly growin...,"Chicago, IL",Information Security Engineer,"[linux unix, network monitoring, incident re..."
2,"Galaxy Systems, Inc.","GalaxE.SolutionsEvery day, our solutions affec...","Schaumburg, IL",Business Solutions Architect,"[Enterprise Solutions Architecture, business ..."


### Clean `skills` as lists

Use NER to extract skills from longer sentences that were incorrectly saved within the `skills` lists

In [None]:
def clean_list(row, threshold: int=6):
    '''
    Tidy one listing's skill list.

    - strips surrounding whitespace and removes empty entries, including entries
      that consist only of whitespace
    - entries with `threshold` or more words are treated as sentences: ORG and
      PRODUCT entities are extracted from them (stanza and spaCy) and added as skills
    - the sentences themselves, and any extracted entity that is itself
      `threshold` or more words long, are removed
    - duplicates are removed, keeping the first occurrence

    row:       list of raw skill strings
    threshold: minimum number of words for an entry to count as a sentence
    returns:   new list of skills; the input list is not modified
    '''

    # Strip BEFORE testing for emptiness: filtering first would let " " through,
    # which then turns into an empty-string "skill".
    stripped = [entry.strip() for entry in row if entry]
    skills = [entry for entry in stripped if entry]

    # extract ORG and PRODUCT from sentences
    sentences = [entry for entry in skills if len(entry.split()) >= threshold]
    for sentence in sentences:
        skills.extend(extract_entities_stanza(sentence)) #using stanza
    for sentence in sentences:
        skills.extend(extract_entities_spacy(sentence)) #using spacy

    # combine with existing list of skills (without duplicates);
    # dict.fromkeys keeps first-seen order, so repeated runs give the same order
    unique_skills = dict.fromkeys(skills)

    # get rid of sentences in the list
    return [skill for skill in unique_skills if skill and len(skill.split()) < threshold]

In [None]:
def post_ner_fix(row):
    '''
    Drop the "travel" and "us government" pseudo-skills that NER tends to pick up
    (things like "50 travel" and "US Government Secret").

    The match is a case-insensitive substring test on every entry.

    row:     list of skill strings
    returns: new list without the matching entries, original order preserved
    '''
    banned_fragments = ("travel", "us government")

    kept = []
    for skill in row:
        lowered = skill.lower()
        if any(fragment in lowered for fragment in banned_fragments):
            continue
        kept.append(skill)
    return kept

In [None]:
%%time
# clean_list tidies each list and mines sentence-like entries for entities;
# post_ner_fix then removes the known NER leftovers.
df['skills'] = df['skills'].apply(clean_list).apply(post_ner_fix)

df.info()
df.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20286 entries, 0 to 21999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   company_name   20286 non-null  object
 1   description    20286 non-null  object
 2   location_name  20286 non-null  object
 3   listing_title  20286 non-null  object
 4   skills         20286 non-null  object
dtypes: object(5)
memory usage: 950.9+ KB
CPU times: user 16min 41s, sys: 2.49 s, total: 16min 43s
Wall time: 16min 51s


Unnamed: 0,company_name,description,location_name,listing_title,skills
0,"Digital Intelligence Systems, LLC",Looking for Selenium engineers. must have soli...,"Atlanta, GA",AUTOMATION TEST ENGINEER,[]
1,University of Chicago/IT Services,The University of Chicago has a rapidly growin...,"Chicago, IL",Information Security Engineer,"[systems administration, network monitoring, i..."
2,"Galaxy Systems, Inc.","GalaxE.SolutionsEvery day, our solutions affec...","Schaumburg, IL",Business Solutions Architect,"[business inteligence, reporting, Enterprise S..."


### Apply NER to extract skills from Job Description column, but only if we are missing them in Skills column

In [None]:
def extend_lists(df):
    '''
    Build the final skill list for one listing.

    Concatenates the `description_stanza`, `description_spacy` and `skills` lists,
    lowercases every entry and removes duplicates.

    df: despite the name, a single row (applied with axis=1) — anything that can be
        indexed by those three keys
    returns: list of distinct lowercase skills, in no particular order
    '''

    merged = df['description_stanza'] + df['description_spacy'] + df['skills']

    # lowercase all skills; the set drops entries that only differed by case
    return list({skill.lower() for skill in merged})

In [None]:
%%time

# NER runs on the description only for listings whose `skills` list is empty.
# Every other listing gets a fresh empty list in both columns, so extend_lists
# can always concatenate three lists.

# extract with stanza
df['description_stanza'] = [
    extract_entities_stanza(text) if skills == [] else []
    for text, skills in zip(df['description'], df['skills'])
]

# extract with spacy
# (the fallback used to read `row.description_stanza`, a copy-paste slip that only
#  worked because that value is always an empty list for these rows)
df['description_spacy'] = [
    extract_entities_spacy(text) if skills == [] else []
    for text, skills in zip(df['description'], df['skills'])
]

# combine everything together and remove duplicate skills
df['listing_skill_name'] = df.apply(extend_lists, axis=1)

df

CPU times: user 1h 16min 50s, sys: 5min 46s, total: 1h 22min 37s
Wall time: 1h 23min 1s


Unnamed: 0,company_name,description,location_name,listing_title,skills,description_stanza,description_spacy,listing_skill_name
0,"Digital Intelligence Systems, LLC",Looking for Selenium engineers. must have soli...,"Atlanta, GA",AUTOMATION TEST ENGINEER,[],"[Siebel, Websphere, Java Development, Unix, KS...","[SOAP, a Software Engineer in Test, KSH, Websp...","[ecommerceretail qa, lan, peoplesoft, bourne s..."
1,University of Chicago/IT Services,The University of Chicago has a rapidly growin...,"Chicago, IL",Information Security Engineer,"[systems administration, network monitoring, i...",[],[],"[systems administration, network monitoring, i..."
2,"Galaxy Systems, Inc.","GalaxE.SolutionsEvery day, our solutions affec...","Schaumburg, IL",Business Solutions Architect,"[business inteligence, reporting, Enterprise S...",[],[],"[business inteligence, enterprise solutions ar..."
3,TransTech LLC,"Java DeveloperFulltimedirecthireBolingbrook, I...","Bolingbrook, IL","Java Developer (mid level)- FT- GREAT culture,...",[],"[Unix, Java DeveloperFulltimedirecthireBolingb...","[OverviewThe, NonTechnical Requirements, SQL S...","[selfdirected, oracle, skills c, mysql, unix, ..."
4,Matrix Resources,Midtown based high tech firm has an immediate ...,"Atlanta, GA",DevOps Engineer,"[Developer, VMware, Linux, Management, Process...",[],[],"[developer, process engineering, linux, config..."
...,...,...,...,...,...,...,...,...
21995,IAC Publishing,Company Description We are searching for a ta...,"Oakland, CA",Web Designer,"[interaction design, fine arts, Sketch, digita...",[],[],"[ui design, interaction design, fine arts, dig..."
21996,Omega Solutions Inc,CONTACT - priya@omegasolutioninc.com / 408-45...,"San Francisco, CA",Senior Front End Web Developer - Full Time at ...,"[Backbone, HTML5, JavaScript, CSS3, Bootstrap,...",[],[],"[angular, javascript, ajax, bootstrap, css3, r..."
21997,San Francisco Health Plan,Do you take pride in your work knowing that th...,"San Francisco, CA",QA Analyst,"[ALM, ASP, Visual Studio, Team Foundation Serv...",[],[],"[team foundation server, uat, visual studio, s..."
21998,IAC Publishing,Company Description What We Can Offer YouAs th...,"Oakland, CA",Tech Lead-Full Stack,"[Python, NoSQLDatabase, JaveScript, Node, Cloj...",[],[],"[java, ruby, go, sql, python, nosqldatabase, c..."


In [None]:
############### Sergey: this removes some good stuff too, like [unix] ##################################

def remove_empty_skills(df: pd.DataFrame, min_skills: int = 2):
    '''
    Keep only the listings whose `listing_skill_name` list has at least `min_skills` items.

    With the default of 2 this removes empty lists as well as lists that are
    1 item long. Pass min_skills=1 to keep single-skill listings and drop only
    the empty ones.

    df:         frame with a `listing_skill_name` column holding lists
    min_skills: smallest list length that is kept
    returns:    a new DataFrame; the input frame is not modified (no helper
                column is added to it)
    '''

    skill_counts = df['listing_skill_name'].apply(len)
    return df[skill_counts >= min_skills].copy()

In [None]:
# last bit of pruning empty skillsets (and skillsets of size 1)
df = remove_empty_skills(df)

# drop rows with empty listing_skill_name (empty list or null), then number the
# surviving listings 0..n-1 and expose that number as the `listing_id:ID` column
df = (
    df.loc[df['listing_skill_name'].astype(bool)]
      .dropna(subset=['listing_skill_name'])
      .reset_index(drop=True)
      .rename_axis('listing_id:ID')
      .reset_index()
)

df

Unnamed: 0,listing_id:ID,company_name,description,location_name,listing_title,skills,description_stanza,description_spacy,listing_skill_name
0,0,"Digital Intelligence Systems, LLC",Looking for Selenium engineers. must have soli...,"Atlanta, GA",AUTOMATION TEST ENGINEER,[],"[Siebel, Websphere, Java Development, Unix, KS...","[SOAP, a Software Engineer in Test, KSH, Websp...","[ecommerceretail qa, lan, peoplesoft, bourne s..."
1,1,University of Chicago/IT Services,The University of Chicago has a rapidly growin...,"Chicago, IL",Information Security Engineer,"[systems administration, network monitoring, i...",[],[],"[systems administration, network monitoring, i..."
2,2,"Galaxy Systems, Inc.","GalaxE.SolutionsEvery day, our solutions affec...","Schaumburg, IL",Business Solutions Architect,"[business inteligence, reporting, Enterprise S...",[],[],"[business inteligence, enterprise solutions ar..."
3,3,TransTech LLC,"Java DeveloperFulltimedirecthireBolingbrook, I...","Bolingbrook, IL","Java Developer (mid level)- FT- GREAT culture,...",[],"[Unix, Java DeveloperFulltimedirecthireBolingb...","[OverviewThe, NonTechnical Requirements, SQL S...","[selfdirected, oracle, skills c, mysql, unix, ..."
4,4,Matrix Resources,Midtown based high tech firm has an immediate ...,"Atlanta, GA",DevOps Engineer,"[Developer, VMware, Linux, Management, Process...",[],[],"[developer, process engineering, linux, config..."
...,...,...,...,...,...,...,...,...,...
16263,16263,IAC Publishing,Company Description We are searching for a ta...,"Oakland, CA",Web Designer,"[interaction design, fine arts, Sketch, digita...",[],[],"[ui design, interaction design, fine arts, dig..."
16264,16264,Omega Solutions Inc,CONTACT - priya@omegasolutioninc.com / 408-45...,"San Francisco, CA",Senior Front End Web Developer - Full Time at ...,"[Backbone, HTML5, JavaScript, CSS3, Bootstrap,...",[],[],"[angular, javascript, ajax, bootstrap, css3, r..."
16265,16265,San Francisco Health Plan,Do you take pride in your work knowing that th...,"San Francisco, CA",QA Analyst,"[ALM, ASP, Visual Studio, Team Foundation Serv...",[],[],"[team foundation server, uat, visual studio, s..."
16266,16266,IAC Publishing,Company Description What We Can Offer YouAs th...,"Oakland, CA",Tech Lead-Full Stack,"[Python, NoSQLDatabase, JaveScript, Node, Cloj...",[],[],"[java, ruby, go, sql, python, nosqldatabase, c..."


In [None]:
'''save a copy just in case'''
# Checkpoint of the processed listings, written to the temp folder without the
# intermediate `skills` / `description_*` columns.
# NOTE(review): `listing_skill_name` holds lists; presumably they are written as their
# text representation, so a reader of this CSV has to parse them back — confirm.
df[['listing_id:ID', 'company_name', 'description', 'location_name', 'listing_title', 'listing_skill_name']] \
  .to_csv(temp_dir+"templistingdataframe.csv", index=False)

# Saving nodes and relations as CSVs for Neo4j

In [None]:
# user only has to run this block if they don't want to run the above cells but need to modify the dataframes below
#
# When `df` already exists this cell does nothing. Otherwise the checkpoint written
# by the "save a copy" cell is loaded from Google Drive.
try:
    len(df)
except NameError:
    # Only a missing `df` triggers the reload; any other error is a real problem
    # and is no longer swallowed.
    import ast
    import pandas as pd
    from google.colab import drive
    drive.mount('/content/drive')
    # NOTE(review): this project root differs from the one the path cell ends up
    # using ("/content/drive/MyDrive/DSE203_Project/") — confirm which one applies.
    mydrive = "/content/drive/MyDrive/DSE 203 — etl/DSE203_Project/"  # this is when we run on COLAB
    temp_dir = mydrive+"temp_datasets/"
    output_dir = mydrive+"output_datasets/"
    # The skill lists come back from CSV as text such as "['java', 'sql']";
    # literal_eval turns them into real lists again so the explode() calls in the
    # cells below keep working.
    df = pd.read_csv(temp_dir+"templistingdataframe.csv", index_col=False,
                     converters={'listing_skill_name': ast.literal_eval})

### (node) LISTING

In [None]:
# LISTING node table: id, title and description plus the constant node label.
# NOTE(review): listing, location and company ids all start at 0; if these files are
# bulk-imported into a single id space the ids would clash — confirm how they are loaded.
listing_df = df[['listing_id:ID','listing_title','description']].assign(**{':LABEL': "LISTING"})
listing_df

Unnamed: 0,listing_id:ID,listing_title,description,:LABEL
0,0,AUTOMATION TEST ENGINEER,Looking for Selenium engineers. must have soli...,LISTING
1,1,Information Security Engineer,The University of Chicago has a rapidly growin...,LISTING
2,2,Business Solutions Architect,"GalaxE.SolutionsEvery day, our solutions affec...",LISTING
3,3,"Java Developer (mid level)- FT- GREAT culture,...","Java DeveloperFulltimedirecthireBolingbrook, I...",LISTING
4,4,DevOps Engineer,Midtown based high tech firm has an immediate ...,LISTING
...,...,...,...,...
16263,16263,Web Designer,Company Description We are searching for a ta...,LISTING
16264,16264,Senior Front End Web Developer - Full Time at ...,CONTACT - priya@omegasolutioninc.com / 408-45...,LISTING
16265,16265,QA Analyst,Do you take pride in your work knowing that th...,LISTING
16266,16266,Tech Lead-Full Stack,Company Description What We Can Offer YouAs th...,LISTING


In [None]:
# Write the LISTING node table to the output folder (row index omitted).
listing_df.to_csv(output_dir+"listing__node.csv", index=False)

### (temporary node) LISTING_SKILL

In [None]:
# One row per distinct skill across all listings, numbered 0..n-1 in order of first appearance.
listing_skill_df = df[['listing_skill_name']].explode('listing_skill_name').drop_duplicates()
listing_skill_df = (
    listing_skill_df.reset_index(drop=True)
                    .rename_axis('listing_skill_id')
                    .reset_index()
)

listing_skill_df

Unnamed: 0,listing_skill_id,listing_skill_name
0,0,ecommerceretail qa
1,1,lan
2,2,peoplesoft
3,3,bourne shell scripting
4,4,groovy
...,...,...
29418,29418,nosqldatabase
29419,29419,programmingdevelopment
29420,29420,programming on win xp788.1
29421,29421,skills win32 programming expertcc++ programming


In [None]:
# Write the temporary LISTING_SKILL table to the temp folder (row index omitted).
listing_skill_df.to_csv(temp_dir+"listing_skills_TEMP.csv", index=False)

### (node) LOCATION

In [None]:
# LOCATION node table: one row per distinct location name, numbered 0..n-1 in
# order of first appearance, plus the constant node label.
location_df = (
    df[['location_name']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename_axis('location_id:ID')
    .reset_index()
    .assign(**{':LABEL': "LOCATION"})
)
location_df

Unnamed: 0,location_id:ID,location_name,:LABEL
0,0,"Atlanta, GA",LOCATION
1,1,"Chicago, IL",LOCATION
2,2,"Schaumburg, IL",LOCATION
3,3,"Bolingbrook, IL",LOCATION
4,4,"New York, NY",LOCATION
...,...,...,...
1399,1399,"San Francisco,",LOCATION
1400,1400,"Saratoga, CA",LOCATION
1401,1401,"Tiburon, CA",LOCATION
1402,1402,"Gold River, CA",LOCATION


In [None]:
# Write the LOCATION node table to the output folder (row index omitted).
location_df.to_csv(output_dir+"location__node.csv", index=False)

### (node) COMPANY

In [None]:
# COMPANY node table: one row per distinct company name, numbered 0..n-1 in
# order of first appearance, plus the constant node label.
company_df = (
    df[['company_name']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename_axis('company_id:ID')
    .reset_index()
    .assign(**{':LABEL': "COMPANY"})
)
company_df

Unnamed: 0,company_id:ID,company_name,:LABEL
0,0,"Digital Intelligence Systems, LLC",COMPANY
1,1,University of Chicago/IT Services,COMPANY
2,2,"Galaxy Systems, Inc.",COMPANY
3,3,TransTech LLC,COMPANY
4,4,Matrix Resources,COMPANY
...,...,...,...
3812,3812,VirtuStream,COMPANY
3813,3813,"Quisk, Inc.",COMPANY
3814,3814,Compusharp Inc.,COMPANY
3815,3815,Bracket Global,COMPANY


In [None]:
# Write the COMPANY node table to the output folder (row index omitted).
company_df.to_csv(output_dir+"company__node.csv", index=False)

### [relation] NEEDS

In [None]:
# NEEDS relation: one row per (listing, skill) pair.
# Each listing's skill list is exploded, the skill name is looked up in
# listing_skill_df to get its id, and the name column is dropped afterwards.
needs_df = (
    df[['listing_id:ID', 'listing_skill_name']]
    .explode('listing_skill_name')
    .dropna()
    .merge(listing_skill_df, on='listing_skill_name')
    .drop(columns=['listing_skill_name'])
    .rename(columns={'listing_id:ID':':START_ID', 'listing_skill_id':':END_ID'})
    .assign(**{':TYPE': "NEEDS"})
)
needs_df

Unnamed: 0,:START_ID,:END_ID,:TYPE
0,0,0,NEEDS
1,0,1,NEEDS
2,299,1,NEEDS
3,310,1,NEEDS
4,491,1,NEEDS
...,...,...,...
120478,16266,29418,NEEDS
120479,16267,29419,NEEDS
120480,16267,29420,NEEDS
120481,16267,29421,NEEDS


In [None]:
# Write the NEEDS relation table to the output folder (row index omitted).
needs_df.to_csv(output_dir+"needs__relation.csv", index=False)

### [relation] LOCATED_IN

In [None]:
# LOCATED_IN relation: listing -> location.
# The location name is looked up in location_df to get its id; the name and the
# node label that come along with the merge are dropped afterwards.
located_in_df = (
    df[['listing_id:ID', 'location_name']]
    .dropna()
    .merge(location_df, on='location_name')
    .drop(columns=['location_name', ':LABEL'])
    .rename(columns={'listing_id:ID':':START_ID', 'location_id:ID':':END_ID'})
    .assign(**{':TYPE': "LOCATED_IN"})
)
located_in_df

Unnamed: 0,:START_ID,:END_ID,:TYPE
0,0,0,LOCATED_IN
1,4,0,LOCATED_IN
2,6,0,LOCATED_IN
3,26,0,LOCATED_IN
4,4646,0,LOCATED_IN
...,...,...,...
16263,15448,1399,LOCATED_IN
16264,15533,1400,LOCATED_IN
16265,15742,1401,LOCATED_IN
16266,15839,1402,LOCATED_IN


In [None]:
# Write the LOCATED_IN relation table to the output folder (row index omitted).
located_in_df.to_csv(output_dir+"located_in__relation.csv", index=False)

### [relation] POSTED

In [None]:
# POSTED relation: company -> listing.
# Note the direction: the company is the start node and the listing the end node,
# so the listing id column comes first but is renamed to :END_ID.
posted_df = (
    df[['listing_id:ID', 'company_name']]
    .dropna()
    .merge(company_df, on='company_name')
    .drop(columns=['company_name', ':LABEL'])
    .rename(columns={'company_id:ID':':START_ID', 'listing_id:ID':':END_ID'})
    .assign(**{':TYPE': "POSTED"})
)
posted_df

Unnamed: 0,:END_ID,:START_ID,:TYPE
0,0,0,POSTED
1,283,0,POSTED
2,655,0,POSTED
3,1021,0,POSTED
4,2055,0,POSTED
...,...,...,...
16263,16211,3812,POSTED
16264,16235,3813,POSTED
16265,16239,3814,POSTED
16266,16249,3815,POSTED


In [None]:
# Write the POSTED relation table to the output folder (row index omitted).
posted_df.to_csv(output_dir+"posted__relation.csv", index=False)