# Notebook 10: Take All NODES and RELATIONS Files and Update :ID fields to Be Able to Export to NEO4J

#### This notebook produces the following data into the _final_neo4j_files_ folder:
```
(OCCUPATION) NODE					occupation__node.csv
occupation_id:ID
occupation_title
occupation_synonyms[]
occupation_description
occupation_salary
:LABEL = "OCCUPATION"

[BELONGS_TO] RELATION					belongs_to__relation.csv
:START_ID = listing_id
:END_ID =  occupation_id
:TYPE = "BELONGS_TO"

(LISTING) NODE						listing__node.csv
listing_id:ID
listing_title
description
:LABEL = "LISTING"

[NEEDS] RELATION					needs__relation.csv
:START_ID = listing_id
:END_ID = skill_id
:TYPE = "NEEDS"

(SKILL) NODE						skill__node.csv
skill_id:ID
skill_name
aliases[]
:LABEL = "SKILL"

[TEACHES] RELATION					teaches__relation.csv
:START_ID = course_id
:END_ID = skill_id
:TYPE = "TEACHES"

(COURSE) NODE						course__node.csv
course_id:ID
course_name
course_difficulty_level
course_url
:LABEL = "COURSE"

[LOCATED_IN] RELATION					located_in__relation.csv
:START_ID = listing_id
:END_ID = location_id
:TYPE = "LOCATED_IN"

(LOCATION) NODE						location__node.csv
location_id:ID
location_name
:LABEL = "LOCATION"

[POSTED] RELATION					posted__relation.csv
:START_ID = company_id
:END_ID = listing_id
:TYPE = "POSTED"

(COMPANY) NODE						company__node.csv
company_id:ID
company_name
:LABEL = "COMPANY"


[HAS_FUTURE] RELATION					has_future__relation.csv
:START_ID = occupation_id
:END_ID = career_outlook_id
:TYPE = "HAS_FUTURE"



(CAREER_OUTLOOK) NODE					career_outlook__node.csv
career_outlook_id:ID
career_outlook
:LABEL = "CAREER_OUTLOOK"

```

In [1]:
import pandas as pd
import ast
from pathlib import Path

In [2]:
# this cell is to support running the notebook in Google Colab

mydrive = ""  # this is when we run locally

# Google Colab:
# from google.colab import drive
# drive.mount('/content/drive')
# mydrive = "/content/drive/MyDrive/DSE 203 — etl/DSE203_Project/"  # this is when we run on COLAB Leslie
# mydrive = "/content/drive/MyDrive/DSE203_Project/"  # this is when we run on COLAB Sergey

# All data folders hang off the same root (`mydrive`), so switching between a
# local run and Colab only requires changing that one variable.
input_dir = mydrive+"input_datasets/"
output_dir = mydrive+"output_datasets/"
temp_dir = mydrive+"temp_datasets/"
final_neo4j_dir = mydrive+"final_neo4j_files/"

# Create the export folder at the exact location the CSVs are written to.
# Building the path from `final_neo4j_dir` (instead of a bare relative name)
# keeps the folder under `mydrive` when the notebook runs on Colab.
Path(final_neo4j_dir).mkdir(parents=True, exist_ok=True)

## Prepare (COURSE)->[TEACHES]->(SKILL)

In [3]:
# Read the COURSE nodes, SKILL nodes and TEACHES edges from the output folder.
course_df, skill_df, teaches_df = (
    pd.read_csv(output_dir + csv_name)
    for csv_name in ('course__node.csv', 'skill__node.csv', 'teaches__relation.csv')
)
course_df.head(10)

Unnamed: 0,course_id:ID,course_name,course_difficulty_level,course_url,:LABEL
0,0,Python and Statistics for Financial Analysis,Advanced,https://www.coursera.org/learn/python-statisti...,COURSE
1,1,Parallel programming,Beginner,https://www.coursera.org/learn/parprog1,COURSE
2,2,Getting Started with Go,Intermediate,https://www.coursera.org/learn/golang-getting-...,COURSE
3,3,TensorFlow for CNNs Transfer Learning,Beginner,https://www.coursera.org/learn/tensorflow-for-...,COURSE
4,4,Image Classification with CNNs using Keras,Beginner,https://www.coursera.org/learn/image-classific...,COURSE
5,5,Create your first test automation script Sele...,Beginner,https://www.coursera.org/learn/create-first-te...,COURSE
6,6,Behavior Driven Development with Selenium and ...,Intermediate,https://www.coursera.org/learn/behavior-driven...,COURSE
7,7,Building Test Automation Framework using Selen...,Beginner,https://www.coursera.org/learn/building-test-a...,COURSE
8,8,Advanced TestNG Framework and Integration with...,Beginner,https://www.coursera.org/learn/Advanced-testng...,COURSE
9,9,Automate an ecommerce web application using Se...,Beginner,https://www.coursera.org/learn/automate-e-comm...,COURSE


In [4]:
# Show the first three SKILL rows to check the id column before it is renumbered.
skill_df.head(3)

Unnamed: 0,skill_id:ID,skill_name,aliases[],:LABEL
0,0,salesforce,salesforce,SKILL
1,1,mobilesdk,mobilesdk,SKILL
2,2,salesforce1 platform and lightning components,salesforce1 platform and lightning components,SKILL


In [5]:
# Show the first three TEACHES edges (course id -> skill id) before any id is shifted.
teaches_df.head(3)

Unnamed: 0,:START_ID,:END_ID,:TYPE
0,0,18,TEACHES
1,3,18,TEACHES
2,4,18,TEACHES


#### We have to increment SKILL IDs, so they don't overlap with COURSE IDs and also update relations

In [6]:
# COURSE ids keep their original values; the id right after the largest one
# is the offset that gets added to every SKILL id.
last_node_course = course_df['course_id:ID'].max()
next_node_skill = last_node_course + 1
next_node_skill

20

In [7]:
# Offset every SKILL node id so the SKILL range begins after the COURSE range.
skill_df['skill_id:ID'] = skill_df['skill_id:ID'].add(next_node_skill)
skill_df

Unnamed: 0,skill_id:ID,skill_name,aliases[],:LABEL
0,20,salesforce,salesforce,SKILL
1,21,mobilesdk,mobilesdk,SKILL
2,22,salesforce1 platform and lightning components,salesforce1 platform and lightning components,SKILL
3,23,lightning connect,lightning connect,SKILL
4,24,apex,apex,SKILL
...,...,...,...,...
164,184,reactjs,reactjs,SKILL
165,185,tdd testdriven,tdd testdriven,SKILL
166,186,mvcdesign,mvcdesign,SKILL
167,187,pdf resume pdfwe,pdf resume pdfwe,SKILL


In [8]:
# TEACHES edges end at a skill, so the end ids take the same offset as the SKILL nodes.
teaches_df[":END_ID"] = teaches_df[":END_ID"].add(next_node_skill)
teaches_df.tail(10)

Unnamed: 0,:START_ID,:END_ID,:TYPE
4,18,138,TEACHES
5,2,43,TEACHES
6,5,170,TEACHES
7,7,170,TEACHES
8,5,57,TEACHES
9,7,57,TEACHES
10,8,57,TEACHES
11,9,57,TEACHES
12,9,40,TEACHES
13,14,51,TEACHES


In [9]:
# Export the COURSE / TEACHES / SKILL files: rows sorted by id, duplicate edges dropped.
(course_df
 .sort_values('course_id:ID')
 .to_csv(final_neo4j_dir + 'course__node.csv', index=False))
(teaches_df
 .sort_values(':START_ID')
 .drop_duplicates()
 .to_csv(final_neo4j_dir + 'teaches__relation.csv', index=False))
(skill_df
 .sort_values('skill_id:ID')
 .to_csv(final_neo4j_dir + 'skill__node.csv', index=False))

In [10]:
# The id right after the largest SKILL id is the offset used for LISTING ids.
last_node_skill = skill_df['skill_id:ID'].max()
next_node_listing = last_node_skill + 1
next_node_listing

189

## Prepare (LISTING)->[NEEDS]->(SKILL)

In [11]:
# Read the LISTING nodes and the NEEDS edges from the output folder.
listing_df, needs_df = (
    pd.read_csv(output_dir + csv_name)
    for csv_name in ('listing__node.csv', 'needs__relation.csv')
)
listing_df.head(10)

Unnamed: 0,listing_id:ID,listing_title,description,:LABEL
0,0,Salesforce Developer,"Location: Indianapolis, INDo not send fake pro...",LISTING
1,1,Applications Manager,We're conducting a search for an Applications ...,LISTING
2,2,Sr. Java Developer,OverviewWe are seeking a Senior Java Developer...,LISTING
3,3,Cloud Developer,Experience building scalable mobile applicatio...,LISTING
4,4,"Buy Side -Web Developer - Javascript, HTML",Description:Investment Management - Web Develo...,LISTING
5,5,SQL / MySQL Developer,A well-known e-commerce company is in need of ...,LISTING
6,6,"PeopleSoft Testing Manager//San Francisco, CA.",PeopleSoft Test Manager opportunity!! Start: ...,LISTING
7,7,IT System Administrator,Please submit resume to vishakha AT PTSOL dot...,LISTING
8,8,"Business Analyst, IT",Business AnalystThis is an experience business...,LISTING
9,9,AUTOMATION TEST ENGINEER,Looking for Selenium engineers. must have soli...,LISTING


In [12]:
# Show the first NEEDS edges (listing id -> skill id) before either side is shifted.
needs_df.head(10)

Unnamed: 0,:START_ID,:END_ID,:TYPE
0,0,0,NEEDS
1,0,1,NEEDS
2,0,2,NEEDS
3,0,3,NEEDS
4,0,4,NEEDS
5,0,5,NEEDS
6,1,6,NEEDS
7,1,7,NEEDS
8,1,8,NEEDS
9,2,9,NEEDS


In [13]:
# Offset every LISTING node id by the LISTING offset so it lands in its own id range.
listing_df['listing_id:ID'] = listing_df['listing_id:ID'].add(next_node_listing)
listing_df.head(10)

Unnamed: 0,listing_id:ID,listing_title,description,:LABEL
0,189,Salesforce Developer,"Location: Indianapolis, INDo not send fake pro...",LISTING
1,190,Applications Manager,We're conducting a search for an Applications ...,LISTING
2,191,Sr. Java Developer,OverviewWe are seeking a Senior Java Developer...,LISTING
3,192,Cloud Developer,Experience building scalable mobile applicatio...,LISTING
4,193,"Buy Side -Web Developer - Javascript, HTML",Description:Investment Management - Web Develo...,LISTING
5,194,SQL / MySQL Developer,A well-known e-commerce company is in need of ...,LISTING
6,195,"PeopleSoft Testing Manager//San Francisco, CA.",PeopleSoft Test Manager opportunity!! Start: ...,LISTING
7,196,IT System Administrator,Please submit resume to vishakha AT PTSOL dot...,LISTING
8,197,"Business Analyst, IT",Business AnalystThis is an experience business...,LISTING
9,198,AUTOMATION TEST ENGINEER,Looking for Selenium engineers. must have soli...,LISTING


In [14]:
# NEEDS edges run LISTING -> SKILL: each endpoint takes the offset of its node type.
needs_df[':START_ID'] = needs_df[':START_ID'].add(next_node_listing)
needs_df[':END_ID'] = needs_df[':END_ID'].add(next_node_skill)
needs_df

Unnamed: 0,:START_ID,:END_ID,:TYPE
0,189,20,NEEDS
1,189,21,NEEDS
2,189,22,NEEDS
3,189,23,NEEDS
4,189,24,NEEDS
...,...,...,...
177,202,184,NEEDS
178,202,185,NEEDS
179,202,186,NEEDS
180,202,187,NEEDS


In [15]:
# Export the LISTING nodes and NEEDS edges: rows sorted by id, duplicate edges dropped.
(listing_df
 .sort_values('listing_id:ID')
 .to_csv(final_neo4j_dir + 'listing__node.csv', index=False))
(needs_df
 .sort_values(':START_ID')
 .drop_duplicates()
 .to_csv(final_neo4j_dir + 'needs__relation.csv', index=False))

In [16]:
# The id right after the largest LISTING id is the offset used for LOCATION ids.
last_node_listing = listing_df['listing_id:ID'].max()
next_node_location = last_node_listing + 1
next_node_location

203

## Prepare (LISTING)->[LOCATED_IN]->(LOCATION)

In [17]:
# Read the LOCATION nodes and the LOCATED_IN edges from the output folder.
location_df, located_in_df = (
    pd.read_csv(output_dir + csv_name)
    for csv_name in ('location__node.csv', 'located_in__relation.csv')
)
location_df.head(10)

Unnamed: 0,location_id:ID,location_name,:LABEL
0,0,"Indianapolis, IN",LOCATION
1,1,"Bellevue, WA",LOCATION
2,2,"Coconut Creek, FL",LOCATION
3,3,"Hartford, CT",LOCATION
4,4,"New York, NY",LOCATION
5,5,"Seattle, WA",LOCATION
6,6,"San Francisco, CA",LOCATION
7,7,"Woodland Hills, CA",LOCATION
8,8,"Mount Laurel, NJ",LOCATION
9,9,"Atlanta, GA",LOCATION


In [18]:
# Show the first LOCATED_IN edges (listing id -> location id) before either side is shifted.
located_in_df.head(10)

Unnamed: 0,:START_ID,:END_ID,:TYPE
0,0,0,LOCATED_IN
1,1,1,LOCATED_IN
2,2,2,LOCATED_IN
3,3,3,LOCATED_IN
4,4,4,LOCATED_IN
5,13,4,LOCATED_IN
6,5,5,LOCATED_IN
7,6,6,LOCATED_IN
8,7,7,LOCATED_IN
9,8,8,LOCATED_IN


In [19]:
# Offset every LOCATION node id by the LOCATION offset so it lands in its own id range.
location_df['location_id:ID'] = location_df['location_id:ID'].add(next_node_location)
location_df.head(10)

Unnamed: 0,location_id:ID,location_name,:LABEL
0,203,"Indianapolis, IN",LOCATION
1,204,"Bellevue, WA",LOCATION
2,205,"Coconut Creek, FL",LOCATION
3,206,"Hartford, CT",LOCATION
4,207,"New York, NY",LOCATION
5,208,"Seattle, WA",LOCATION
6,209,"San Francisco, CA",LOCATION
7,210,"Woodland Hills, CA",LOCATION
8,211,"Mount Laurel, NJ",LOCATION
9,212,"Atlanta, GA",LOCATION


In [20]:
# LOCATED_IN edges run LISTING -> LOCATION: each endpoint takes the offset of its node type.
located_in_df[':START_ID'] = located_in_df[':START_ID'].add(next_node_listing)
located_in_df[':END_ID'] = located_in_df[':END_ID'].add(next_node_location)
located_in_df

Unnamed: 0,:START_ID,:END_ID,:TYPE
0,189,203,LOCATED_IN
1,190,204,LOCATED_IN
2,191,205,LOCATED_IN
3,192,206,LOCATED_IN
4,193,207,LOCATED_IN
5,202,207,LOCATED_IN
6,194,208,LOCATED_IN
7,195,209,LOCATED_IN
8,196,210,LOCATED_IN
9,197,211,LOCATED_IN


In [21]:
# Export the LOCATION nodes and LOCATED_IN edges: rows sorted by id, duplicate edges dropped.
(location_df
 .sort_values('location_id:ID')
 .to_csv(final_neo4j_dir + 'location__node.csv', index=False))
(located_in_df
 .sort_values(':START_ID')
 .drop_duplicates()
 .to_csv(final_neo4j_dir + 'located_in__relation.csv', index=False))

In [22]:
# The id right after the largest LOCATION id is the offset used for COMPANY ids.
last_node_location = location_df['location_id:ID'].max()
next_node_company = last_node_location + 1
next_node_company

216

## Prepare (COMPANY)->[POSTED]->(LISTING)

In [23]:
# Read the COMPANY nodes and the POSTED edges from the output folder.
company_df, posted_df = (
    pd.read_csv(output_dir + csv_name)
    for csv_name in ('company__node.csv', 'posted__relation.csv')
)
company_df.head(10)

Unnamed: 0,company_id:ID,company_name,:LABEL
0,0,Naztec International Group LLC,COMPANY
1,1,MACRO.CCS,COMPANY
2,2,PROTECH,COMPANY
3,3,IT People Corporation,COMPANY
4,4,Analytic Recruiting Inc,COMPANY
5,5,"24 Seven, Inc.",COMPANY
6,6,"Lodestar Consulting, LLC",COMPANY
7,7,Progressive Technology Solutions,COMPANY
8,8,"firstPRO, Inc.",COMPANY
9,9,"Digital Intelligence Systems, LLC",COMPANY


In [24]:
# Keep only the three Neo4j relation columns. The explicit .copy() makes the
# result an independent frame, so assigning into its id columns afterwards
# does not write to a slice of the loaded frame (avoids SettingWithCopyWarning).
posted_df = posted_df[[':START_ID', ':END_ID', ':TYPE']].copy()
posted_df.head(10)

Unnamed: 0,:START_ID,:END_ID,:TYPE
0,0,0,POSTED
1,1,1,POSTED
2,2,2,POSTED
3,3,3,POSTED
4,4,4,POSTED
5,5,5,POSTED
6,6,6,POSTED
7,7,7,POSTED
8,8,8,POSTED
9,9,9,POSTED


In [25]:
# Offset every COMPANY node id by the COMPANY offset so it lands in its own id range.
company_df['company_id:ID'] = company_df['company_id:ID'].add(next_node_company)
company_df.head(10)

Unnamed: 0,company_id:ID,company_name,:LABEL
0,216,Naztec International Group LLC,COMPANY
1,217,MACRO.CCS,COMPANY
2,218,PROTECH,COMPANY
3,219,IT People Corporation,COMPANY
4,220,Analytic Recruiting Inc,COMPANY
5,221,"24 Seven, Inc.",COMPANY
6,222,"Lodestar Consulting, LLC",COMPANY
7,223,Progressive Technology Solutions,COMPANY
8,224,"firstPRO, Inc.",COMPANY
9,225,"Digital Intelligence Systems, LLC",COMPANY


In [26]:
# POSTED edges run COMPANY -> LISTING: each endpoint takes the offset of its node type.
posted_df[':START_ID'] = posted_df[':START_ID'].add(next_node_company)
posted_df[':END_ID'] = posted_df[':END_ID'].add(next_node_listing)
posted_df

Unnamed: 0,:START_ID,:END_ID,:TYPE
0,216,189,POSTED
1,217,190,POSTED
2,218,191,POSTED
3,219,192,POSTED
4,220,193,POSTED
5,221,194,POSTED
6,222,195,POSTED
7,223,196,POSTED
8,224,197,POSTED
9,225,198,POSTED


In [27]:
# Export the COMPANY nodes and POSTED edges: rows sorted by id, duplicate edges dropped.
(company_df
 .sort_values('company_id:ID')
 .to_csv(final_neo4j_dir + 'company__node.csv', index=False))
(posted_df
 .sort_values(':START_ID')
 .drop_duplicates()
 .to_csv(final_neo4j_dir + 'posted__relation.csv', index=False))

In [28]:
# The id right after the largest COMPANY id is the offset used for OCCUPATION ids.
# The maximum is stored under its own name so it no longer overwrites
# `last_node_location`, which holds the LOCATION maximum.
last_node_company = company_df['company_id:ID'].max()
next_node_occupation = last_node_company + 1
next_node_occupation

229

## Prepare (LISTING)->[BELONGS_TO]->(OCCUPATION)

In [29]:
# Read the OCCUPATION nodes and the BELONGS_TO edges from the output folder.
occupation_df, belongs_to_df = (
    pd.read_csv(output_dir + csv_name)
    for csv_name in ('occupation__node.csv', 'belongs_to__relation.csv')
)
occupation_df.head(10)

Unnamed: 0,occupation_id:ID,onet_code,occupation_title,occupation_synonyms,occupation_description,occupation_salary,:LABEL
0,0,13-2011.00,Accountants and Auditors,"['Accountant', 'Accounting Officer', 'Audit Pa...","Examine, analyze, and interpret accounting rec...",77250.0,OCCUPATION
1,1,27-2011.00,Actors,"['Actor', 'Actress', 'Comedian', 'Comic', 'Com...","Play parts in stage, television, radio, video,...",,OCCUPATION
2,2,15-2011.00,Actuaries,"['Actuarial Analyst', 'Actuarial Associate', '...","Analyze statistical data, such as mortality, a...",105900.0,OCCUPATION
3,3,29-1291.00,Acupuncturists,"['Acupuncture Physician', 'Acupuncture Provide...","Diagnose, treat, and prevent disorders by stim...",60570.0,OCCUPATION
4,4,29-1141.01,Acute Care Nurses,"['Cardiac Interventional Care Nurse', 'Charge ...",Provide advanced nursing care for patients wit...,77600.0,OCCUPATION
5,5,25-2059.01,Adapted Physical Education Specialists,"['Adapted Physical Activity Specialist', 'Adap...",Provide individualized physical education inst...,61720.0,OCCUPATION
6,6,51-9191.00,Adhesive Bonding Machine Operators and Tenders,"['Coater Operator', 'Glue Line Operator', 'Glu...",Operate or tend bonding machines that use adhe...,37630.0,OCCUPATION
7,7,23-1021.00,"Administrative Law Judges, Adjudicators, and H...","['Adjudications Specialist', 'Adjudicator', 'A...",Conduct hearings to recommend or make decision...,102550.0,OCCUPATION
8,8,11-3012.00,Administrative Services Managers,"['Administrative Coordinator', 'Administrative...","Plan, direct, or coordinate one or more admini...",100170.0,OCCUPATION
9,9,25-3011.00,"Adult Basic Education, Adult Secondary Educati...",['Adult Basic Education Instructor (ABE Instru...,Teach or instruct out-of-school youths and adu...,59720.0,OCCUPATION


In [30]:
# Make occupation_synonyms a list for Neo4j: the column holds a stringified
# Python list (e.g. "['Accountant', 'Accounting Officer']") and is flattened
# into a single ';'-separated string. Missing values become "no synonyms".
#
# - fillna is assigned back through the frame instead of using inplace=True on
#   attribute access, which is chained assignment and is not guaranteed to
#   write through to the frame.
# - regex=False makes every pattern a literal; '[' and ']' must not depend on
#   the version-specific default of `regex`.
synonyms = occupation_df['occupation_synonyms'].fillna("no synonyms")
for old, new in (('[', ''), (']', ''), ("'", ''), (", ", ';')):
    synonyms = synonyms.str.replace(old, new, regex=False)
occupation_df['occupation_synonyms'] = synonyms

# The '[]' suffix in the header follows the same convention as the SKILL 'aliases[]' column.
occupation_df.rename(columns={'occupation_synonyms': 'occupation_synonyms[]'}, inplace=True)
occupation_df

  """
  


Unnamed: 0,occupation_id:ID,onet_code,occupation_title,occupation_synonyms[],occupation_description,occupation_salary,:LABEL
0,0,13-2011.00,Accountants and Auditors,Accountant;Accounting Officer;Audit Partner;Au...,"Examine, analyze, and interpret accounting rec...",77250.0,OCCUPATION
1,1,27-2011.00,Actors,Actor;Actress;Comedian;Comic;Community Theater...,"Play parts in stage, television, radio, video,...",,OCCUPATION
2,2,15-2011.00,Actuaries,Actuarial Analyst;Actuarial Associate;Actuaria...,"Analyze statistical data, such as mortality, a...",105900.0,OCCUPATION
3,3,29-1291.00,Acupuncturists,Acupuncture Physician;Acupuncture Provider;Acu...,"Diagnose, treat, and prevent disorders by stim...",60570.0,OCCUPATION
4,4,29-1141.01,Acute Care Nurses,Cardiac Interventional Care Nurse;Charge Nurse...,Provide advanced nursing care for patients wit...,77600.0,OCCUPATION
...,...,...,...,...,...,...,...
1011,1011,51-7099.00,"Woodworkers, All Other",no synonyms,All woodworkers not listed separately.,,OCCUPATION
1012,1012,51-7042.00,"Woodworking Machine Setters, Operators, and Te...",Boring Machine Operator;Cabinet Maker;Knot Saw...,"Set up, operate, or tend woodworking machines,...",36090.0,OCCUPATION
1013,1013,43-9022.00,Word Processors and Typists,Clerk Specialist;Clerk Typist;Keyboard Special...,"Use word processor, computer, or typewriter to...",44030.0,OCCUPATION
1014,1014,27-3043.00,Writers and Authors,Advertisement Agency Copywriter (Ad Agency Cop...,"Originate and prepare written material, such a...",69510.0,OCCUPATION


In [31]:
# Replace the three column headers positionally with the Neo4j relation header names.
# NOTE(review): this assumes the loaded columns are already ordered start, end, type —
# confirm against the notebook that writes belongs_to__relation.csv.
belongs_to_df.columns = [':START_ID', ':END_ID', ':TYPE']
belongs_to_df

Unnamed: 0,:START_ID,:END_ID,:TYPE
0,0.0,889.0,BELONGS_TO
1,1.0,11.0,BELONGS_TO
2,2.0,188.0,BELONGS_TO
3,3.0,889.0,BELONGS_TO
4,5.0,889.0,BELONGS_TO
5,7.0,243.0,BELONGS_TO
6,8.0,499.0,BELONGS_TO
7,9.0,835.0,BELONGS_TO
8,10.0,16.0,BELONGS_TO
9,11.0,117.0,BELONGS_TO


In [32]:
# Offset every OCCUPATION node id by the OCCUPATION offset so it lands in its own id range.
occupation_df['occupation_id:ID'] = occupation_df['occupation_id:ID'].add(next_node_occupation)
occupation_df.head(10)

Unnamed: 0,occupation_id:ID,onet_code,occupation_title,occupation_synonyms[],occupation_description,occupation_salary,:LABEL
0,229,13-2011.00,Accountants and Auditors,Accountant;Accounting Officer;Audit Partner;Au...,"Examine, analyze, and interpret accounting rec...",77250.0,OCCUPATION
1,230,27-2011.00,Actors,Actor;Actress;Comedian;Comic;Community Theater...,"Play parts in stage, television, radio, video,...",,OCCUPATION
2,231,15-2011.00,Actuaries,Actuarial Analyst;Actuarial Associate;Actuaria...,"Analyze statistical data, such as mortality, a...",105900.0,OCCUPATION
3,232,29-1291.00,Acupuncturists,Acupuncture Physician;Acupuncture Provider;Acu...,"Diagnose, treat, and prevent disorders by stim...",60570.0,OCCUPATION
4,233,29-1141.01,Acute Care Nurses,Cardiac Interventional Care Nurse;Charge Nurse...,Provide advanced nursing care for patients wit...,77600.0,OCCUPATION
5,234,25-2059.01,Adapted Physical Education Specialists,Adapted Physical Activity Specialist;Adapted P...,Provide individualized physical education inst...,61720.0,OCCUPATION
6,235,51-9191.00,Adhesive Bonding Machine Operators and Tenders,Coater Operator;Glue Line Operator;Glue Reel O...,Operate or tend bonding machines that use adhe...,37630.0,OCCUPATION
7,236,23-1021.00,"Administrative Law Judges, Adjudicators, and H...",Adjudications Specialist;Adjudicator;Administr...,Conduct hearings to recommend or make decision...,102550.0,OCCUPATION
8,237,11-3012.00,Administrative Services Managers,Administrative Coordinator;Administrative Dire...,"Plan, direct, or coordinate one or more admini...",100170.0,OCCUPATION
9,238,25-3011.00,"Adult Basic Education, Adult Secondary Educati...",Adult Basic Education Instructor (ABE Instruct...,Teach or instruct out-of-school youths and adu...,59720.0,OCCUPATION


In [33]:
# BELONGS_TO edges run LISTING -> OCCUPATION: each endpoint takes the offset of its node type.
belongs_to_df[':START_ID'] = belongs_to_df[':START_ID'].add(next_node_listing)
belongs_to_df[':END_ID'] = belongs_to_df[':END_ID'].add(next_node_occupation)
belongs_to_df

Unnamed: 0,:START_ID,:END_ID,:TYPE
0,189.0,1118.0,BELONGS_TO
1,190.0,240.0,BELONGS_TO
2,191.0,417.0,BELONGS_TO
3,192.0,1118.0,BELONGS_TO
4,194.0,1118.0,BELONGS_TO
5,196.0,472.0,BELONGS_TO
6,197.0,728.0,BELONGS_TO
7,198.0,1064.0,BELONGS_TO
8,199.0,245.0,BELONGS_TO
9,200.0,346.0,BELONGS_TO


In [34]:
# Cast both endpoint columns to int so the ids are written without a trailing ".0".
belongs_to_df = belongs_to_df.astype({':START_ID': int, ':END_ID': int})
belongs_to_df

Unnamed: 0,:START_ID,:END_ID,:TYPE
0,189,1118,BELONGS_TO
1,190,240,BELONGS_TO
2,191,417,BELONGS_TO
3,192,1118,BELONGS_TO
4,194,1118,BELONGS_TO
5,196,472,BELONGS_TO
6,197,728,BELONGS_TO
7,198,1064,BELONGS_TO
8,199,245,BELONGS_TO
9,200,346,BELONGS_TO


In [35]:
# Export the OCCUPATION nodes and BELONGS_TO edges: rows sorted by id, duplicate edges dropped.
(occupation_df
 .sort_values('occupation_id:ID')
 .to_csv(final_neo4j_dir + 'occupation__node.csv', index=False))
(belongs_to_df
 .sort_values(':START_ID')
 .drop_duplicates()
 .to_csv(final_neo4j_dir + 'belongs_to__relation.csv', index=False))

In [36]:
# The id right after the largest OCCUPATION id is the offset used for CAREER_OUTLOOK ids.
last_node_occupation = occupation_df['occupation_id:ID'].max()
next_node_career_outlook = last_node_occupation + 1
next_node_career_outlook

1245

## Prepare (OCCUPATION)->[HAS_FUTURE]->(CAREER_OUTLOOK)

In [37]:
# Read the CAREER_OUTLOOK nodes and the HAS_FUTURE edges from the output folder.
career_outlook_df, has_future_df = (
    pd.read_csv(output_dir + csv_name)
    for csv_name in ('career_outlook__node.csv', 'has_future__relation.csv')
)
career_outlook_df.head(10)

Unnamed: 0,career_outlook_id:ID,career_outlook,:LABEL
0,0,Bright,CAREER_OUTLOOK
1,1,Average,CAREER_OUTLOOK
2,2,Below Average,CAREER_OUTLOOK


In [38]:
# Keep only the three Neo4j relation columns. The explicit .copy() makes the
# result an independent frame, so assigning into its id columns afterwards
# does not write to a slice of the loaded frame (avoids SettingWithCopyWarning).
has_future_df = has_future_df[[':START_ID', ':END_ID', ':TYPE']].copy()
has_future_df

Unnamed: 0,:START_ID,:END_ID,:TYPE
0,0,0,HAS_FUTURE
1,1,0,HAS_FUTURE
2,2,0,HAS_FUTURE
3,4,0,HAS_FUTURE
4,10,0,HAS_FUTURE
...,...,...,...
918,1006,2,HAS_FUTURE
919,1008,2,HAS_FUTURE
920,1012,2,HAS_FUTURE
921,1013,2,HAS_FUTURE


In [39]:
# Offset every CAREER_OUTLOOK node id by the CAREER_OUTLOOK offset so it lands in its own id range.
career_outlook_df['career_outlook_id:ID'] = career_outlook_df['career_outlook_id:ID'].add(next_node_career_outlook)
career_outlook_df.head(10)

Unnamed: 0,career_outlook_id:ID,career_outlook,:LABEL
0,1245,Bright,CAREER_OUTLOOK
1,1246,Average,CAREER_OUTLOOK
2,1247,Below Average,CAREER_OUTLOOK


In [40]:
# HAS_FUTURE edges run OCCUPATION -> CAREER_OUTLOOK: each endpoint takes the offset of its node type.
has_future_df[':START_ID'] = has_future_df[':START_ID'].add(next_node_occupation)
has_future_df[':END_ID'] = has_future_df[':END_ID'].add(next_node_career_outlook)
has_future_df

Unnamed: 0,:START_ID,:END_ID,:TYPE
0,229,1245,HAS_FUTURE
1,230,1245,HAS_FUTURE
2,231,1245,HAS_FUTURE
3,233,1245,HAS_FUTURE
4,239,1245,HAS_FUTURE
...,...,...,...
918,1235,1247,HAS_FUTURE
919,1237,1247,HAS_FUTURE
920,1241,1247,HAS_FUTURE
921,1242,1247,HAS_FUTURE


In [41]:
# fix for career_outlook: force the label column to the CAREER_OUTLOOK label on every row
career_outlook_df[':LABEL'] = 'CAREER_OUTLOOK'

# Export the CAREER_OUTLOOK nodes and HAS_FUTURE edges: rows sorted by id, duplicate edges dropped.
(career_outlook_df
 .sort_values('career_outlook_id:ID')
 .to_csv(final_neo4j_dir + 'career_outlook__node.csv', index=False))
(has_future_df
 .sort_values(':START_ID')
 .drop_duplicates()
 .to_csv(final_neo4j_dir + 'has_future__relation.csv', index=False))

In [42]:
# First id still unused after the CAREER_OUTLOOK nodes.
# NOTE(review): no cell visible here consumes `next_node_`; it appears to be kept
# only as the starting offset for any node type added later — confirm.
last_node_career_outlook = career_outlook_df['career_outlook_id:ID'].max()
next_node_ = last_node_career_outlook + 1
next_node_

1248

## Export to NEO4J: