## Data Cleaning
### There is no need to run the data cleaning section; it is included only to document the pre-processing that was applied to our data.

#### The script below identifies the industry names in 'company_industries.csv' that are absent from 'industries.csv', assigns them new IDs, appends them to 'industries.csv', and replaces the "industry" column with "industry_id" in 'company_industries.csv'.

In [None]:
import pandas as pd


# Load the industry lookup table and the company -> industry-name mapping.
industries_df = pd.read_csv('industries.csv')
companies_industry_df = pd.read_csv('company_industries.csv')

# Industry names used by companies but missing from the lookup table.
# Missing names (NaN) are dropped first: nunique() ignores them while unique()
# keeps them, which would otherwise give the two columns of `new_industries`
# different lengths.
industry_names = companies_industry_df['industry'].dropna()
unknown_industries = industry_names[~industry_names.isin(industries_df['industry_name'])]
new_industry_names = unknown_industries.unique()

unique_unknown_industries = len(new_industry_names)
print(f"Number of unique industry names to be added to industries.csv: {unique_unknown_industries}")

# New IDs continue right after the largest existing industry_id.
first_new_id = industries_df['industry_id'].max() + 1
new_ids = range(first_new_id, first_new_id + unique_unknown_industries)
new_industries = pd.DataFrame({'industry_id': new_ids, 'industry_name': new_industry_names})

industries_df = pd.concat([industries_df, new_industries], ignore_index=True)

# Look up the industry_id of every company row by industry name.
result_df = pd.merge(companies_industry_df, industries_df, left_on='industry', right_on='industry_name', how='left')

# Keep only the id; the textual industry columns are now redundant.
result_df = result_df.drop(columns=['industry', 'industry_name'])

industries_df.to_csv('industries.csv', index=False)
# Write back to the file that was read above, so the 'industry' column is
# replaced by 'industry_id' in 'company_industries.csv' itself.
result_df.to_csv('company_industries.csv', index=False)


#### Performed data cleaning on 'companies.csv', including removal of dots and dashes in the 'state' column, elimination of accent characters, and consolidation of multiple spaces in the 'state' field, updating the original file.

In [None]:
import pandas as pd
import re
from unidecode import unidecode

file_path = 'companies.csv'

# Read 'company_size' as a nullable integer so rows without a size stay missing.
data_types = {'company_size': pd.Int64Dtype()}

data = pd.read_csv(file_path, dtype=data_types)

# Normalise the 'state' column. Every step below skips missing values, so an
# absent state stays absent instead of being written out as the text "nan".
state = data['state']

# Remove dots (.) and dashes (-)
state = state.str.replace('[.]', '', regex=True).str.replace('[-]', '', regex=True)

# Remove accent characters
state = state.map(unidecode, na_action='ignore')

# Trim the ends and collapse runs of whitespace into a single space
state = state.str.strip().str.replace(r'\s+', ' ', regex=True)

data['state'] = state

data.to_csv(file_path, index=False)

print(f"Modifications complete. Updated data written to '{file_path}'.")


#### Removed the unnecessary columns 'formatted_work_type' and 'scraped' from 'job_postings.csv'.

In [None]:
import pandas as pd

file_path = 'job_postings.csv'
columns_to_drop = ['formatted_work_type', 'scraped']

# Load the postings without the two unused columns, then overwrite the file.
data = pd.read_csv(file_path).drop(columns=columns_to_drop)

data.to_csv(file_path, index=False)


#### The code reads a CSV file containing job postings, removes emojis from the 'description' column using regex, and saves the modified data directly back to the original file without creating a new CSV file. This process ensures the original file is updated with the modified 'description' column without duplication.

In [None]:
import pandas as pd
import regex as re

# Job postings whose 'description' text is cleaned in this cell.
file_path = 'job_postings.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Function to remove emojis from text
# Compiled once, when the cell runs, instead of on every call.
# Each range below is a block of pictographic symbols. The ranges deliberately
# stay clear of ordinary letters: a single span from U+24C2 to U+1F251 would
# also delete CJK, Hangul, kana and fullwidth characters.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
    "\U0001F680-\U0001F6FF"  # Transport & Map Symbols
    "\U0001F700-\U0001F77F"  # Alchemical Symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Extended People
    "\U0001FA70-\U0001FAFF"  # Various Symbols
    "\U0001F000-\U0001F251"  # Tiles, cards, enclosed characters, flags
    "\U00002702-\U000027B0"  # Dingbats
    "\U00002600-\U000026FF"  # Miscellaneous Symbols
    "\U00002B00-\U00002BFF"  # Miscellaneous Symbols and Arrows
    "\U000024C2"             # Circled M
    "\U0000FE0F"             # Emoji variation selector
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return `text` with emoji and pictographic symbols removed.

    Parameters
    ----------
    text : object
        Cell value from the 'description' column.

    Returns
    -------
    str
        For a string input, the same text without any character matched by
        `_EMOJI_PATTERN`; letters of every script are left untouched.
        Any other input is returned as `str(text)`.
        NOTE(review): a missing value (NaN) therefore becomes the text "nan";
        confirm that this is what the downstream loader expects.
    """
    if isinstance(text, str):
        return _EMOJI_PATTERN.sub('', text)
    return str(text)

# Clean every description, then overwrite the source CSV with the result.
data['description'] = data['description'].map(remove_emojis)

data.to_csv(file_path, index=False)


#### Modified the 'company_specialities.csv' file by replacing the 'speciality' column with 'speciality_id'. Each company_id is mapped to a speciality_id taken from the list of unique specialities, which is saved to 'specialities.csv'.

In [None]:
import pandas as pd

# Company master data and the raw company -> speciality-text table.
companies_df = pd.read_csv('companies.csv')
company_specialities_df = pd.read_csv('company_specialities.csv')

# Number the distinct speciality texts 1..n in order of first appearance.
unique_specialties = company_specialities_df['speciality'].unique()
specialities_mapping = dict(zip(unique_specialties, range(1, len(unique_specialties) + 1)))

# Lookup table: one row per speciality text with its id.
specialities_df = pd.DataFrame({
    'speciality': list(specialities_mapping),
    'speciality_id': list(specialities_mapping.values()),
})

# Swap the speciality text for its id.
company_specialities_df['speciality_id'] = company_specialities_df['speciality'].map(specialities_mapping)
company_specialities_df = company_specialities_df.drop(columns='speciality')

# The inner join keeps only rows whose company_id also occurs in companies.csv;
# afterwards only the two key columns are retained.
merged_df = pd.merge(company_specialities_df, companies_df, on='company_id')
merged_df = merged_df[['company_id', 'speciality_id']]

merged_df.to_csv('company_specialities.csv', index=False)
specialities_df.to_csv('specialities.csv', index=False)


#### Add benefit_id to the benefits.csv file

In [None]:
import pandas as pd

file_path = 'benefits.csv'
data = pd.read_csv(file_path)

# Number the rows 1..n in file order.
data = data.assign(benefit_id=range(1, len(data) + 1))

# Put 'benefit_id' first and keep every other column in its existing order.
cols = ['benefit_id', *(name for name in data.columns if name != 'benefit_id')]
data = data[cols]

# The result goes to a new file; 'benefits.csv' itself is left unchanged.
data.to_csv('benefit_new.csv', index=False)

#### Convert the timestamp columns of the job postings to dates (YYYY-MM-DD) so that they are easier to query

In [None]:
import pandas as pd
from datetime import datetime


df = pd.read_csv("job_posting.csv")

# Each of these columns is parsed as a millisecond Unix timestamp and then
# replaced by its calendar date as 'YYYY-MM-DD' text (the time part is dropped).
for column in ('listed_time', 'expiry', 'original_listed_time', 'closed_time'):
    parsed = pd.to_datetime(df[column], unit='ms')
    df[column] = parsed.dt.strftime('%Y-%m-%d')

df.to_csv("job_posting_conv_new.csv", index=False)

#### Convert the timestamp column of the employee counts to dates (YYYY-MM-DD) so that it is easier to query

In [None]:
import pandas as pd
from datetime import datetime


df = pd.read_csv("employee_counts.csv")

# 'time_recorded' is parsed as a Unix timestamp in seconds; only the calendar
# date is kept, as 'YYYY-MM-DD' text.
recorded_at = pd.to_datetime(df['time_recorded'], unit='s')
df['time_recorded'] = recorded_at.dt.strftime('%Y-%m-%d')

df.to_csv("employee_counts_conv.csv", index=False)

# Project

## Populate the Job Posting Ontology

In [1]:
# required libraries
import pandas as pd
import os
from pathlib import Path

from rdflib import Graph, Literal, RDF, URIRef, Namespace, BNode, OWL, RDFS

from rdflib.namespace import FOAF, XSD
import datetime

In [2]:
# parameters and URLs
# `path` is the parent of the current working directory; every input below
# except the job postings is located under '<path>/GraphDB/data/'.
_working_dir = Path(os.path.abspath(os.getcwd()))
path = str(_working_dir.parent.absolute())

# Relative to the working directory, unlike the other inputs.
jobPostingUrl = 'data/job_posting.csv'

# company details
_company_details_dir = path + '/GraphDB/data/company_details/'
companiesUrl = _company_details_dir + 'companies.csv'
company_industriesUrl = _company_details_dir + 'company_industries.csv'
company_specialitiesUrl = _company_details_dir + 'company_specialities.csv'
specialitiesUrl = _company_details_dir + 'specialities.csv'
employee_countsUrl = _company_details_dir + 'employee_counts.csv'
# Same value as specialitiesUrl.
c = _company_details_dir + 'specialities.csv'

# job details
_job_details_dir = path + '/GraphDB/data/job_details/'
benefitsUrl = _job_details_dir + 'benefits.csv'
job_industriesUrl = _job_details_dir + 'job_industries.csv'
job_skillsUrl = _job_details_dir + 'job_skills.csv'
salariesUrl = _job_details_dir + 'salaries.csv'

# lookup tables
_maps_dir = path + '/GraphDB/data/maps/'
industriesUrl = _maps_dir + 'industries.csv'
skillsUrl = _maps_dir + 'skills.csv'

# country codes
countriesURL = path + '/GraphDB/data/countries/all.csv'

# saving folder
savePath = path + '/data/rdf/linkedinDB/'

In [3]:
# Namespaces for the country codes, the LinkedIn Job Posting ontology and SKOS
CNS = Namespace("http://eulersharp.sourceforge.net/2003/03swap/countries#")
LNJP = Namespace("http://www.dei.unipd.it/database2/LinkedinJobPosting#")
# This must be the SKOS core namespace, ending in '#': term names are appended
# directly to it, so SKOS.Concept, SKOS.broader, SKOS.narrower and SKOS.related
# only expand to the standard SKOS IRIs with this exact value. The URL of an
# RDF document (e.g. '.../skos-owl1-dl.rdf') would give '...rdfConcept'.
SKOS = Namespace("http://www.w3.org/2004/02/skos/core#")

## Job Posting

In [4]:
# Read the job postings table; each row is keyed by its 'job_id'.
jobPosting = pd.read_csv(jobPostingUrl, index_col='job_id', sep=',')

In [5]:
# Empty graph that collects the job posting triples
g = Graph()

# Register a prefix for each namespace so the Turtle output is more readable
for prefix, namespace in (
    ("foaf", FOAF),
    ("xsd", XSD),
    ("countries", CNS),
    ("lnjp", LNJP),
    ("skos", SKOS),
):
    g.bind(prefix, namespace)

In [6]:
def _has_value(value):
    """Return True when a CSV cell is neither missing (NaN/None) nor the empty string."""
    return not pd.isnull(value) and value != ""


# Columns written for every posting without a missing-value check:
# (column / property name, XSD datatype).
always_present = (
    ('title', XSD.string),
    ('description', XSD.string),
    ('location', XSD.string),
    ('original_listed_time', XSD.date),
    ('job_posting_url', XSD.string),
    ('application_type', XSD.string),
    ('expiry', XSD.date),
    ('listed_time', XSD.date),
    ('sponsored', XSD.integer),
    ('work_type', XSD.string),
)

# Columns written only when the cell has a value:
# (column / property name, XSD datatype, optional converter).
when_present = (
    ('max_salary', XSD.decimal, None),
    ('med_salary', XSD.decimal, None),
    ('min_salary', XSD.decimal, None),
    ('pay_period', XSD.string, None),
    # int(): pandas reads an integer column with gaps as float, and "1.0" is
    # not a valid xsd:integer lexical form (same treatment as 'views').
    ('remote_allowed', XSD.integer, int),
    ('views', XSD.integer, int),
    ('application_url', XSD.string, None),
    ('closed_time', XSD.date, None),
    ('formatted_experience_level', XSD.string, None),
    ('skills_desc', XSD.string, None),
    ('posting_domain', XSD.string, None),
    ('currency', XSD.string, None),
    ('compensation_type', XSD.string, None),
)

for index, row in jobPosting.iterrows():
    # Create the JobPosting node with a URI
    jobPostingId = "job_" + str(index)
    JobPosting = URIRef(LNJP[jobPostingId])
    g.add((JobPosting, RDF.type, LNJP.JobPosting))

    # Add data properties
    g.add((JobPosting, LNJP.job_id, Literal(index, datatype=XSD.integer)))

    for column, datatype in always_present:
        g.add((JobPosting, LNJP[column], Literal(row[column], datatype=datatype)))

    for column, datatype, convert in when_present:
        value = row[column]
        if _has_value(value):
            if convert is not None:
                value = convert(value)
            g.add((JobPosting, LNJP[column], Literal(value, datatype=datatype)))

    if pd.notnull(row['applies']) and str(row['applies']).strip():
        g.add((JobPosting, LNJP.applies, Literal(int(row['applies']), datatype=XSD.integer)))

    # Add object properties
    if pd.notnull(row['company_id']) and str(row['company_id']).strip():  # Check for NaN and empty string
        Company = URIRef(LNJP["company_" + str(int(row['company_id']))])
        g.add((JobPosting, LNJP['hasCompany'], Company))

# Serialize the RDF graph to Turtle format
turtle_file_path = 'job_postings.ttl'

# Ask before overwriting an existing file. Declining only skips the save:
# calling exit() here would end the whole session (in Jupyter it shuts the
# kernel down) and discard everything loaded so far.
should_save = True
if os.path.exists(turtle_file_path):
    user_input = input(f"The file '{turtle_file_path}' already exists. Do you want to rewrite it? (y/n): ").lower()
    should_save = user_input == 'y'

if should_save:
    print("--- saving serialization ---")
    g.serialize(destination=turtle_file_path, format='turtle')

    print(f"RDF data exported to {turtle_file_path}")
    print("Serialization saved successfully.")
else:
    print("Serialization not saved.")


--- saving serialization ---
RDF data exported to job_postings.ttl
Serialization saved successfully.


## Skills

In [7]:

# Skills lookup table, keyed by skill abbreviation. Default NA parsing is off
# and only the literal '_' is read as missing, so empty cells stay empty strings.
skills = pd.read_csv(
    skillsUrl,
    sep=',',
    index_col='skill_abr',
    keep_default_na=False,
    na_values=['_'],
)
assert not skills.empty, "Skills DataFrame is empty. Check the CSV file or URL."

print("Skills DataFrame read successfully.")

# New graph for this section, with the prefixes used in the Turtle output.
g = Graph()
for prefix, namespace in (
    ("foaf", FOAF),
    ("xsd", XSD),
    ("countries", CNS),
    ("lnjp", LNJP),
    ("skos", SKOS),
):
    g.bind(prefix, namespace)

Skills DataFrame read successfully.


In [8]:

# Columns that link a skill to another skill, with the SKOS property to use.
concept_links = (
    ('broader_concept', SKOS.broader),
    ('narrower_concept', SKOS.narrower),
    ('related_concept', SKOS.related),
)

for skill_abr, row in skills.iterrows():
    # One node per skill, typed both as a SKOS concept and as an LNJP skill
    skillId = "skill_" + str(skill_abr)
    Skill = URIRef(LNJP[skillId])
    g.add((Skill, RDF.type, SKOS.Concept))
    g.add((Skill, RDF.type, LNJP.Skill))

    # Add data properties
    g.add((Skill, LNJP.skill_abr, Literal(skill_abr, datatype=XSD.string)))
    g.add((Skill, LNJP.skill_name, Literal(row['skill_name'], datatype=XSD.string)))

    # Add object properties
    for column, predicate in concept_links:
        target = row[column]
        # NaN is truthy, so it has to be excluded explicitly; otherwise a
        # missing value would link the skill to a 'skill_nan' node.
        if pd.notnull(target) and target:
            g.add((Skill, predicate, URIRef(LNJP["skill_" + str(target)])))

turtle_file_path = 'skills.ttl'

# Ask before overwriting an existing file. Declining only skips the save:
# calling exit() here would end the whole session (in Jupyter it shuts the
# kernel down) and discard everything loaded so far.
should_save = True
if os.path.exists(turtle_file_path):
    user_input = input(f"The file '{turtle_file_path}' already exists. Do you want to rewrite it? (y/n): ").lower()
    should_save = user_input == 'y'

if should_save:
    print("--- saving serialization ---")
    g.serialize(destination=turtle_file_path, format='turtle')

    print(f"RDF data exported to {turtle_file_path}")
    print("Serialization saved successfully.")
else:
    print("Serialization not saved.")


--- saving serialization ---
RDF data exported to skills.ttl
Serialization saved successfully.


## JOB_SKILLS JOIN

In [9]:

# Job -> skill link table; a job_id appears once per skill it requires.
job_skills = pd.read_csv(job_skillsUrl, sep=',', index_col='job_id')
# The messages name the table that was actually loaded.
assert not job_skills.empty, "Job skills DataFrame is empty. Check the CSV file or URL."

print("Job skills DataFrame read successfully.")

# New graph for this section, with the prefixes used in the Turtle output.
g = Graph()
for prefix, namespace in (
    ("foaf", FOAF),
    ("xsd", XSD),
    ("countries", CNS),
    ("lnjp", LNJP),
    ("skos", SKOS),
):
    g.bind(prefix, namespace)

Skills DataFrame read successfully.


In [10]:

# Link every job posting to each of its skills.
for index, row in job_skills.iterrows():
    jobPostingId = "job_" + str(index)
    JobPosting = URIRef(LNJP[jobPostingId])
    skillId = "skill_" + str(row['skill_abr'])
    Skill = URIRef(LNJP[skillId])
    g.add((JobPosting, LNJP['hasSkill'], Skill))

turtle_file_path = 'job_skills_join.ttl'

# Ask before overwriting an existing file. Declining only skips the save:
# calling exit() here would end the whole session (in Jupyter it shuts the
# kernel down) and discard everything loaded so far.
should_save = True
if os.path.exists(turtle_file_path):
    user_input = input(f"The file '{turtle_file_path}' already exists. Do you want to rewrite it? (y/n): ").lower()
    should_save = user_input == 'y'

if should_save:
    print("--- saving serialization ---")
    g.serialize(destination=turtle_file_path, format='turtle')

    print(f"RDF data exported to {turtle_file_path}")
    print("Serialization saved successfully.")
else:
    print("Serialization not saved.")

--- saving serialization ---
RDF data exported to job_skills_join.ttl
Serialization saved successfully.


## Industry

In [11]:

# Industry lookup table, keyed by industry_id. Default NA parsing is off and
# only the literal '_' is read as missing, so empty cells stay empty strings.
industries = pd.read_csv(
    industriesUrl,
    sep=',',
    index_col='industry_id',
    keep_default_na=False,
    na_values=['_'],
)
assert not industries.empty, "Industries DataFrame is empty. Check the CSV file or URL."

print("Industries DataFrame read successfully.")

# New graph for this section, with the prefixes used in the Turtle output.
g = Graph()
for prefix, namespace in (
    ("foaf", FOAF),
    ("xsd", XSD),
    ("countries", CNS),
    ("lnjp", LNJP),
    ("skos", SKOS),
):
    g.bind(prefix, namespace)

Industries DataFrame read successfully.


In [12]:

for index, row in industries.iterrows():
    # One node per industry
    industryId = "industry_" + str(index)
    Industry = URIRef(LNJP[industryId])
    g.add((Industry, RDF.type, LNJP.Industry))

    # Add data properties
    g.add((Industry, LNJP['industry_id'], Literal(index, datatype=XSD.integer)))
    # The name is optional: it is skipped when missing or empty
    if not pd.isnull(row['industry_name']) and row['industry_name'] != "":
        g.add((Industry, LNJP['industry_name'], Literal(row['industry_name'], datatype=XSD.string)))

turtle_file_path = 'industries.ttl'

# Ask before overwriting an existing file. Declining only skips the save:
# calling exit() here would end the whole session (in Jupyter it shuts the
# kernel down) and discard everything loaded so far.
should_save = True
if os.path.exists(turtle_file_path):
    user_input = input(f"The file '{turtle_file_path}' already exists. Do you want to rewrite it? (y/n): ").lower()
    should_save = user_input == 'y'

if should_save:
    print("--- saving serialization ---")
    g.serialize(destination=turtle_file_path, format='turtle')

    print(f"RDF data exported to {turtle_file_path}")
    print("Serialization saved successfully.")
else:
    print("Serialization not saved.")


--- saving serialization ---
RDF data exported to industries.ttl
Serialization saved successfully.


## Job_Industries join

In [13]:
# Job -> industry link table; a job_id appears once per industry it belongs to.
job_industries = pd.read_csv(job_industriesUrl, sep=',', index_col='job_id')
# The messages name the table that was actually loaded.
assert not job_industries.empty, "Job industries DataFrame is empty. Check the CSV file or URL."

print("Job industries DataFrame read successfully.")

# New graph for this section, with the prefixes used in the Turtle output.
g = Graph()
for prefix, namespace in (
    ("foaf", FOAF),
    ("xsd", XSD),
    ("countries", CNS),
    ("lnjp", LNJP),
    ("skos", SKOS),
):
    g.bind(prefix, namespace)

Skills DataFrame read successfully.


In [14]:

# Link every job posting to each of its industries.
for index, row in job_industries.iterrows():
    jobPosting_id = "job_" + str(index)
    JobPosting = URIRef(LNJP[jobPosting_id])
    Industry = URIRef(LNJP["industry_" + str(row['industry_id'])])
    g.add((JobPosting, LNJP['hasIndustryType'], Industry))

turtle_file_path = 'job_industries_join.ttl'

# Ask before overwriting an existing file. Declining only skips the save:
# calling exit() here would end the whole session (in Jupyter it shuts the
# kernel down) and discard everything loaded so far.
should_save = True
if os.path.exists(turtle_file_path):
    user_input = input(f"The file '{turtle_file_path}' already exists. Do you want to rewrite it? (y/n): ").lower()
    should_save = user_input == 'y'

if should_save:
    print("--- saving serialization ---")
    g.serialize(destination=turtle_file_path, format='turtle')

    print(f"RDF data exported to {turtle_file_path}")
    print("Serialization saved successfully.")
else:
    print("Serialization not saved.")

--- saving serialization ---
RDF data exported to job_industries_join.ttl
Serialization saved successfully.


## Company

In [15]:

# Company table, keyed by company_id. Default NA parsing is off and only the
# literal '_' is read as missing, so empty cells stay empty strings.
companies = pd.read_csv(
    companiesUrl,
    sep=',',
    index_col='company_id',
    keep_default_na=False,
    na_values=['_'],
)
assert not companies.empty, "Companies DataFrame is empty. Check the CSV file or URL."

print("Companies DataFrame read successfully.")

# New graph for this section, with the prefixes used in the Turtle output.
g = Graph()
for prefix, namespace in (
    ("foaf", FOAF),
    ("xsd", XSD),
    ("countries", CNS),
    ("lnjp", LNJP),
    ("skos", SKOS),
):
    g.bind(prefix, namespace)

Companies DataFrame read successfully.


In [16]:
def _add_company_size_class(graph, size_class, size_property, min_size, max_size):
    """Declare `size_class` as the companies whose size lies in [min_size, max_size].

    Adds to `graph` the triples stating that `size_class` is an owl:Class, a
    subclass of LNJP.Company, and equivalent to the intersection of
    LNJP.Company with a restriction on `size_property`: some value from
    xsd:integer limited by xsd:minInclusive `min_size` and
    xsd:maxInclusive `max_size`.

    Parameters
    ----------
    graph : Graph
        Graph that receives the triples.
    size_class : URIRef
        Class being defined.
    size_property : URIRef
        Data property holding the company size.
    min_size, max_size : int
        Inclusive bounds, written as xsd:integer literals.
    """
    equivalent = BNode()      # anonymous class the named class is equivalent to
    members = BNode()         # intersection list, 1st cell: LNJP.Company
    members_tail = BNode()    # intersection list, 2nd cell: the restriction
    restriction = BNode()
    datatype = BNode()        # xsd:integer narrowed by the two facets
    facets = BNode()          # facet list, 1st cell
    min_facet = BNode()
    facets_tail = BNode()     # facet list, 2nd cell
    max_facet = BNode()

    triples = [
        (size_class, RDF.type, OWL.Class),
        (size_class, RDFS.subClassOf, LNJP.Company),
        (size_class, OWL.equivalentClass, equivalent),

        (equivalent, OWL.intersectionOf, members),

        (members, RDF.type, RDF.List),
        (members, RDF.first, LNJP.Company),
        (members, RDF.rest, members_tail),

        (members_tail, RDF.type, RDF.List),
        (members_tail, RDF.first, restriction),
        (members_tail, RDF.rest, RDF.nil),

        (restriction, RDF.type, OWL.Restriction),
        (restriction, OWL.onProperty, size_property),
        (restriction, OWL.someValuesFrom, datatype),

        (datatype, RDF.type, RDFS.Datatype),
        (datatype, OWL.onDatatype, XSD.integer),
        (datatype, OWL.withRestrictions, facets),

        # NOTE(review): the two facet nodes are typed rdf:List as well, as in
        # the hand-written triples this helper replaces; confirm it is needed.
        (facets, RDF.type, RDF.List),
        (min_facet, RDF.type, RDF.List),
        (facets_tail, RDF.type, RDF.List),
        (max_facet, RDF.type, RDF.List),

        (facets, RDF.first, min_facet),
        (facets, RDF.rest, facets_tail),

        (facets_tail, RDF.first, max_facet),
        (facets_tail, RDF.rest, RDF.nil),

        (min_facet, XSD.minInclusive, Literal(str(min_size), datatype=XSD.integer)),
        (max_facet, XSD.maxInclusive, Literal(str(max_size), datatype=XSD.integer)),
    ]
    for triple in triples:
        graph.add(triple)


company_size_property = LNJP['company_size']

largeCompany = LNJP.largeCompany
mediumCompany = LNJP.mediumCompany
smallCompany = LNJP.smallCompany

# company_size bands: large 6-7, medium 3-5, small 1-2 (bounds inclusive)
_add_company_size_class(g, largeCompany, company_size_property, 6, 7)
_add_company_size_class(g, mediumCompany, company_size_property, 3, 5)
_add_company_size_class(g, smallCompany, company_size_property, 1, 2)

# Inclusive company_size bands and the class a company in each band belongs to.
size_bands = (
    (Literal("6", datatype=XSD.integer), Literal("7", datatype=XSD.integer), LNJP.largeCompany),
    (Literal("3", datatype=XSD.integer), Literal("5", datatype=XSD.integer), LNJP.mediumCompany),
    (Literal("1", datatype=XSD.integer), Literal("2", datatype=XSD.integer), LNJP.smallCompany),
)

# String columns that are written only when the cell is neither missing nor empty.
optional_text_columns = ('description', 'state', 'zip_code', 'address')

for index, row in companies.iterrows():
    # One node per company
    companyId = "company_" + str(index)
    Company = URIRef(LNJP[companyId])
    g.add((Company, RDF.type, LNJP.Company))

    # Add data properties
    g.add((Company, LNJP['company_id'], Literal(int(index), datatype=XSD.integer)))
    g.add((Company, LNJP['name'], Literal(row['name'], datatype=XSD.string)))
    g.add((Company, LNJP['city'], Literal(row['city'], datatype=XSD.string)))
    g.add((Company, LNJP['url'], Literal(row['url'], datatype=XSD.string)))

    for column in optional_text_columns:
        if not pd.isnull(row[column]) and row[column] != "":
            g.add((Company, LNJP[column], Literal(row[column], datatype=XSD.string)))

    if not pd.isnull(row['company_size']) and row['company_size'] != "":
        # Build the literal once and reuse it for the triple and the band checks
        company_size = Literal(row['company_size'], datatype=XSD.integer)
        g.add((Company, LNJP['company_size'], company_size))
        for lower, upper, size_class in size_bands:
            if (company_size >= lower) and (company_size <= upper):
                g.add((Company, RDF.type, size_class))

    # Add object properties
    # Skip companies without a country code: an empty code would otherwise
    # link the company to the bare namespace IRI instead of a country.
    if not pd.isnull(row['country']) and row['country'] != "":
        Country = URIRef(CNS[row['country']])
        g.add((Company, LNJP['hasCountry'], Country))

turtle_file_path = 'companies.ttl'

# Ask before overwriting an existing file. Declining only skips the save:
# calling exit() here would end the whole session (in Jupyter it shuts the
# kernel down) and discard everything loaded so far.
should_save = True
if os.path.exists(turtle_file_path):
    user_input = input(f"The file '{turtle_file_path}' already exists. Do you want to rewrite it? (y/n): ").lower()
    should_save = user_input == 'y'

if should_save:
    print("--- saving serialization ---")
    g.serialize(destination=turtle_file_path, format='turtle')

    print(f"RDF data exported to {turtle_file_path}")
    print("Serialization saved successfully.")
else:
    print("Serialization not saved.")

--- saving serialization ---
RDF data exported to companies.ttl
Serialization saved successfully.


## Company_Industries join

In [17]:

# Company -> industry link table, keyed by company_id. Default NA parsing is
# off and only the literal '_' is read as missing.
company_industries = pd.read_csv(company_industriesUrl, sep=',', index_col='company_id', keep_default_na=False, na_values=['_'])
# The messages name the table that was actually loaded.
assert not company_industries.empty, "Company industries DataFrame is empty. Check the CSV file or URL."

print("Company industries DataFrame read successfully.")

# New graph for this section, with the prefixes used in the Turtle output.
g = Graph()
for prefix, namespace in (
    ("foaf", FOAF),
    ("xsd", XSD),
    ("countries", CNS),
    ("lnjp", LNJP),
    ("skos", SKOS),
):
    g.bind(prefix, namespace)

Skills DataFrame read successfully.


In [18]:

# Link every company to each of its industries.
for index, row in company_industries.iterrows():
    company_id = "company_" + str(index)
    Company = URIRef(LNJP[company_id])
    Industry = URIRef(LNJP["industry_" + str(row['industry_id'])])
    g.add((Company, LNJP['hasIndustry'], Industry))

turtle_file_path = 'company_industries_join.ttl'

# Ask before overwriting an existing file. Declining only skips the save:
# calling exit() here would end the whole session (in Jupyter it shuts the
# kernel down) and discard everything loaded so far.
should_save = True
if os.path.exists(turtle_file_path):
    user_input = input(f"The file '{turtle_file_path}' already exists. Do you want to rewrite it? (y/n): ").lower()
    should_save = user_input == 'y'

if should_save:
    print("--- saving serialization ---")
    g.serialize(destination=turtle_file_path, format='turtle')

    print(f"RDF data exported to {turtle_file_path}")
    print("Serialization saved successfully.")
else:
    print("Serialization not saved.")

--- saving serialization ---
RDF data exported to company_industries_join.ttl
Serialization saved successfully.


## Person Counts

In [19]:

# Employee/follower count records, keyed by 'company_record_id'.
employee_counts = pd.read_csv(employee_countsUrl, index_col='company_record_id', sep=',')

# New graph for this section, with the prefixes used in the Turtle output.
g = Graph()
for prefix, namespace in (
    ("foaf", FOAF),
    ("xsd", XSD),
    ("countries", CNS),
    ("lnjp", LNJP),
    ("skos", SKOS),
):
    g.bind(prefix, namespace)

In [20]:
def _add_follower_class(graph, follower_class, follower_property, facet, bound):
    """Declare `follower_class` as the records whose follower count satisfies one facet.

    Adds to `graph` the triples stating that `follower_class` is an owl:Class,
    a subclass of LNJP.Record, and equivalent to the intersection of
    LNJP.Record with a restriction on `follower_property`: some value from
    xsd:integer limited by the single facet `facet` with value `bound`.
    The facet list holds exactly that one facet; no empty list element is added.

    Parameters
    ----------
    graph : Graph
        Graph that receives the triples.
    follower_class : URIRef
        Class being defined.
    follower_property : URIRef
        Data property holding the follower count.
    facet : URIRef
        Constraining facet, e.g. XSD.minInclusive or XSD.maxExclusive.
    bound : int
        Facet value, written as an xsd:integer literal.
    """
    equivalent = BNode()      # anonymous class the named class is equivalent to
    members = BNode()         # intersection list, 1st cell: LNJP.Record
    members_tail = BNode()    # intersection list, 2nd cell: the restriction
    restriction = BNode()
    datatype = BNode()        # xsd:integer narrowed by the facet
    facets = BNode()          # facet list, single cell
    facet_node = BNode()

    triples = [
        (follower_class, RDF.type, OWL.Class),
        (follower_class, RDFS.subClassOf, LNJP.Record),
        (follower_class, OWL.equivalentClass, equivalent),

        (equivalent, OWL.intersectionOf, members),

        (members, RDF.type, RDF.List),
        (members, RDF.first, LNJP.Record),
        (members, RDF.rest, members_tail),

        (members_tail, RDF.type, RDF.List),
        (members_tail, RDF.first, restriction),
        (members_tail, RDF.rest, RDF.nil),

        (restriction, RDF.type, OWL.Restriction),
        (restriction, OWL.onProperty, follower_property),
        (restriction, OWL.someValuesFrom, datatype),

        (datatype, RDF.type, RDFS.Datatype),
        (datatype, OWL.onDatatype, XSD.integer),
        (datatype, OWL.withRestrictions, facets),

        # NOTE(review): the facet node is typed rdf:List as well, as in the
        # hand-written triples this helper replaces; confirm it is needed.
        (facets, RDF.type, RDF.List),
        (facet_node, RDF.type, RDF.List),

        (facets, RDF.first, facet_node),
        (facets, RDF.rest, RDF.nil),

        (facet_node, facet, Literal(str(bound), datatype=XSD.integer)),
    ]
    for triple in triples:
        graph.add(triple)


follower_count_property = LNJP['follower_count']

highFollowers = LNJP.highFollowers
lowFollowers = LNJP.lowFollowers

# Records with at least 201083 followers are "high", all others are "low".
_add_follower_class(g, highFollowers, follower_count_property, XSD.minInclusive, 201083)
_add_follower_class(g, lowFollowers, follower_count_property, XSD.maxExclusive, 201083)

# Threshold separating lnjp:lowFollowers (< threshold) from
# lnjp:highFollowers (>= threshold); same value as the facet literals used in
# the two class definitions.
follower_threshold = 201083

# Emit one lnjp:Record individual per row of employee_counts.
for index, row in employee_counts.iterrows():
    company_record_id = "company_record_" + str(index)
    Record = URIRef(LNJP[company_record_id])
    g.add((Record, RDF.type, LNJP.Record))

    # Cast once to plain ints so the literals and the threshold check below
    # all work on the same value (the raw cell may not be a Python int).
    employee_count = int(row['employee_count'])
    follower_count = int(row['follower_count'])

    # Add data properties
    g.add((Record, LNJP['company_record_id'], Literal(index, datatype=XSD.integer)))
    g.add((Record, LNJP['employee_count'], Literal(employee_count, datatype=XSD.integer)))
    g.add((Record, LNJP['follower_count'], Literal(follower_count, datatype=XSD.integer)))
    # NOTE(review): the value is tagged xsd:date as-is; confirm that
    # time_recorded is already an ISO date and not e.g. a Unix timestamp.
    g.add((Record, LNJP['time_recorded'], Literal(row['time_recorded'], datatype=XSD.date)))

    # Materialise the follower class directly: the two classes are
    # complementary, so a record gets exactly one of them.
    if follower_count >= follower_threshold:
        g.add((Record, RDF.type, LNJP.highFollowers))
    else:
        g.add((Record, RDF.type, LNJP.lowFollowers))

    # Add object properties
    Company = URIRef(LNJP["company_" + str(int(row['company_id']))])
    g.add((Record, LNJP['IsForCompany'], Company))

turtle_file_path = 'employee_counts.ttl'

# Ask before overwriting an existing export.
# The write is guarded explicitly instead of calling exit(): inside a Jupyter
# kernel exit() requests a kernel shutdown rather than stopping the cell at
# that line, so the serialize call after it could still run and overwrite the
# file, and all notebook state would then be lost.
should_save = True
if os.path.exists(turtle_file_path):
    user_input = input(f"The file '{turtle_file_path}' already exists. Do you want to rewrite it? (y/n): ").strip().lower()
    should_save = (user_input == 'y')

if should_save:
    print("--- saving serialization ---")
    g.serialize(destination=turtle_file_path, format='turtle')

    print(f"RDF data exported to {turtle_file_path}")
    print("Serialization saved successfully.")
else:
    print("Serialization not saved.")

--- saving serialization ---
RDF data exported to employee_counts.ttl
Serialization saved successfully.


## Benefits

In [21]:
# Read the benefits table, using benefit_id as the row index.
benefits = pd.read_csv(benefitsUrl, index_col='benefit_id', sep=',')

# Start a fresh graph for this section and register the prefixes used in
# the Turtle output.
g = Graph()
for prefix, namespace in (
    ("foaf", FOAF),
    ("xsd", XSD),
    ("countries", CNS),
    ("lnjp", LNJP),
    ("skos", SKOS),
):
    g.bind(prefix, namespace)

In [22]:
# Emit one lnjp:Benefit individual per row and link it to its job posting.
for benefit_key, benefit_row in benefits.iterrows():
    benefit_node = URIRef(LNJP[f"benefit_{benefit_key}"])
    job_node = URIRef(LNJP[f"job_{benefit_row['job_id']}"])

    benefit_triples = (
        (benefit_node, RDF.type, LNJP.Benefit),
        # data properties
        (benefit_node, LNJP['benefit_id'], Literal(benefit_key, datatype=XSD.integer)),
        (benefit_node, LNJP['inferred'], Literal(benefit_row['inferred'], datatype=XSD.boolean)),
        (benefit_node, LNJP['type'], Literal(benefit_row['type'], datatype=XSD.string)),
        # object property
        (benefit_node, LNJP['isForJobPosting'], job_node),
    )
    for triple in benefit_triples:
        g.add(triple)
    
turtle_file_path = 'benefits.ttl'

# Ask before overwriting an existing export.
# The write is guarded explicitly instead of calling exit(): inside a Jupyter
# kernel exit() requests a kernel shutdown rather than stopping the cell at
# that line, so the serialize call after it could still run and overwrite the
# file, and all notebook state would then be lost.
should_save = True
if os.path.exists(turtle_file_path):
    user_input = input(f"The file '{turtle_file_path}' already exists. Do you want to rewrite it? (y/n): ").strip().lower()
    should_save = (user_input == 'y')

if should_save:
    print("--- saving serialization ---")
    g.serialize(destination=turtle_file_path, format='turtle')

    print(f"RDF data exported to {turtle_file_path}")
    print("Serialization saved successfully.")
else:
    print("Serialization not saved.")



--- saving serialization ---
RDF data exported to benefits.ttl
Serialization saved successfully.


## Specialities

In [23]:
# Read the specialities table, using speciality_id as the row index.
# Default NA detection is switched off; only a literal "_" counts as missing.
specialities = pd.read_csv(
    specialitiesUrl,
    sep=',',
    index_col='speciality_id',
    na_values=['_'],
    keep_default_na=False,
)

# Start a fresh graph for this section and register the prefixes used in
# the Turtle output.
g = Graph()
for prefix, namespace in (
    ("foaf", FOAF),
    ("xsd", XSD),
    ("countries", CNS),
    ("lnjp", LNJP),
    ("skos", SKOS),
):
    g.bind(prefix, namespace)

In [24]:
# Emit one lnjp:Speciality individual per row with its id and label.
for speciality_key, speciality_row in specialities.iterrows():
    speciality_node = URIRef(LNJP[f"speciality_{speciality_key}"])

    speciality_triples = (
        (speciality_node, RDF.type, LNJP.Speciality),
        (speciality_node, LNJP['speciality_id'], Literal(speciality_key, datatype=XSD.integer)),
        (speciality_node, LNJP['speciality'], Literal(speciality_row['speciality'], datatype=XSD.string)),
    )
    for triple in speciality_triples:
        g.add(triple)
turtle_file_path = 'speciality.ttl'

# Ask before overwriting an existing export.
# The write is guarded explicitly instead of calling exit(): inside a Jupyter
# kernel exit() requests a kernel shutdown rather than stopping the cell at
# that line, so the serialize call after it could still run and overwrite the
# file, and all notebook state would then be lost.
should_save = True
if os.path.exists(turtle_file_path):
    user_input = input(f"The file '{turtle_file_path}' already exists. Do you want to rewrite it? (y/n): ").strip().lower()
    should_save = (user_input == 'y')

if should_save:
    print("--- saving serialization ---")
    g.serialize(destination=turtle_file_path, format='turtle')

    print(f"RDF data exported to {turtle_file_path}")
    print("Serialization saved successfully.")
else:
    print("Serialization not saved.")


--- saving serialization ---
RDF data exported to speciality.ttl
Serialization saved successfully.


## Company_Specialities Join

In [25]:
# Read the company/speciality join table, using company_id as the row index.
# Default NA detection is switched off; only a literal "_" counts as missing.
company_specialities = pd.read_csv(
    company_specialitiesUrl,
    sep=',',
    index_col='company_id',
    na_values=['_'],
    keep_default_na=False,
)

# Start a fresh graph for this section and register the prefixes used in
# the Turtle output.
g = Graph()
for prefix, namespace in (
    ("foaf", FOAF),
    ("xsd", XSD),
    ("countries", CNS),
    ("lnjp", LNJP),
    ("skos", SKOS),
):
    g.bind(prefix, namespace)

In [26]:

# Link each company to a speciality: one lnjp:hasSpeciality triple per row.
for company_key, link_row in company_specialities.iterrows():
    company_node = URIRef(LNJP[f"company_{company_key}"])
    speciality_node = URIRef(LNJP[f"speciality_{link_row['speciality_id']}"])
    g.add((company_node, LNJP['hasSpeciality'], speciality_node))
    
turtle_file_path = 'company_specialities_join.ttl'

# Ask before overwriting an existing export.
# The write is guarded explicitly instead of calling exit(): inside a Jupyter
# kernel exit() requests a kernel shutdown rather than stopping the cell at
# that line, so the serialize call after it could still run and overwrite the
# file, and all notebook state would then be lost.
should_save = True
if os.path.exists(turtle_file_path):
    user_input = input(f"The file '{turtle_file_path}' already exists. Do you want to rewrite it? (y/n): ").strip().lower()
    should_save = (user_input == 'y')

if should_save:
    print("--- saving serialization ---")
    g.serialize(destination=turtle_file_path, format='turtle')

    print(f"RDF data exported to {turtle_file_path}")
    print("Serialization saved successfully.")
else:
    print("Serialization not saved.")

--- saving serialization ---
RDF data exported to company_specialities_join.ttl
Serialization saved successfully.


## Salaries

In [27]:
# Read the salaries table, using salary_id as the row index.
salaries = pd.read_csv(salariesUrl, index_col='salary_id', sep=',')

# Start a fresh graph for this section and register the prefixes used in
# the Turtle output.
g = Graph()
for prefix, namespace in (
    ("foaf", FOAF),
    ("xsd", XSD),
    ("countries", CNS),
    ("lnjp", LNJP),
    ("skos", SKOS),
):
    g.bind(prefix, namespace)

In [28]:
# Emit one lnjp:Salary individual per row and link it to its job posting.
for salary_key, salary_row in salaries.iterrows():
    salary_node = URIRef(LNJP[f"salary_{salary_key}"])
    g.add((salary_node, RDF.type, LNJP.Salary))

    # Data properties
    g.add((salary_node, LNJP['salary_id'], Literal(salary_key, datatype=XSD.integer)))

    # The three amounts are optional: skip any that is null or an empty string.
    for amount_field in ('max_salary', 'med_salary', 'min_salary'):
        amount = salary_row[amount_field]
        if pd.isnull(amount) or amount == "":
            continue
        g.add((salary_node, LNJP[amount_field], Literal(amount, datatype=XSD.decimal)))

    # Descriptive fields are always written, typed as xsd:string.
    for text_field in ('pay_period', 'currency', 'compensation_type'):
        g.add((salary_node, LNJP[text_field], Literal(salary_row[text_field], datatype=XSD.string)))

    # Object property: the job posting this salary is allocated for.
    job_node = URIRef(LNJP[f"job_{salary_row['job_id']}"])
    g.add((salary_node, LNJP['isAllocatedFor'], job_node))
turtle_file_path = 'salaries.ttl'

# Ask before overwriting an existing export.
# The write is guarded explicitly instead of calling exit(): inside a Jupyter
# kernel exit() requests a kernel shutdown rather than stopping the cell at
# that line, so the serialize call after it could still run and overwrite the
# file, and all notebook state would then be lost.
should_save = True
if os.path.exists(turtle_file_path):
    user_input = input(f"The file '{turtle_file_path}' already exists. Do you want to rewrite it? (y/n): ").strip().lower()
    should_save = (user_input == 'y')

if should_save:
    print("--- saving serialization ---")
    g.serialize(destination=turtle_file_path, format='turtle')

    print(f"RDF data exported to {turtle_file_path}")
    print("Serialization saved successfully.")
else:
    print("Serialization not saved.")

--- saving serialization ---
RDF data exported to salaries.ttl
Serialization saved successfully.
