In [23]:
import kagglehub

# Fetch the most recent version of the LinkedIn job postings dataset via
# kagglehub; `path` holds the value returned by the download call.
path = kagglehub.dataset_download("arshkon/linkedin-job-postings")

print(f"Path to dataset files: {path}")

Path to dataset files: /Users/joeportnoy/.cache/kagglehub/datasets/arshkon/linkedin-job-postings/versions/13


In [2]:
import pandas as pd

# Load the job postings table from a CSV file given by a relative path.
# NOTE(review): this reads a fixed relative path rather than the `path`
# returned by kagglehub.dataset_download in this notebook — confirm that
# ../resources/postings.csv is the intended copy of the dataset.
df = pd.read_csv('../resources/postings.csv')

# Print the column index so the available fields are visible before filtering
print(df.columns)

Index(['job_id', 'company_name', 'title', 'description', 'max_salary',
       'pay_period', 'location', 'company_id', 'views', 'med_salary',
       'min_salary', 'formatted_work_type', 'applies', 'original_listed_time',
       'remote_allowed', 'job_posting_url', 'application_url',
       'application_type', 'expiry', 'closed_time',
       'formatted_experience_level', 'skills_desc', 'listed_time',
       'posting_domain', 'sponsored', 'work_type', 'currency',
       'compensation_type', 'normalized_salary', 'zip_code', 'fips'],
      dtype='object')


In [3]:
import re

# Target job titles; each entry is matched as a literal phrase
job_titles = [
    "data analyst",
    "business intelligence analyst",
    "marketing analyst",
    "junior data scientist",
    "data analytics intern",
]

# Build one alternation regex: every title is escaped so it is treated as
# plain text, then the escaped titles are joined with "|" (regex OR).
pattern = "|".join(map(re.escape, job_titles))

# Select the postings whose title contains any of the target phrases.
# The match is case-insensitive, and rows with a missing title are treated
# as non-matches (na=False). The index is then renumbered from 0, discarding
# the original row labels.
filtered_df = (
    df.loc[df["title"].str.contains(pattern, case=False, na=False)]
    .reset_index(drop=True)
)

filtered_df

Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,normalized_salary,zip_code,fips
0,3813645405,Tenazx Inc,Data Analyst,Job Title: Data AnalystDuration: ContractLocat...,,,"Queens, NY",82537206.0,4.0,,...,,1.713465e+12,,0,CONTRACT,,,,11427.0,36081.0
1,3872787865,Radiant Systems Inc,eCommerce Data Analyst,Job Description: JOB SUMMARY:Responsible for c...,,,United States,3342169.0,102.0,,...,,1.713278e+12,,0,CONTRACT,,,,,
2,3884432522,MCubeSoft,Wealth Management Business Data Analyst,Job Title: BUSINESS DATA ANALYST Location: Aus...,,,"Charlotte, NC",74604183.0,8.0,,...,,1.712346e+12,,0,FULL_TIME,,,,28202.0,37119.0
3,3884434978,Outlier,"Senior Data Analyst - AI Training (Remote, Con...",Job Type: Contract\n\nWe don’t sponsor work vi...,,HOURLY,United States,92583550.0,258.0,55.0,...,,1.712350e+12,boards.greenhouse.io,0,CONTRACT,USD,BASE_SALARY,114400.0,,
4,3884435156,Insight Global,Business Intelligence Analyst,"Title: BI AnalystLocation: Fishers, IN Must Ha...",67000.0,YEARLY,"Fishers, IN",11056.0,83.0,,...,,1.712347e+12,,0,FULL_TIME,USD,BASE_SALARY,63500.0,46037.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
472,3906240634,Reliance Matrix,Business Intelligence Analyst,Job Responsibilities And Requirements\n\nThe B...,,,"Philadelphia, PA",92709511.0,3.0,,...,,1.713565e+12,rsli.wd5.myworkdayjobs.com,0,FULL_TIME,,,,19102.0,42101.0
473,3906246968,Cisco Meraki,Data Analyst,Who We Are:\n\nOur employees fuel the magic of...,,,United States,92950.0,4.0,,...,,1.713566e+12,meraki.cisco.com,0,FULL_TIME,,,,,
474,3906249532,orangepeople,Data Analyst,We are looking for a passionate certified data...,89.0,HOURLY,"Orlando, FL",2708565.0,,,...,,1.713565e+12,,0,CONTRACT,USD,BASE_SALARY,179920.0,32801.0,12095.0
475,3906251726,CHRISTUS Health,Data Analyst II - Staff Credentialing,Description\n\nSummary:\n\nThis position is re...,,,"Irving, TX",10769.0,4.0,,...,,1.713567e+12,careers.christushealth.org,0,FULL_TIME,,,,75038.0,48113.0


In [4]:
# Columns retained for the ML model; everything else is left behind
useful_columns = [
    "title",
    "description",
    "skills_desc",
    "work_type",
    "location",
    "company_name",
]

# Take an independent copy so later cleaning steps do not touch filtered_df
df_clean = filtered_df.loc[:, useful_columns].copy()

In [5]:
# Keep only rows that have a description which is present and is not
# blank once surrounding whitespace is stripped.
df_clean = df_clean.loc[
    df_clean["description"].notna()
    & df_clean["description"].str.strip().ne("")
]

In [6]:
# Replace missing values with explicit placeholders.
# skills_desc gets an empty string (not "Unknown") so that a placeholder word
# is not injected into the combined job text built from this column.
df_clean["skills_desc"] = df_clean["skills_desc"].fillna("")

# Categorical-style fields get a readable "Unknown" placeholder.
df_clean["location"] = df_clean["location"].fillna("Unknown")
df_clean["work_type"] = df_clean["work_type"].fillna("Unknown")

# company_name must be filled as well: the text-normalization step casts every
# column with .astype(str), which would otherwise turn a missing company into
# the literal string "nan".
df_clean["company_name"] = df_clean["company_name"].fillna("Unknown")

In [7]:
# Normalize the text columns for NLP: cast to str, trim surrounding
# whitespace, and lower-case. Note that the str cast turns any value that is
# still missing at this point into the literal text "nan".
text_columns = ["title", "description", "skills_desc", "location", "work_type", "company_name"]
df_clean[text_columns] = df_clean[text_columns].apply(
    lambda series: series.astype(str).str.strip().str.lower()
)

In [8]:
# Build the single text field used for vectorizing: title, skills and
# description joined in that order with one space between each part.
df_clean["job_text"] = df_clean["title"].str.cat(
    [df_clean["skills_desc"], df_clean["description"]],
    sep=" ",
)

In [9]:
# De-duplicate on the combined text: the first row for each distinct
# job_text is kept and later repeats are discarded.
df_clean = df_clean.loc[~df_clean.duplicated(subset=["job_text"], keep="first")]

In [10]:
# Write the cleaned frame to cleaned_jd_data.csv (a relative path);
# index=False leaves the row index out of the file.
df_clean.to_csv("cleaned_jd_data.csv", index=False)