In [1]:
from transformers import (
    TokenClassificationPipeline,
    AutoModelForTokenClassification,
    AutoTokenizer,
)
from transformers.pipelines import AggregationStrategy
from transformers import AutoTokenizer

from sentence_transformers import SentenceTransformer

import numpy as np
import polars as pl

# Define keyphrase extraction pipeline
class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    """Token-classification pipeline whose result is a single keyphrase string.

    The constructor receives one value, ``model``, and uses it to load both
    the token-classification model and its tokenizer before handing them to
    ``TokenClassificationPipeline``. ``postprocess`` reduces the parent's
    aggregated output to the distinct ``"word"`` values joined with ``;``.
    """

    def __init__(self, model, *args, **kwargs):
        """Load model and tokenizer from ``model`` and initialise the parent.

        Args:
            model: Value passed to ``from_pretrained`` for both the
                model and the tokenizer.
            *args: Extra positional arguments forwarded to the parent
                constructor.
            **kwargs: Extra keyword arguments forwarded to the parent
                constructor.
        """
        token_classifier = AutoModelForTokenClassification.from_pretrained(model)
        token_tokenizer = AutoTokenizer.from_pretrained(model)
        super().__init__(
            *args,
            model=token_classifier,
            tokenizer=token_tokenizer,
            **kwargs,
        )

    def postprocess(self, all_outputs):
        """Turn raw pipeline outputs into a ``;``-separated keyphrase string.

        Args:
            all_outputs: Outputs handed over by the pipeline; passed through
                unchanged to the parent ``postprocess``.

        Returns:
            str: The whitespace-stripped ``"word"`` entries of the parent's
            result, de-duplicated with ``np.unique`` (which also sorts them)
            and joined with ``';'``.
        """
        entities = super().postprocess(
            all_outputs=all_outputs,
            aggregation_strategy=AggregationStrategy.SIMPLE,
        )
        stripped_words = [entity.get("word").strip() for entity in entities]
        unique_words = list(np.unique(stripped_words))
        return ';'.join(unique_words)

# Embed the given text with the module-level SentenceTransformer (bound to the
# name `tokenizer`) and return the embedding as plain Python numbers.
def encode(text):
    """Return the embedding of ``text`` as a list of Python scalars.

    No truncation or padding is done in this function: it only calls
    ``tokenizer.encode(text)`` and converts the returned array to a list, so
    the values can be stored in a polars ``List`` column.

    Args:
        text: The text to embed; forwarded unchanged to ``tokenizer.encode``.

    Returns:
        list: The embedding values converted with ``.tolist()``.
    """
    # .tolist() converts every element to a native Python number in one call,
    # replacing the per-element ``.item()`` loop.
    return tokenizer.encode(text).tolist()
    
# Path of the CSV that is scanned in the next cell.
inputs = '../NewData/jobs_all.csv'
# NOTE(review): `output` is not referenced by any cell visible in this notebook;
# the final cell writes to a hard-coded file name instead - confirm which is intended.
output = 'tokenized_data_csv'

# Despite its name, `tokenizer` is a SentenceTransformer model; `encode` calls
# its .encode() method to embed text.
tokenizer = SentenceTransformer("msmarco-distilbert-dot-v5")

# Keyphrase extraction pipeline; the model and its tokenizer are both loaded
# from `model_name` inside KeyphraseExtractionPipeline.__init__.
model_name = "ml6team/keyphrase-extraction-kbir-inspec"
extractor = KeyphraseExtractionPipeline(model=model_name)

In [2]:
# Lazily scan the input CSV, dropping rows whose description is null
df = pl.scan_csv(inputs).drop_nulls(subset='description')

In [3]:
# Execute the lazy query and display the resulting DataFrame (runs over the whole input).
df.collect()

job_id,company_id,title,description,max_salary,med_salary,min_salary,pay_period,formatted_work_type,location,state_abbr,applies,original_listed_time,remote_allowed,views,job_posting_url,application_url,application_type,expiry,closed_time,formatted_experience_level,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,scraped,industry_id,industry_name,skill_abr,skill_name,company_name,company_description,company_size,company_state,company_country,company_city,company_zipcode,company_address,company_url,company_industry,company_employee_count,company_follower_count,company_time_recorded
i64,i64,str,str,f64,f64,f64,str,str,str,str,i64,f64,i64,i64,str,str,str,f64,f64,str,str,f64,str,i64,str,str,str,i64,i64,str,str,str,str,str,i64,str,str,str,str,str,str,str,i64,i64,i64
3757940104,553718,"""Hearing Care P…","""Overview Hear…",,5250.0,,"""MONTHLY""","""Full-time""","""Little River, …","""SC""",,1.7000e12,,9,"""https://www.li…","""https://career…","""OffsiteApply""",1.7000e12,,"""Entry level""",,1.7000e12,"""careers-demant…",0,"""FULL_TIME""","""USD""","""BASE_SALARY""",1699138101,17,"""Medical Equipm…","""OTHR""","""Other""","""HearingLife""","""HearingLife is…",5,"""New Jersey""","""US""","""Somerset""","""8873""","""580 Howard Ave…","""https://www.li…","""Retail""",1171,11417,1699131481
3757940025,2192142,"""Shipping & Rec…","""Metalcraft of …",,,,,"""Full-time""","""Beaver Dam, WI…","""WI""",,1.7000e12,,,"""https://www.li…","""https://www.cl…","""OffsiteApply""",1.7000e12,,,,1.7000e12,"""www.click2appl…",0,"""FULL_TIME""",,,1699085420,135,"""Industrial Mac…","""MGMT""","""Management""","""Metalcraft of …","""Headquartered …",4,"""WI""","""US""","""Mayville""","""53050""","""1000 Metalcraf…","""https://www.li…","""Industrial Mac…",300,2923,1699085420
3757938019,474443,"""Manager, Engin…",""" The TSUBAKI n…",,,,,"""Full-time""","""Bessemer, AL""","""AL""",,1.7000e12,,,"""https://www.li…","""https://www.cl…","""OffsiteApply""",1.7000e12,,,"""Bachelor's Deg…",1.7000e12,"""www.click2appl…",0,"""FULL_TIME""",,,1699085644,147,"""Automation Mac…","""ENG""","""Engineering""","""U.S. Tsubaki P…","""U.S. Tsubaki P…",4,"""Illinois""","""US""","""Wheeling""","""60090""","""301E Marquardt…","""https://www.li…","""Automation Mac…",314,8487,1699085644
3757938018,18213359,"""Cook""","""descriptionTit…",,22.27,,"""HOURLY""","""Full-time""","""Aliso Viejo, C…","""CA""",,1.7000e12,,1,"""https://www.li…","""https://jobs.a…","""OffsiteApply""",1.7000e12,,"""Entry level""",,1.7000e12,"""jobs.apploi.co…",0,"""FULL_TIME""","""USD""","""BASE_SALARY""",1699087461,100,"""Non-profit Org…","""MGMT""","""Management""","""Episcopal Comm…","""Episcopal Comm…",4,"""California""","""US""","""Altadena""","""91001""","""2212 El Molino…","""https://www.li…","""Non-profit Org…",36,305,1692863696
3757937095,437225,"""Principal Clou…","""Job Summary At…",275834.0,,205956.0,"""YEARLY""","""Full-time""","""United States""","""US""",,1.7000e12,1,,"""https://www.li…","""https://career…","""OffsiteApply""",1.7000e12,,"""Mid-Senior lev…",,1.7000e12,"""careers.iherb.…",0,"""FULL_TIME""","""USD""","""BASE_SALARY""",1699085346,27,"""Retail""","""IT""","""Information Te…","""iHerb, LLC""","""iHerb is on a …",5,"""California""","""US""","""Irvine""","""92618""","""17400 Laguna C…","""https://www.li…","""Retail""",1227,51933,1692863726
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
133114754,77766802,"""Sales Manager""","""Are you a dyna…",,,,,"""Full-time""","""Santa Clarita,…","""CA""",,1.6900e12,,,"""https://www.li…",,"""ComplexOnsiteA…",1.7000e12,,,,1.6900e12,,0,"""FULL_TIME""",,,1,92,"""Truck Transpor…","""SALE""","""Sales""","""CargoLogin.""","""CargoLogin is …",1,"""California""","""US""","""Santa Clarita""","""28358""","""Constellation …","""https://www.li…","""Transportation…",15,159,1692834657
108965123,,"""Office Adminis…","""A fast-fashion…",,,,,"""Full-time""","""New York, NY""","""NY""",2,1.7000e12,,4,"""https://www.li…",,"""ComplexOnsiteA…",1.7000e12,,,,1.7000e12,,0,"""FULL_TIME""",,,1699044401,19,"""Retail Apparel…","""ADM""","""Administrative…",,,,,,,,,,,,,
102339515,52132271,"""Franchise Owne…","""DuctVentz is a…",,,,,"""Full-time""","""Greater Boston…","""MA""",,1.7000e12,,,"""https://www.li…",,"""SimpleOnsiteAp…",1.7000e12,,,,1.7000e12,,0,"""FULL_TIME""",,,1699063495,91,"""Consumer Servi…","""BD""","""Business Devel…","""DryerVentz - D…","""DryerVentz ope…",1,"""0""","""US""","""New York""","""0""","""0""","""https://www.li…","""Consumer Servi…",7,28,1699063495
85008768,,"""Licensed Insur…","""While many ind…",52000.0,,45760.0,"""YEARLY""","""Full-time""","""Chico, CA""","""CA""",,1.6900e12,,5,"""https://www.li…",,"""ComplexOnsiteA…",1.7100e12,,,,1.6900e12,,1,"""FULL_TIME""","""USD""","""BASE_SALARY""",1,42,"""Insurance""","""SALE""","""Sales""",,,,,,,,,,,,,


In [4]:
# Restrict the query to its first 100 rows; the cells below operate on this subset only.
df_head = df.head(100)

In [5]:
# Derive two columns from each job description:
#   Encoding   - list of floats returned by encode()
#   Keyphrases - string returned by the keyphrase extractor
description_col = pl.col("description")
new_df = df_head.with_columns(
    [
        description_col.map_elements(
            encode, return_dtype=pl.List(pl.Float64)
        ).alias('Encoding'),
        description_col.map_elements(
            extractor, return_dtype=pl.Utf8
        ).alias('Keyphrases'),
    ]
)

In [6]:
# Execute the lazy query; this is the step where encode() and the extractor
# actually run on each description. The result is kept and then displayed.
collected = new_df.collect()
collected

job_id,company_id,title,description,max_salary,med_salary,min_salary,pay_period,formatted_work_type,location,state_abbr,applies,original_listed_time,remote_allowed,views,job_posting_url,application_url,application_type,expiry,closed_time,formatted_experience_level,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,scraped,industry_id,industry_name,skill_abr,skill_name,company_name,company_description,company_size,company_state,company_country,company_city,company_zipcode,company_address,company_url,company_industry,company_employee_count,company_follower_count,company_time_recorded,Encoding,Keyphrases
i64,i64,str,str,f64,f64,f64,str,str,str,str,i64,f64,i64,i64,str,str,str,f64,f64,str,str,f64,str,i64,str,str,str,i64,i64,str,str,str,str,str,i64,str,str,str,str,str,str,str,i64,i64,i64,list[f64],str
3757940104,553718,"""Hearing Care P…","""Overview Hear…",,5250.0,,"""MONTHLY""","""Full-time""","""Little River, …","""SC""",,1.7000e12,,9,"""https://www.li…","""https://career…","""OffsiteApply""",1.7000e12,,"""Entry level""",,1.7000e12,"""careers-demant…",0,"""FULL_TIME""","""USD""","""BASE_SALARY""",1699138101,17,"""Medical Equipm…","""OTHR""","""Other""","""HearingLife""","""HearingLife is…",5,"""New Jersey""","""US""","""Somerset""","""8873""","""580 Howard Ave…","""https://www.li…","""Retail""",1171,11417,1699131481,"[0.437406, 0.201353, … -0.396777]","""Demant Group;H…"
3757940025,2192142,"""Shipping & Rec…","""Metalcraft of …",,,,,"""Full-time""","""Beaver Dam, WI…","""WI""",,1.7000e12,,,"""https://www.li…","""https://www.cl…","""OffsiteApply""",1.7000e12,,,,1.7000e12,"""www.click2appl…",0,"""FULL_TIME""",,,1699085420,135,"""Industrial Mac…","""MGMT""","""Management""","""Metalcraft of …","""Headquartered …",4,"""WI""","""US""","""Mayville""","""53050""","""1000 Metalcraf…","""https://www.li…","""Industrial Mac…",300,2923,1699085420,"[0.146735, 0.7893, … -0.224941]","""Lean manufactu…"
3757938019,474443,"""Manager, Engin…",""" The TSUBAKI n…",,,,,"""Full-time""","""Bessemer, AL""","""AL""",,1.7000e12,,,"""https://www.li…","""https://www.cl…","""OffsiteApply""",1.7000e12,,,"""Bachelor's Deg…",1.7000e12,"""www.click2appl…",0,"""FULL_TIME""",,,1699085644,147,"""Automation Mac…","""ENG""","""Engineering""","""U.S. Tsubaki P…","""U.S. Tsubaki P…",4,"""Illinois""","""US""","""Wheeling""","""60090""","""301E Marquardt…","""https://www.li…","""Automation Mac…",314,8487,1699085644,"[0.062573, 0.603351, … -0.013101]","""SU;automotive …"
3757938018,18213359,"""Cook""","""descriptionTit…",,22.27,,"""HOURLY""","""Full-time""","""Aliso Viejo, C…","""CA""",,1.7000e12,,1,"""https://www.li…","""https://jobs.a…","""OffsiteApply""",1.7000e12,,"""Entry level""",,1.7000e12,"""jobs.apploi.co…",0,"""FULL_TIME""","""USD""","""BASE_SALARY""",1699087461,100,"""Non-profit Org…","""MGMT""","""Management""","""Episcopal Comm…","""Episcopal Comm…",4,"""California""","""US""","""Altadena""","""91001""","""2212 El Molino…","""https://www.li…","""Non-profit Org…",36,305,1692863696,"[-0.176124, 0.375635, … -0.442146]","""Covington;The;…"
3757937095,437225,"""Principal Clou…","""Job Summary At…",275834.0,,205956.0,"""YEARLY""","""Full-time""","""United States""","""US""",,1.7000e12,1,,"""https://www.li…","""https://career…","""OffsiteApply""",1.7000e12,,"""Mid-Senior lev…",,1.7000e12,"""careers.iherb.…",0,"""FULL_TIME""","""USD""","""BASE_SALARY""",1699085346,27,"""Retail""","""IT""","""Information Te…","""iHerb, LLC""","""iHerb is on a …",5,"""California""","""US""","""Irvine""","""92618""","""17400 Laguna C…","""https://www.li…","""Retail""",1227,51933,1692863726,"[-0.300732, 0.269816, … -0.227037]","""Amazon Web Ser…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
3757930108,73013724,"""Sales Manager""","""Position Summa…",350000.0,,125000.0,"""YEARLY""","""Full-time""","""Poughkeepsie, …","""NY""",,1.7000e12,1,,"""https://www.li…",,"""ComplexOnsiteA…",1.7000e12,,"""Mid-Senior lev…",,1.7000e12,,0,"""FULL_TIME""","""USD""","""BASE_SALARY""",1699131404,43,"""Financial Serv…","""SALE""","""Sales""","""J. Galt""","""Our mission is…",3,"""Indiana""","""US""","""Indianapolis""","""46268""","""3500 Depauw Bl…","""https://www.li…","""Financial Serv…",251,28671,1692684533,"[-0.483471, 0.56759, … -0.581419]","""consulting ser…"
3757929978,3603539,"""Maintenance Te…","""Job Type Full…",,,,,"""Full-time""","""Milwaukee, WI""","""WI""",,1.7000e12,,,"""https://www.li…","""https://recrui…","""OffsiteApply""",1.7000e12,,"""Entry level""",,1.7000e12,"""recruiting.pay…",0,"""FULL_TIME""",,,1699131782,44,"""Real Estate""","""MGMT""","""Management""","""Roers Companie…","""Roers Companie…",3,"""Minnesota""","""US""","""Plymouth""","""55447""","""Two Carlson Pa…","""https://www.li…","""Real Estate""",193,6680,1699131782,"[0.212587, 0.508245, … -0.447771]","""Community With…"
3757929967,6577380,"""CDL Class A Dr…","""We are looking…",,28.0,,"""HOURLY""","""Full-time""","""Oakland, CA""","""CA""",,1.7000e12,,,"""https://www.li…","""https://recrui…","""OffsiteApply""",1.7000e12,,,,1.7000e12,"""recruit.zoho.c…",0,"""FULL_TIME""","""USD""","""BASE_SALARY""",1699137050,92,"""Truck Transpor…","""MGMT""","""Management""","""Conexwest""","""Conexwest is t…",2,"""California""","""US""","""Oakland""","""94607""","""2100 Engineer …","""https://www.li…","""Retail Office …",48,2848,1699084529,"[-0.130982, 0.256337, … -0.668071]","""Disability ins…"
3757929959,73013724,"""Sales Manager""","""Position Summa…",350000.0,,125000.0,"""YEARLY""","""Full-time""","""Winchester, VA…","""VA""",,1.7000e12,1,,"""https://www.li…",,"""ComplexOnsiteA…",1.7000e12,,"""Mid-Senior lev…",,1.7000e12,,0,"""FULL_TIME""","""USD""","""BASE_SALARY""",1699087166,43,"""Financial Serv…","""SALE""","""Sales""","""J. Galt""","""Our mission is…",3,"""Indiana""","""US""","""Indianapolis""","""46268""","""3500 Depauw Bl…","""https://www.li…","""Financial Serv…",251,28671,1692684533,"[-0.483471, 0.56759, … -0.581419]","""consulting ser…"


In [7]:
# Persist the collected DataFrame, including the Encoding and Keyphrases columns, as Parquet.
collected.write_parquet('processed.parquet')