In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import transformers

# Data Exploration

In [2]:
# Location of the scraped Indeed postings, relative to this notebook.
indeed_csv_path = "../Static Data/indeed.csv"
df = pd.read_csv(indeed_csv_path)
# Preview the first rows as the cell's displayed result.
df.head()

Unnamed: 0,Job Title,Job Description,Job Type,Categories,Location,City,State,Country,Zip Code,Address,...,Employer Phone,Employer Logo,Companydescription,Employer Location,Employer City,Employer State,Employer Country,Employer Zip Code,Uniq Id,Crawl Timestamp
0,Shift Manager,"<div id=""jobDescriptionText"" class=""jobsearch-...",,,"Mission Hills, CA 91345",Mission Hills,CA,United States,91345.0,,...,,https://d2q79iu7y748jz.cloudfront.net/s/_squar...,Del Taco is an American quick service restaura...,"Mission Hills, CA 91345",Mission Hills,CA,United States,91345.0,511f9a53920f4641d701d51d3589349f,2019-08-24 09:13:18 +0000
1,Operations Support Manager,"<div id=""jobDescriptionText"" class=""jobsearch-...",,,"Atlanta, GA 30342",Atlanta,GA,United States,30342.0,,...,,https://d2q79iu7y748jz.cloudfront.net/s/_logo/...,"Based in Atlanta, FOCUS Brands Inc. is an inno...",,,,United States,,4955daf0a3facbe2acb6c429ba394e6d,2019-09-19 08:16:55 +0000
2,Senior Product Manager - Data,"<div id=""jobDescriptionText"" class=""jobsearch-...",,,"Chicago, IL",Chicago,IL,United States,,,...,,,Vibes Corp. reputation was built and establish...,,,,United States,,a0e0d12df1571962b785f17f43ceae12,2019-09-18 02:13:10 +0000
3,Part-Time Office Concierge,"<div id=""jobDescriptionText"" class=""jobsearch-...",,,"Festus, MO",Festus,MO,United States,,,...,,,,,,,United States,,56e411fd731f76ac916bf4fb169250e9,2019-10-24 16:39:13 +0000
4,Print & Marketing Associate,"<div id=""jobDescriptionText"" class=""jobsearch-...",,,"Cedar Rapids, IA 52404",Cedar Rapids,IA,United States,52404.0,,...,,https://d2q79iu7y748jz.cloudfront.net/s/_logo/...,"Staples is The Worklife Fulfillment Company, h...","Cedar Rapids, IA 52404",Cedar Rapids,IA,United States,52404.0,3fff5c0ad6981bf4bff6260bd5feab63,2019-08-24 22:29:10 +0000


In [3]:
# Column labels as a plain Python list.
df.columns.tolist()

['Job Title',
 'Job Description',
 'Job Type',
 'Categories',
 'Location',
 'City',
 'State',
 'Country',
 'Zip Code',
 'Address',
 'Salary From',
 'Salary To',
 'Salary Period',
 'Apply Url',
 'Apply Email',
 'Employees',
 'Industry',
 'Company Name',
 'Employer Email',
 'Employer Website',
 'Employer Phone',
 'Employer Logo',
 'Companydescription',
 'Employer Location',
 'Employer City',
 'Employer State',
 'Employer Country',
 'Employer Zip Code',
 'Uniq Id',
 'Crawl Timestamp']

In [4]:
# Dimensions of the raw table as a (rows, columns) tuple.
df.shape

(999, 30)

In [5]:
# Per-column non-null counts and dtypes; makes it easy to spot columns that
# contain no data at all.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 30 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Job Title           999 non-null    object 
 1   Job Description     999 non-null    object 
 2   Job Type            0 non-null      float64
 3   Categories          0 non-null      float64
 4   Location            999 non-null    object 
 5   City                999 non-null    object 
 6   State               999 non-null    object 
 7   Country             999 non-null    object 
 8   Zip Code            546 non-null    object 
 9   Address             0 non-null      float64
 10  Salary From         0 non-null      float64
 11  Salary To           0 non-null      float64
 12  Salary Period       0 non-null      float64
 13  Apply Url           653 non-null    object 
 14  Apply Email         0 non-null      float64
 15  Employees           0 non-null      float64
 16  Industry

# Get Text Data

In [6]:
# Keep only the free-text fields needed for keyphrase extraction.
text_columns = ['Job Title', 'Job Description']
df_text = df[text_columns]
df_text.head()

Unnamed: 0,Job Title,Job Description
0,Shift Manager,"<div id=""jobDescriptionText"" class=""jobsearch-..."
1,Operations Support Manager,"<div id=""jobDescriptionText"" class=""jobsearch-..."
2,Senior Product Manager - Data,"<div id=""jobDescriptionText"" class=""jobsearch-..."
3,Part-Time Office Concierge,"<div id=""jobDescriptionText"" class=""jobsearch-..."
4,Print & Marketing Associate,"<div id=""jobDescriptionText"" class=""jobsearch-..."


In [7]:
# process Job Description
def process_html(txt):
    """Strip HTML tags from ``txt`` and return the remaining text.

    Only the markup itself is removed: everything from a ``<`` through the
    next ``>``, including tags that wrap across several lines. Text between
    tags, whitespace and newlines are kept as they are, and character
    entities such as ``&amp;`` are not decoded.

    Args:
        txt: HTML fragment as a string.

    Returns:
        The input string with every ``<...>`` tag deleted.
    """
    # The earlier pattern ".*?<.*?>" also swallowed any text that preceded a
    # tag on the same line, silently dropping content. Match the tag alone;
    # "[^>]*" (unlike ".") also spans newlines inside a tag.
    return re.sub(r"<[^>]*>", "", txt)

In [8]:
# Use the first posting's description as the working example.
first_description = df_text['Job Description'][0]
sample_text = process_html(first_description)
print(sample_text)

WE ARE LOOKING FOR TOP PERFORMERS TO GROW WITH US!

Del Taco needs people with the ability to handle multiple responsibilities and deliver the quality and guest service that defines our brand. While we look for you to bring the right skills and personality for a Shift Manager position, we'll also provide the training and support you need to continue to grow and advance.

Our Shift Managers support our stores and General Managers in creating an amazing guest experience, and empowering their team to be at their very best. Our Shift Managers create a culture in our restaurants that is exciting, optimistic and rewarding.

Del Taco Shift Managers have significant opportunities for growth and career advancement and are charged with supporting the General Manager with the tasks of training the next generation of leaders, guiding them to be promoted and empowering them to have a successful career.


Supports General Manager in supervising team that delivers effective results in the areas of qu

# Classification Model

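Keyphrases are extracted with the pretrained `ml6team/keyphrase-extraction-kbir-inspec` token-classification model. Model card: https://huggingface.co/ml6team/keyphrase-extraction-kbir-inspec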

In [9]:
from transformers import (
    TokenClassificationPipeline,
    AutoModelForTokenClassification,
    AutoTokenizer,
)
from transformers.pipelines import AggregationStrategy
import numpy as np

# Define keyphrase extraction pipeline
class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    """Token-classification pipeline that returns the distinct keyphrases of a text.

    The constructor takes a checkpoint identifier and loads the matching
    model and tokenizer itself; ``postprocess`` reduces the aggregated
    predictions to an array of unique phrase strings.
    """

    def __init__(self, model, *args, **kwargs):
        """Load model and tokenizer for ``model`` and initialise the parent pipeline.

        Args:
            model: Identifier handed to ``from_pretrained`` for both the
                token-classification model and its tokenizer.
            *args: Extra positional arguments forwarded to the parent class.
            **kwargs: Extra keyword arguments forwarded to the parent class.
        """
        token_classifier = AutoModelForTokenClassification.from_pretrained(model)
        matching_tokenizer = AutoTokenizer.from_pretrained(model)
        super().__init__(
            *args,
            model=token_classifier,
            tokenizer=matching_tokenizer,
            **kwargs
        )

    def postprocess(self, all_outputs):
        """Turn raw model outputs into an array of unique keyphrases.

        The parent implementation is run with the ``SIMPLE`` aggregation
        strategy; the ``"word"`` entry of each result is stripped of
        surrounding whitespace and the values are passed through
        ``np.unique``, which removes duplicates and returns them sorted.
        """
        entities = super().postprocess(
            all_outputs=all_outputs,
            aggregation_strategy=AggregationStrategy.SIMPLE,
        )
        phrases = [entity.get("word").strip() for entity in entities]
        return np.unique(phrases)

# Checkpoint identifier of the keyphrase-extraction model.
model_name = "ml6team/keyphrase-extraction-kbir-inspec"
# Building the pipeline loads both the model and its tokenizer.
extractor = KeyphraseExtractionPipeline(model_name)

In [10]:
# Run the keyphrase pipeline on the cleaned sample description. The result is
# whatever the pipeline's postprocess step returns: a de-duplicated array of
# phrase strings.
keyphrases = extractor(sample_text)

print(keyphrases)

['Company operation policies' 'Del Taco' 'GM' 'General Manager'
 'Shift Manager' 'career advancement' 'cleanliness' 'compliance'
 'disciplinary action' 'food cost' 'food handling' 'guest service'
 'labor cost' 'product preparation' 'quick service restaurant environment'
 'safety' 'sanitations standards' 'security']


In [18]:
# Tokenize the sample with the keyphrase model's own tokenizer to inspect how
# it is encoded. The recorded output warns that the sample is longer than the
# model's 512-token maximum.
# NOTE(review): the cell displays the complete token-id list; showing only its
# length (or a short slice) would keep the notebook readable.
# NOTE(review): the name `tokenizer` is also assigned in the DistilBERT cell,
# so its meaning depends on which cell ran last.
tokenizer=AutoTokenizer.from_pretrained(model_name)
tokenizer.encode(sample_text)

Token indices sequence length is longer than the specified maximum sequence length for this model (1167 > 512). Running this sequence through the model will result in indexing errors


[0,
 10284,
 10616,
 33584,
 1862,
 5089,
 17918,
 18578,
 38036,
 4322,
 3842,
 8837,
 4581,
 17345,
 382,
 328,
 50121,
 50118,
 50121,
 50118,
 21502,
 23206,
 782,
 82,
 19,
 5,
 1460,
 7,
 3679,
 1533,
 9582,
 8,
 2438,
 5,
 1318,
 8,
 4910,
 544,
 14,
 19857,
 84,
 1518,
 4,
 616,
 52,
 356,
 13,
 47,
 7,
 836,
 5,
 235,
 2417,
 8,
 6718,
 13,
 10,
 36790,
 4827,
 737,
 6,
 52,
 581,
 67,
 694,
 5,
 1058,
 8,
 323,
 47,
 240,
 7,
 535,
 7,
 1733,
 8,
 3316,
 4,
 50121,
 50118,
 50121,
 50118,
 2522,
 36790,
 1554,
 14979,
 323,
 84,
 2326,
 8,
 1292,
 1554,
 14979,
 11,
 2351,
 41,
 2770,
 4910,
 676,
 6,
 8,
 18503,
 49,
 165,
 7,
 28,
 23,
 49,
 182,
 275,
 4,
 1541,
 36790,
 1554,
 14979,
 1045,
 10,
 2040,
 11,
 84,
 4329,
 14,
 16,
 3571,
 6,
 7168,
 8,
 16044,
 4,
 50121,
 50118,
 50121,
 50118,
 21502,
 23206,
 36790,
 1554,
 14979,
 33,
 1233,
 1616,
 13,
 434,
 8,
 756,
 19026,
 8,
 32,
 1340,
 19,
 3117,
 5,
 1292,
 4827,
 19,
 5,
 8558,
 9,
 1058,
 5,
 220,
 2706,
 9,


In [17]:
# Encode sample_text with the DistilBERT tokenizer.
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Tokenizer and sequence-classification model are loaded from the same checkpoint.
distilbert_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(distilbert_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(distilbert_checkpoint)

# Displayed as the cell result: the token ids of the sample description.
tokenizer.encode(sample_text)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Token indices sequence length is longer than the specified maximum sequence length for this model (983 > 512). Running this sequence through the model will result in indexing errors


[101,
 2057,
 2024,
 2559,
 2005,
 2327,
 9567,
 2000,
 4982,
 2007,
 2149,
 999,
 3972,
 11937,
 3597,
 3791,
 2111,
 2007,
 1996,
 3754,
 2000,
 5047,
 3674,
 10198,
 1998,
 8116,
 1996,
 3737,
 1998,
 4113,
 2326,
 2008,
 11859,
 2256,
 4435,
 1012,
 2096,
 2057,
 2298,
 2005,
 2017,
 2000,
 3288,
 1996,
 2157,
 4813,
 1998,
 6180,
 2005,
 1037,
 5670,
 3208,
 2597,
 1010,
 2057,
 1005,
 2222,
 2036,
 3073,
 1996,
 2731,
 1998,
 2490,
 2017,
 2342,
 2000,
 3613,
 2000,
 4982,
 1998,
 5083,
 1012,
 2256,
 5670,
 10489,
 2490,
 2256,
 5324,
 1998,
 2236,
 10489,
 1999,
 4526,
 2019,
 6429,
 4113,
 3325,
 1010,
 1998,
 7861,
 23948,
 2037,
 2136,
 2000,
 2022,
 2012,
 2037,
 2200,
 2190,
 1012,
 2256,
 5670,
 10489,
 3443,
 1037,
 3226,
 1999,
 2256,
 7884,
 2008,
 2003,
 10990,
 1010,
 21931,
 1998,
 10377,
 2075,
 1012,
 3972,
 11937,
 3597,
 5670,
 10489,
 2031,
 3278,
 6695,
 2005,
 3930,
 1998,
 2476,
 12607,
 1998,
 2024,
 5338,
 2007,
 4637,
 1996,
 2236,
 3208,
 2007,
 1996,
 8