In [53]:
import nltk
import numpy as np
import pandas as pd
import argparse
import string
import re
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

stemmer = nltk.stem.snowball.EnglishStemmer()

In [48]:
closed_class_stop_words = ['a','the','an','and','or','but','about','above','after','along','amid','among',\
                           'as','at','by','for','from','in','into','like','minus','near','of','off','on',\
                           'onto','out','over','past','per','plus','since','till','to','under','until','up',\
                           'via','vs','with','that','can','cannot','could','may','might','must',\
                           'need','ought','shall','should','will','would','have','had','has','having','be',\
                           'is','am','are','was','were','being','been','get','gets','got','gotten',\
                           'getting','seem','seeming','seems','seemed',\
                           'enough', 'both', 'all', 'your' 'those', 'this', 'these', \
                           'their', 'the', 'that', 'some', 'our', 'no', 'neither', 'my',\
                           'its', 'his' 'her', 'every', 'either', 'each', 'any', 'another',\
                           'an', 'a', 'just', 'mere', 'such', 'merely' 'right', 'no', 'not',\
                           'only', 'sheer', 'even', 'especially', 'namely', 'as', 'more',\
                           'most', 'less' 'least', 'so', 'enough', 'too', 'pretty', 'quite',\
                           'rather', 'somewhat', 'sufficiently' 'same', 'different', 'such',\
                           'when', 'why', 'where', 'how', 'what', 'who', 'whom', 'which',\
                           'whether', 'why', 'whose', 'if', 'anybody', 'anyone', 'anyplace', \
                           'anything', 'anytime' 'anywhere', 'everybody', 'everyday',\
                           'everyone', 'everyplace', 'everything' 'everywhere', 'whatever',\
                           'whenever', 'whereever', 'whichever', 'whoever', 'whomever' 'he',\
                           'him', 'his', 'her', 'she', 'it', 'they', 'them', 'its', 'their','theirs',\
                           'you','your','yours','me','my','mine','I','we','us','much','and/or'
                           ]

stop_words = set([*nltk.corpus.stopwords.words('english'),
                *closed_class_stop_words])

In [49]:
file_path = 'job_postings_salary.txt'  
all_postings = pd.read_csv(file_path)

all_postings.head()

Unnamed: 0,description,annual_salary,salary_range
0,Overview\n\nHearingLife is a national hearing ...,63000.0,50k-100k
1,descriptionTitle\n\n Looking for a great oppor...,46321.6,0-50k
2,"Job Summary\nAt iHerb, we are on a mission to ...",240895.0,200k-250k
3,"descriptionTitle\n\n $2,000 Sign-on Bonus Guar...",40144.0,0-50k
4,Who We Are\n\nEscalent is an award-winning dat...,61000.0,50k-100k


In [50]:
def remove_emojis(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  
                           u"\U0001F300-\U0001F5FF"  
                           u"\U0001F680-\U0001F6FF"  
                           u"\U0001F700-\U0001F77F"  
                           u"\U0001F780-\U0001F7FF"  
                           u"\U0001F800-\U0001F8FF"  
                           u"\U0001F900-\U0001F9FF"  
                           u"\U0001FA00-\U0001FA6F"  
                           u"\U0001FA70-\U0001FAFF"  
                           u"\U00002702-\U000027B0"  
                           u"\U000024C2-\U0001F251" 
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [54]:
def preprocess_text(text):
    text_no_emojis = remove_emojis(text)
    tokens = word_tokenize(text_no_emojis)
    processed_tokens = [stemmer.stem(token.lower()) for token in tokens 
                        if token.lower() not in stop_words and token not in string.punctuation]
    return ' '.join(processed_tokens)

In [55]:
all_postings['processed_description'] = all_postings['description'].apply(preprocess_text)

print(all_postings[['description', 'processed_description']].head())

                                         description  \
0  Overview\n\nHearingLife is a national hearing ...   
1  descriptionTitle\n\n Looking for a great oppor...   
2  Job Summary\nAt iHerb, we are on a mission to ...   
3  descriptionTitle\n\n $2,000 Sign-on Bonus Guar...   
4  Who We Are\n\nEscalent is an award-winning dat...   

                               processed_description  
0  overview hearinglif nation hear care compani p...  
1  descriptiontitl look great opportun develop pr...  
2  job summari iherb mission make health well acc...  
3  descriptiontitl 2,000 sign-on bonus guarante l...  
4  escal award-win data analyt advisori firm help...  


In [56]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(all_postings['processed_description'])

print(tfidf_matrix.shape)

(13350, 89302)


In [62]:
feature_names = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

print(tfidf_df.head()) 

    00       000  000000  00004258  00008592  0000estim  0001  000122  \
0  0.0  0.000000     0.0       0.0       0.0        0.0   0.0     0.0   
1  0.0  0.000000     0.0       0.0       0.0        0.0   0.0     0.0   
2  0.0  0.010181     0.0       0.0       0.0        0.0   0.0     0.0   
3  0.0  0.085519     0.0       0.0       0.0        0.0   0.0     0.0   
4  0.0  0.070960     0.0       0.0       0.0        0.0   0.0     0.0   

   00017837  0003  ...  école  éducat   él  éste  éstos  única  škoda  ǣcode  \
0       0.0   0.0  ...    0.0     0.0  0.0   0.0    0.0    0.0    0.0    0.0   
1       0.0   0.0  ...    0.0     0.0  0.0   0.0    0.0    0.0    0.0    0.0   
2       0.0   0.0  ...    0.0     0.0  0.0   0.0    0.0    0.0    0.0    0.0   
3       0.0   0.0  ...    0.0     0.0  0.0   0.0    0.0    0.0    0.0    0.0   
4       0.0   0.0  ...    0.0     0.0  0.0   0.0    0.0    0.0    0.0    0.0   

   ǣpresenc  ǣscan  
0       0.0    0.0  
1       0.0    0.0  
2       0.0    0.