In [41]:
import nltk
import numpy as np
import pandas as pd
import argparse
import string
import re
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK data this notebook relies on: the English stop-word list and
# the Punkt sentence tokenizer used by word_tokenize.  Recent NLTK releases
# load the tokenizer from the 'punkt_tab' resource instead of 'punkt', so both
# are requested to keep a fresh environment working on either side of that change.
for _resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_resource, quiet=True)

# Snowball (Porter2) stemmer shared by every preprocessing call.
stemmer = nltk.stem.snowball.EnglishStemmer()

In [42]:
# Closed-class (function) words removed in addition to NLTK's English stop
# words.  Every entry is separated by an explicit comma: adjacent string
# literals without one are silently concatenated by Python (e.g.
# 'your' 'those' -> 'yourthose'), which would drop both words from the list.
# Duplicates are harmless because the list is only ever turned into a set.
closed_class_stop_words = [
    'a', 'the', 'an', 'and', 'or', 'but', 'about', 'above', 'after', 'along', 'amid', 'among',
    'as', 'at', 'by', 'for', 'from', 'in', 'into', 'like', 'minus', 'near', 'of', 'off', 'on',
    'onto', 'out', 'over', 'past', 'per', 'plus', 'since', 'till', 'to', 'under', 'until', 'up',
    'via', 'vs', 'with', 'that', 'can', 'cannot', 'could', 'may', 'might', 'must',
    'need', 'ought', 'shall', 'should', 'will', 'would', 'have', 'had', 'has', 'having', 'be',
    'is', 'am', 'are', 'was', 'were', 'being', 'been', 'get', 'gets', 'got', 'gotten',
    'getting', 'seem', 'seeming', 'seems', 'seemed',
    'enough', 'both', 'all', 'your', 'those', 'this', 'these',
    'their', 'the', 'that', 'some', 'our', 'no', 'neither', 'my',
    'its', 'his', 'her', 'every', 'either', 'each', 'any', 'another',
    'an', 'a', 'just', 'mere', 'such', 'merely', 'right', 'no', 'not',
    'only', 'sheer', 'even', 'especially', 'namely', 'as', 'more',
    'most', 'less', 'least', 'so', 'enough', 'too', 'pretty', 'quite',
    'rather', 'somewhat', 'sufficiently', 'same', 'different', 'such',
    'when', 'why', 'where', 'how', 'what', 'who', 'whom', 'which',
    'whether', 'why', 'whose', 'if', 'anybody', 'anyone', 'anyplace',
    'anything', 'anytime', 'anywhere', 'everybody', 'everyday',
    'everyone', 'everyplace', 'everything', 'everywhere', 'whatever',
    'whenever', 'wherever', 'whichever', 'whoever', 'whomever', 'he',
    'him', 'his', 'her', 'she', 'it', 'they', 'them', 'its', 'their', 'theirs',
    'you', 'your', 'yours', 'me', 'my', 'mine', 'I', 'we', 'us', 'much', 'and/or',
]

# Union of NLTK's English stop words and the closed-class list above.
# Entries are lower-cased because preprocess_text looks tokens up via
# token.lower(); a capitalised entry such as 'I' could otherwise never match.
stop_words = {
    word.lower()
    for word in (*nltk.corpus.stopwords.words('english'), *closed_class_stop_words)
}

In [48]:
# Alternative input used by Mansheel: 'job_postings_salary.txt'
file_path = 'job_postings_salary.csv'

# Read the postings table into memory.
all_postings = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first rows as the cell output.
all_postings.head()

Unnamed: 0,combined_text,annual_salary,salary_range,salary_bin
0,Hearing Care Provider Overview\n\nHearingLife ...,63000.0,50k-100k,1
1,Cook descriptionTitle\n\n Looking for a great ...,46321.6,0-50k,0
2,Principal Cloud Security Architect (Remote) Jo...,240895.0,150k+,3
3,"Dishwasher descriptionTitle\n\n $2,000 Sign-on...",40144.0,0-50k,0
4,Insights Analyst - Auto Industry Who We Are\n\...,61000.0,50k-100k,1


In [49]:
# Compiled once instead of on every call.  Each range is a symbol-only span of
# code points; no range crosses an alphabetic script, so CJK, Hangul, Cyrillic
# etc. are left intact.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # miscellaneous symbols and pictographs
    "\U0001F680-\U0001F6FF"  # transport and map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # geometric shapes extended
    "\U0001F800-\U0001F8FF"  # supplemental arrows-C
    "\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    "\U0001FA00-\U0001FA6F"  # chess symbols
    "\U0001FA70-\U0001FAFF"  # symbols and pictographs extended-A
    # The former single range U+24C2..U+1F251 is split into its symbol-only
    # parts; the letters that lay between them are no longer matched.
    "\U000024C2-\U00002BFF"  # enclosed alphanumerics .. misc symbols and arrows (includes dingbats)
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "\U0001F000-\U0001F251"  # mahjong/domino/cards, enclosed alphanumerics and ideographs
    "]+"
)


def remove_emojis(text):
    """Return ``text`` with emoji and pictographic symbols stripped out.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` in which every run of characters matched by
        ``_EMOJI_PATTERN`` has been deleted.  All other characters, including
        non-Latin letters, are preserved unchanged.

    Raises:
        TypeError: If ``text`` is not a string (raised by ``re``).
    """
    return _EMOJI_PATTERN.sub('', text)

In [50]:
def preprocess_text(text):
    """Normalise one posting into a space-joined string of stemmed tokens.

    Steps: strip emojis, tokenize with NLTK, drop stop words (compared in
    lower case) and punctuation-only tokens, then lower-case and stem what
    remains.

    Args:
        text: Raw posting text.  Non-string values (pandas stores missing
            cells as float NaN) are treated as empty text.

    Returns:
        The processed tokens joined by single spaces; ``''`` when nothing
        survives filtering or the input is not a string.
    """
    if not isinstance(text, str):
        # A missing cell would otherwise crash inside the regex/tokenizer.
        return ''

    punctuation_chars = set(string.punctuation)
    kept_tokens = []
    for token in word_tokenize(remove_emojis(text)):
        lowered = token.lower()
        if lowered in stop_words:
            continue
        # Character-wise check: `token in string.punctuation` is a substring
        # test and lets multi-character tokens such as '...', '--' and the
        # tokenizer's quote tokens (two backticks, two apostrophes) through.
        if all(char in punctuation_chars for char in token):
            continue
        kept_tokens.append(stemmer.stem(lowered))
    return ' '.join(kept_tokens)

In [51]:
# Store a cleaned, stemmed version of every posting next to the raw text.
raw_descriptions = all_postings['combined_text']
all_postings['processed_description'] = raw_descriptions.apply(preprocess_text)

# Side-by-side preview of raw versus processed text for the first rows.
preview_columns = ['combined_text', 'processed_description']
print(all_postings[preview_columns].head())


                                       combined_text  \
0  Hearing Care Provider Overview\n\nHearingLife ...   
1  Cook descriptionTitle\n\n Looking for a great ...   
2  Principal Cloud Security Architect (Remote) Jo...   
3  Dishwasher descriptionTitle\n\n $2,000 Sign-on...   
4  Insights Analyst - Auto Industry Who We Are\n\...   

                               processed_description  
0  hear care provid overview hearinglif nation he...  
1  cook descriptiontitl look great opportun devel...  
2  princip cloud secur architect remot job summar...  
3  dishwash descriptiontitl 2,000 sign-on bonus g...  
4  insight analyst auto industri escal award-win ...  


In [47]:
# Report the (rows, columns) dimensions of the postings frame.
# NOTE(review): this cell's execution count (47) is lower than that of the
# cells above it, so its saved output may predate them — re-run after a
# kernel restart to confirm.
print(all_postings.shape)

(13350, 4)


In [52]:
# define target class and input text
# The salary bin label is stored as a categorical so its categories can be
# enumerated later via `.cat.categories`.
target = all_postings['salary_bin'].astype('category')
# Model on the cleaned text produced by preprocess_text; using the raw
# 'combined_text' column here would leave the whole preprocessing step unused.
text = all_postings['processed_description']

In [59]:
from sklearn.model_selection import train_test_split

# Fixed seed so the same rows land in each split on every run; without it the
# reported accuracies change with each execution of this cell.
SPLIT_SEED = 42

# split data into 70% training, 10% validation and 20% testing set
X_train, X_test, y_train, y_test = train_test_split(
    text, target, test_size=0.2, random_state=SPLIT_SEED
)
# 0.125 of the remaining 80% equals 10% of the full data set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.125, random_state=SPLIT_SEED
)


In [60]:
# Learn the vocabulary and IDF weights from the training split only, then
# reuse that fitted vectorizer for the validation and test splits.
tfidf = TfidfVectorizer()
# fit_transform yields the same matrix as fit() followed by transform() but
# tokenizes the training documents once instead of twice.
X_train_tfidf = tfidf.fit_transform(X_train)
# fit() returns the vectorizer itself, so this name has always referred to the
# same object as `tfidf`; it is kept because later cells use it.
tfidf_vectorizer = tfidf
X_test_tfidf = tfidf_vectorizer.transform(X_test)
X_val_tfidf = tfidf_vectorizer.transform(X_val)


In [None]:
# print the top20 words with the highest tf-idf score in each salary bin
def print_top20_words(X, y, tfidf_vectorizer, top_n=20):
    """Print the terms with the highest mean TF-IDF weight per salary bin.

    Args:
        X: pandas Series of documents, index-aligned with ``y``.
        y: pandas Series with a categorical dtype; one label per document.
        tfidf_vectorizer: A fitted vectorizer exposing ``transform`` and
            ``get_feature_names_out`` (or the older ``get_feature_names``).
        top_n: How many terms to print for each bin (default 20).

    Returns:
        None.  One line is printed per category of ``y``.
    """
    # get_feature_names() was removed from scikit-learn in 1.2; prefer the
    # replacement and fall back only when running on an old release.
    get_names = getattr(tfidf_vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = tfidf_vectorizer.get_feature_names
    feature_names = np.asarray(get_names())

    for salary_bin in y.cat.categories:
        X_salary_bin = X[y == salary_bin]
        if len(X_salary_bin) == 0:
            # A category may have no rows in this split; averaging an empty
            # matrix would only produce NaNs and a runtime warning.
            print(f'No postings in salary bin {salary_bin}; skipping.')
            continue
        X_salary_bin_tfidf = tfidf_vectorizer.transform(X_salary_bin)
        # The column-wise mean of a sparse matrix is a 1xN matrix; flatten it.
        tfidf_scores = np.asarray(X_salary_bin_tfidf.mean(axis=0)).ravel()
        top_word_indices = np.argsort(tfidf_scores)[::-1][:top_n]
        top_words = feature_names[top_word_indices]
        print(f'Top {top_n} words in salary bin {salary_bin}: {top_words}')

# Show, for the training split, the terms with the highest mean TF-IDF per salary bin.
print_top20_words(X_train, y_train, tfidf_vectorizer)

In [61]:
from itertools import product

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# tune max depth, min samples split and min samples leaf, and n_estimators
max_depth = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
n_estimators = [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]

# Same seed for every candidate so accuracy differences reflect the
# hyper-parameters rather than random forest initialisation.
TUNING_SEED = 0

best_accuracy = 0
best_max_depth = None
best_min_samples_split = None
best_min_samples_leaf = None
best_n_estimators = None

# Exhaustive search over the full grid (11 * 3 * 3 * 10 = 990 fits), scored on
# the validation split.  product() visits combinations in the same order as
# four nested loops would.
for depth, split, leaf, n in product(max_depth, min_samples_split, min_samples_leaf, n_estimators):
    rf = RandomForestClassifier(
        max_depth=depth,
        min_samples_split=split,
        min_samples_leaf=leaf,
        n_estimators=n,
        random_state=TUNING_SEED,
        n_jobs=-1,  # build trees on all cores; does not affect the fitted model
    )
    rf.fit(X_train_tfidf, y_train)
    y_val_pred = rf.predict(X_val_tfidf)
    accuracy = accuracy_score(y_val, y_val_pred)
    # Strict '>' keeps the first combination that reaches a given accuracy.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_max_depth = depth
        best_min_samples_split = split
        best_min_samples_leaf = leaf
        best_n_estimators = n

print(f'Best accuracy: {best_accuracy}')
print(f'Best max depth: {best_max_depth}')
print(f'Best min samples split: {best_min_samples_split}')
print(f'Best min samples leaf: {best_min_samples_leaf}')
print(f'Best n estimators: {best_n_estimators}')



In [None]:

# Final model: refit with the hyper-parameters selected on the validation
# split instead of hard-coded values, so the search above actually informs the
# reported test score.  The `or` fallbacks cover the case where the search
# never recorded a winner and the best_* values are still None.
clf = RandomForestClassifier(
    n_estimators=best_n_estimators or 100,
    max_depth=best_max_depth,  # None is a valid setting (unlimited depth)
    min_samples_split=best_min_samples_split or 2,
    min_samples_leaf=best_min_samples_leaf or 1,
    random_state=0,
)
clf.fit(X_train_tfidf, y_train)

# Evaluate once on the held-out test split, which played no part in tuning.
y_pred = clf.predict(X_test_tfidf)
print('Accuracy: ', accuracy_score(y_test, y_pred))

