## 1. Import Libraries

In [99]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import sklearn
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.metrics import (accuracy_score, confusion_matrix, f1_score,
                             precision_score, recall_score, roc_auc_score)

!python -m spacy download en_core_web_md
import spacy

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 500)

Collecting en-core-web-lg==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl (587.7 MB)
     -                                      17.8/587.7 MB 18.7 MB/s eta 0:00:31


ERROR: Could not install packages due to an OSError: [Errno 28] No space left on device


[notice] A new release of pip is available: 23.0.1 -> 23.3
[notice] To update, run: python.exe -m pip install --upgrade pip


## 2. Train Test Split

In [15]:
# Load the prepared job-posting table; the path is relative to this notebook.
path = "../Raw Data Files/final_job_posting.csv"
df = pd.read_csv(filepath_or_buffer=path)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,fraudulent,location_country,title_contain_urgent,have_company_profile,have_requirements,have_benefits,full_text,flesch_score_bin_ft,fk_grade_level_bin_ft,high_salary,category,text_len
0,0,1,0,other,internship,bachelors degree,0,US,0,1,1,0,marketing intern food fifty two weve created g...,7,1,0,advertising marketing,267
1,0,1,0,full time,not applicable,bachelors degree,0,NZ,0,1,1,1,customer service cloud video production ninety...,6,2,0,advertising customer marketing service success,567
2,0,1,0,unspecified,not applicable,high school or equivalent,0,US,0,1,1,0,commissioning machinery assistant cma valor se...,7,2,0,unspecified,235
3,0,1,0,full time,mid senior level,bachelors degree,0,US,0,1,1,1,account executive washington dc passion improv...,7,1,0,computer sale software,497
4,0,1,1,full time,mid senior level,bachelors degree,0,US,0,1,1,1,bill review manager spotsource solutions llc g...,7,1,0,care health hospital provider,370


In [39]:
# Summarise the frame: column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23925 entries, 0 to 23924
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   telecommuting          23925 non-null  int64 
 1   has_company_logo       23925 non-null  int64 
 2   has_questions          23925 non-null  int64 
 3   employment_type        23925 non-null  object
 4   required_experience    23925 non-null  object
 5   required_education     23925 non-null  object
 6   fraudulent             23925 non-null  int64 
 7   location_country       23925 non-null  object
 8   title_contain_urgent   23925 non-null  int64 
 9   have_company_profile   23925 non-null  int64 
 10  have_requirements      23925 non-null  int64 
 11  have_benefits          23925 non-null  int64 
 12  full_text              23925 non-null  object
 13  flesch_score_bin_ft    23925 non-null  int64 
 14  fk_grade_level_bin_ft  23925 non-null  int64 
 15  high_salary        

In [16]:
# train test split
from sklearn.model_selection import train_test_split

# Separate the target column from the predictors.
y = df['fraudulent']
X = df.drop(columns='fraudulent')

# Hold out 30% of the rows; stratifying on y keeps the class ratio the same
# in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2023,
    stratify=y,
)

## 3. Local Preprocessor

- embeddings: full_text
- frequency encoding: category
- one-hot encoding: employment_type, required_experience, required_education, location_country
- scale: flesch_score_bin_ft, fk_grade_level_bin_ft, text_len

### 3.1 Text Embedding

In [17]:
# Raw text column that every embedding / normalisation variant below starts from.
texts = df['full_text']

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Download every NLTK resource the helpers in this cell rely on:
#   - averaged_perceptron_tagger: used by nltk.pos_tag
#   - wordnet: used by WordNetLemmatizer
#   - punkt: used by word_tokenize (previously missing, so a clean
#     environment failed with a LookupError on the first tokenize call)
for resource in ('averaged_perceptron_tagger', 'wordnet', 'punkt'):
    nltk.download(resource)

lemmatizer, stemmer = WordNetLemmatizer(), PorterStemmer()

def pos_tagger(nltk_tag):
    """Translate a POS tag from ``nltk.pos_tag`` into a WordNet POS constant.

    The tag is matched on its leading letter: J -> adjective, V -> verb,
    N -> noun, R -> adverb. Any other tag yields ``None``.
    """
    prefix_to_wordnet = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if nltk_tag.startswith(prefix):
            return wordnet_pos
    return None

def lemma(text):
    """Return ``text`` with each token lemmatised according to its POS tag.

    The text is tokenised and POS-tagged with NLTK. Tokens whose tag maps to
    a WordNet POS (via ``pos_tagger``) are lemmatised with that POS; tokens
    with no mapping are kept unchanged. The result is joined with single spaces.
    """
    tagged = nltk.pos_tag(nltk.word_tokenize(text))
    token_pos_pairs = [(token, pos_tagger(tag)) for token, tag in tagged]
    return " ".join(
        token if wn_pos is None else lemmatizer.lemmatize(token, wn_pos)
        for token, wn_pos in token_pos_pairs
    )

def stem(text):
    """Return ``text`` with every token replaced by its Porter stem.

    Tokens come from ``word_tokenize`` and are re-joined with single spaces.
    """
    return " ".join(stemmer.stem(token) for token in word_tokenize(text))

# Three normalised variants of the raw text: stemmed, lemmatised, and
# lemmatised-then-stemmed.
texts_stem = texts.map(stem)
texts_lemma = texts.map(lemma)
# Reuse the lemmatised series instead of calling texts.map(lemma) a second
# time: the result is the same and the POS-tagging pass is not repeated.
texts_lemma_stem = texts_lemma.map(stem)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
# texts.to_csv('../Raw Data Files/texts.csv', index=False)
# texts_stem.to_csv('../Raw Data Files/texts_stem.csv', index=False)
# texts_lemma.to_csv('../Raw Data Files/texts_lemma.csv', index=False)
# texts_lemma_stem.to_csv('../Raw Data Files/texts_lemma_stem.csv', index=False)

#### 3.1.1 Pre-trained RoBERTa

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel

In [None]:
# in colab, select "Change runtime type," and choose "GPU" as the hardware accelerator
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
device

In [None]:
# Load the tokenizer and the encoder from the same checkpoint name so that
# token ids produced by the tokenizer match the model's vocabulary.
model_name = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [None]:
# Put the model on the device selected above; the input tensors built in the
# embedding loop are moved to the same device.
model = model.to(device)

In [None]:
# ~10 mins
# if don't use GPU, much slower!!
# Number of documents sent through the model per forward pass. Kept in one
# place so the slice width, the loop stride and the progress print always agree.
BATCH_SIZE = 20
feature_list = []

model.eval()  # inference only: make sure dropout is switched off
with torch.no_grad():  # no gradients needed, saves memory and time
    # Iterate over `texts` itself (not `df`) so the bound matches what is sliced.
    for batch_no, start in enumerate(range(0, len(texts), BATCH_SIZE), start=1):
        # .iloc makes the positional slice explicit regardless of the index labels.
        batch = texts.iloc[start:start + BATCH_SIZE]
        tokenized = tokenizer.batch_encode_plus(list(batch), add_special_tokens=True, max_length=512, padding='max_length', truncation=True, return_attention_mask=True)
        input_ids = torch.tensor(tokenized['input_ids']).to(device)
        used_attention_mask = torch.tensor(tokenized['attention_mask']).to(device)
        last_hidden_states = model(input_ids, attention_mask=used_attention_mask)
        # Keep the hidden state of the first token of every sequence (position 0,
        # RoBERTa's <s> token, the counterpart of BERT's [CLS]) as the document embedding.
        features = last_hidden_states[0][:, 0, :].cpu().numpy()
        feature_list.append(features)
        # Progress: print the number of documents processed every 100 batches.
        if batch_no % 100 == 0:
            print(batch_no * BATCH_SIZE)

In [None]:
# Join the per-batch 2-D arrays along the row axis into one embedding matrix
# with one row per document.
features = np.concatenate(feature_list, axis=0)
features.shape

#### 3.1.2 SpaCy

In [102]:
class SpacyEmbeddings(TransformerMixin,BaseEstimator):
    """Scikit-learn transformer that turns texts into spaCy document vectors.

    Parameters
    ----------
    model : str, default "en_core_web_md"
        Name of the spaCy pipeline passed to ``spacy.load``.
    """

    def __init__(self, model="en_core_web_md"):
        self.model = model

    def fit(self,X,y=None):
        """Nothing is learned from the data; returns ``self`` for chaining."""
        return self

    def transform(self,X):
        """Embed every text in ``X``.

        Parameters
        ----------
        X : iterable of str
            A list, Series, 1-D array, or a single-column 2-D frame/array
            (the shape a ColumnTransformer passes when the column is selected
            with a list). 2-D input is flattened row by row.

        Returns
        -------
        numpy.ndarray of shape (n_texts, vector_size)
            One ``doc.vector`` per input text; zero rows for empty input.
        """
        # Materialise plain iterables (e.g. generators) so they can be flattened.
        if not isinstance(X, (pd.Series, pd.DataFrame, np.ndarray)):
            X = list(X)
        docs = np.asarray(X, dtype=object).ravel()

        # The pipeline is loaded here rather than stored on the instance, so the
        # estimator itself only carries the model name.
        nlp = spacy.load(self.model)

        if docs.size == 0:
            # np.vstack / np.concatenate reject an empty list; return an
            # explicitly empty matrix with the right number of columns instead.
            return np.empty((0, nlp.vocab.vectors_length), dtype=np.float32)

        # nlp.pipe processes the texts as a stream in batches, which is much
        # faster than calling nlp(text) once per document.
        return np.vstack([doc.vector for doc in nlp.pipe(docs)])
    
transformer = SpacyEmbeddings()
# `texts` is already the `full_text` Series (texts = df['full_text']), so it is
# passed directly; `texts.full_text` raises AttributeError on a Series.
embeddings = transformer.fit_transform(texts)
print(embeddings.shape)

(23925, 300)


### 3.2 Categorical Encoders

In [83]:
# label encoding v2 for the grades textstat
# from sklearn.preprocessing import LabelEncoder

# label_encoder = LabelEncoder()

# df['flesch_score_bin_ft_encoded'] = label_encoder.fit_transform(df['flesch_score_bin_ft'])
# df['fk_grade_level_bin_ft_encoded'] = label_encoder.fit_transform(df['fk_grade_level_bin_ft'])

# df['flesch_score_bin_cat_encoded'] = label_encoder.fit_transform(df['flesch_score_bin_cat'])
# df['fk_grade_level_bin_cat_encoded'] = label_encoder.fit_transform(df['fk_grade_level_bin_cat'])

# df.head()

In [28]:
# one-hot encoding
from sklearn.preprocessing import OneHotEncoder

# One-hot encoder with an explicit category list per input column. The lists
# are given in the same order as the columns passed to fit/transform below,
# and the order inside each list fixes the order of the output columns.
onehot = OneHotEncoder(
    categories = [
            ['full time', 'contract', 'part time','temporary','other','unspecified'], # employment_type
            ['unspecified', 'mid senior level', 'entry level', 'associate', 'not applicable', 'director', 'internship', 'executive'], # required_experience
            ['unspecified', "bachelors degree", 'high school or equivalent', "masters degree", 'associate degree', 'certification', 'some college coursework completed','vocational','professional','doctorate','some high school coursework','vocational degree', 'vocational hs diploma', 'vocational other', 'vocational hs certificate', 'vocational bachelor', 'vocational master'], # required_education
            # (category lists for the former `industry` and `function` columns were here; those columns are not encoded)
            ['US','GB','GR','CA','DE'] # location_country; any other country is not listed
     ],
     handle_unknown = 'ignore',  # <- values not listed above get no column of their own: the row is all zeros for that feature
)

# Fit on the training split only and show the resulting sparse matrix.
onehot.fit_transform(X_train[['employment_type', 'required_experience','required_education','location_country']])

<16747x36 sparse matrix of type '<class 'numpy.float64'>'
	with 52192 stored elements in Compressed Sparse Row format>

In [29]:
# Wrap the encoded training matrix in a sparse DataFrame so the generated
# columns can be read by name, keeping the original row index.
pd.DataFrame.sparse.from_spmatrix(
    onehot.fit_transform(
        X_train[['employment_type', 'required_experience','required_education','location_country']]
    ),
    index=X_train.index,
    columns=onehot.get_feature_names_out(),
)

Unnamed: 0,employment_type_full time,employment_type_contract,employment_type_part time,employment_type_temporary,employment_type_other,employment_type_unspecified,required_experience_unspecified,required_experience_mid senior level,required_experience_entry level,required_experience_associate,required_experience_not applicable,required_experience_director,required_experience_internship,required_experience_executive,required_education_unspecified,required_education_bachelors degree,required_education_high school or equivalent,required_education_masters degree,required_education_associate degree,required_education_certification,required_education_some college coursework completed,required_education_vocational,required_education_professional,required_education_doctorate,required_education_some high school coursework,required_education_vocational degree,required_education_vocational hs diploma,required_education_vocational other,required_education_vocational hs certificate,required_education_vocational bachelor,required_education_vocational master,location_country_US,location_country_GB,location_country_GR,location_country_CA,location_country_DE
5847,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1689,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
17203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19100,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13117,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
16752,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7815,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6840,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


### 3.3 Numerical Scaler

In [None]:
from sklearn.preprocessing import StandardScaler
# Unfitted scaler; it is fitted later as part of the ColumnTransformer.
scaler = StandardScaler()

In [None]:
# Combine them!
# Column groups, one per kind of treatment.
binary_columns = ['telecommuting', 'has_company_logo','has_questions', 'title_contain_urgent', 'have_company_profile', 'have_requirements', 'have_benefits','high_salary']
numerical_columns = ['flesch_score_bin_ft','fk_grade_level_bin_ft', 'text_len']
onehot_columns = ['employment_type', 'required_experience','required_education','location_country']
text_columns = ['full_text']

from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer([
    # (nickname, transformer to apply, columns to apply to)
    ('binary', 'passthrough', binary_columns),    # <- 'passthrough' says to keep them but don't apply anything
    ('numerical', scaler, numerical_columns),     # <- standardise the numeric columns
    ('onehot', onehot, onehot_columns),           # <- apply onehot encoder to the onehot_columns
    # The comma after the 'onehot' entry was missing, which made Python try to
    # *call* that tuple with the next one and raise a TypeError.
    # NOTE(review): 'passthrough' forwards the raw strings of full_text unchanged.
    # A numeric estimator such as LogisticRegression cannot consume them, so this
    # entry presumably needs an embedding transformer before the model is fitted - confirm.
    ('text', 'passthrough', text_columns),
])

## 4. Model

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Baseline pipeline: the shared preprocessor followed by a logistic regression
# with default settings.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

In [None]:
def train(x_train,y_train,model,params):
    """Tune ``model`` with a 3-fold grid search and return the best fitted estimator.

    Parameters
    ----------
    x_train : feature table passed to ``GridSearchCV.fit``.
    y_train : target values; a Series, single-column frame, array or list.
    model : estimator (or pipeline) used as the template for the search.
    params : dict mapping parameter names to the candidate values to try.

    Returns
    -------
    The estimator refitted on all of ``x_train`` with the best parameter
    combination, selected by weighted F1.
    """
    # np.ravel flattens any array-like, so the target no longer has to be a
    # pandas object exposing `.values`.
    y_flat = np.ravel(y_train)

    gridsearchcv = GridSearchCV(model, params, cv=3, scoring='f1_weighted', verbose=True, n_jobs=-1)
    gridsearchcv.fit(x_train, y_flat)

    # GridSearchCV (refit=True by default) has already refitted the winning
    # configuration on the full training data. Returning that estimator avoids
    # a second, redundant fit and leaves the caller's `model` template untouched.
    return gridsearchcv.best_estimator_

# Seed for the classifier. It was referenced below but never defined; the value
# matches the seed used for the train/test split.
random_state = 2023

# Pipeline to tune: shared preprocessing followed by a class-weighted
# logistic regression.
model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('train', LogisticRegression(random_state=random_state,class_weight='balanced')),
])

# Grid keys use the form <step name>__<parameter>. The step is called 'train',
# so the former 'training__' prefix was rejected as an invalid parameter.
# Note: the 'newton-cholesky' solver needs a recent scikit-learn release.
params = dict(
    train__C       = [0.001,0.01,0.1,1,10],
    train__solver  = ['lbfgs','liblinear','newton-cg','newton-cholesky','sag','saga'],
)

In [None]:
def evaluate(x_test,y_test,name,model,preprocessor):
    """Print test-set metrics for ``model`` and plot its confusion matrix.

    Parameters
    ----------
    x_test : raw (not yet preprocessed) test features.
    y_test : true labels of the test set.
    name : label used as the column header of the printed score table.
    model : fitted classifier exposing ``predict`` (and optionally ``predict_proba``).
    preprocessor : fitted transformer applied to ``x_test`` before predicting.

    Returns
    -------
    None. The score table is printed and the heatmap is shown.
    """
    print(model,"\n")

    # Transform once and reuse for both the labels and the scores.
    x_prepared = preprocessor.transform(x_test)
    y_pred = model.predict(x_prepared)

    # ROC AUC measures ranking quality, so it should be fed scores rather than
    # hard labels. Use the positive-class probability when the model offers a
    # two-column predict_proba; otherwise keep the previous label-based input.
    y_score = y_pred
    if hasattr(model, "predict_proba"):
        proba = np.asarray(model.predict_proba(x_prepared))
        if proba.ndim == 2 and proba.shape[1] == 2:
            y_score = proba[:, 1]

    # Built-in round() works for both numpy scalars and plain Python floats,
    # whereas calling `.round(3)` on the result fails for a plain float.
    scores = {
        "Accuracy":  round(accuracy_score(y_test,y_pred), 3),
        "Precision": round(precision_score(y_test,y_pred,average="weighted"), 3),
        "Recall":    round(recall_score(y_test,y_pred,average="weighted"), 3),
        "F1":        round(f1_score(y_test,y_pred,average="weighted"), 3),
        "ROC AUC":   round(roc_auc_score(y_test,y_score,average="weighted"), 3),
    }
    print(pd.DataFrame({name: scores}))

    # Confusion matrix: rows are actual classes, columns are predicted classes.
    cm = confusion_matrix(y_test,y_pred)
    cm_plot = sns.heatmap(cm, annot=True, fmt='g', cmap='Blues_r')
    cm_plot.set_xlabel('Predicted Values')
    cm_plot.set_ylabel('Actual Values')
    plt.show()

name = "Logistic Regression"
# Run the grid search first: `best_model` was used below without ever being created.
best_model = train(X_train, y_train, model, params)
# evaluate() takes the display name as its third argument; it was missing, which
# shifted the remaining arguments and raised a TypeError.
evaluate(X_test, y_test, name, best_model['train'], best_model["preprocess"])