## 1. Import Libraries

In [24]:
import numpy as np
import pandas as pd
import seaborn as sns

import sklearn
from category_encoders import CountEncoder
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

!python -m spacy download en_core_web_md
import spacy

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 500)

random_state = 4012

Collecting en-core-web-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)
     --------------------------------------- 42.8/42.8 MB 27.3 MB/s eta 0:00:00


  hasattr(torch, "has_mps")
  and torch.has_mps  # type: ignore[attr-defined]
2023-11-13 11:16:03.128385: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2023-11-13 11:16:03.129130: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-11-13 11:16:10.498258: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2023-11-13 11:16:10.500984: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cublas64_11.dll'; dlerror: cublas64_11.dll not found
2023-11-13 11:16:10.503678: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cublasLt64_11.dll'; dlerror: cublasLt64_11.dll not found
2023-11-13 11:16:10.538159: W tensorflow/stream_

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


## 2. Train Test Split

In [2]:
path = "../Raw Data Files/final_job_posting.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,fraudulent,location_country,have_company_profile,have_requirements,have_benefits,full_text,have_category,high_salary,flesch_score_bin_ft,fk_grade_level_bin_ft,text_len
0,0,1,0,other,entry level,unspecified,0,US,1,1,0,marketing intern food fifty two weve created g...,1,0,7,1,269
1,0,1,0,full time,unspecified,unspecified,0,NZ,1,1,1,customer service cloud video production ninety...,1,0,6,2,572
2,0,1,0,contract,senior level,unspecified,0,US,1,1,0,commissioning machinery assistant cma valor se...,0,0,7,2,235
3,0,1,0,full time,middle level,undergraduate,0,US,1,1,1,account executive washington dc passion improv...,1,0,7,1,501
4,0,1,1,full time,middle level,undergraduate,0,US,1,1,1,bill review manager spotsource solutions llc g...,1,0,7,1,376


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23925 entries, 0 to 23924
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   telecommuting          23925 non-null  int64 
 1   has_company_logo       23925 non-null  int64 
 2   has_questions          23925 non-null  int64 
 3   employment_type        23925 non-null  object
 4   required_experience    23925 non-null  object
 5   required_education     23925 non-null  object
 6   fraudulent             23925 non-null  int64 
 7   location_country       23925 non-null  object
 8   have_company_profile   23925 non-null  int64 
 9   have_requirements      23925 non-null  int64 
 10  have_benefits          23925 non-null  int64 
 11  full_text              23925 non-null  object
 12  have_category          23925 non-null  int64 
 13  high_salary            23925 non-null  int64 
 14  flesch_score_bin_ft    23925 non-null  int64 
 15  fk_grade_level_bin_

In [28]:
embeddings = pd.read_csv('../Processed Data Files/out.csv')
new = pd.concat([df.reset_index(drop=True).drop(columns=['full_text']),embeddings.reset_index(drop=True)], axis=1)

# train test split
X = new.drop('fraudulent',axis=1)
y = new['fraudulent']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state,stratify=y)

In [30]:
# save files
X_train.to_csv('../Processed Data Files/X_train.csv',index=False)
X_test.to_csv('../Processed Data Files/X_test.csv',index=False)
y_train.to_csv('../Processed Data Files/y_train.csv',index=False)
y_test.to_csv('../Processed Data Files/y_test.csv',index=False)

## 3. Local Preprocessor

- embeddings: full_text
- frequency encoding: category
- one-hot encoding: employment_type, required_experience, required_education, location_country, 
- scale: flesch_score_bin_ft, fk_grade_level_bin_ft, text_len

### 3.1 Text Embedding

#### 3.1.1 Pre-trained RoBERTa

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel

In [None]:
# in colab, select "Change runtime type," and choose "GPU" as the hardware accelerator
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
device

In [None]:
model_name = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [None]:
model = model.to(device)

In [None]:
# ~10 mins
# if don't use GPU, much slower!!
feature_list = []

with torch.no_grad():
    i=0
    for batch_idx in range(0,len(df),20):
      batch = texts[batch_idx:batch_idx+20]
      tokenized = tokenizer.batch_encode_plus(list(batch), add_special_tokens=True, max_length=512, padding='max_length', truncation=True, return_attention_mask=True)
      #check 20 sample each time
      input_ids = torch.tensor(tokenized['input_ids']).to(device)
      used_attention_mask = torch.tensor(tokenized['attention_mask']).to(device)
      last_hidden_states = model(input_ids, attention_mask=used_attention_mask)
      #Get the embeddings for the [CLS] tag (position is 0)
      features = last_hidden_states[0][:,0,:].cpu().numpy()
      feature_list.append(features)
      i = i+1
      if i % 100 == 0:
        print(i*20)

In [None]:
features = np.vstack(feature_list)
features.shape

#### 3.1.2 SpaCy

In [None]:
class SpacyEmbeddings(TransformerMixin,BaseEstimator):
    def __init__(self, model="en_core_web_md"):
        self.model = model

    def fit(self,X,y=None):
        return self

    def transform(self,X):
        nlp = spacy.load(self.model)
        return np.concatenate([nlp(doc).vector.reshape(1,-1) for doc in X])

In [None]:
transformer = SpacyEmbeddings()
embeddings = transformer.fit_transform(X_train.full_text)
print(embeddings.shape)

### 3.2 One Hot Encoding

In [6]:
onehot = OneHotEncoder(
    categories = [
            ['full time', 'contract', 'part time','flexi','other','unspecified'], # employment_type
            ['entry level', 'middle level', 'senior level', 'unspecified'], # required_experience
            ['high school or vocational degree', 'undergraduate', 'graduate', 'unspecified'], # required_education
    ],
    handle_unknown = 'ignore',  # <- Ignore unknown values (i.e. don't create a column for them)
)

In [7]:
# try
pd.DataFrame.sparse.from_spmatrix(
    data = onehot.fit_transform(X_train[['employment_type', 'required_experience','required_education']]),  # <- sparse matrix
    columns = onehot.get_feature_names_out(),                                                               # <- encoded features
    index = X_train.index
)

Unnamed: 0,employment_type_full time,employment_type_contract,employment_type_part time,employment_type_flexi,employment_type_other,employment_type_unspecified,required_experience_entry level,required_experience_middle level,required_experience_senior level,required_experience_unspecified,required_education_high school or vocational degree,required_education_undergraduate,required_education_graduate,required_education_unspecified
11718,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4652,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9647,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
11916,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
21552,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3135,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
9141,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4625,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7653,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


### 3.3 Frequency Encoding

In [25]:
freq_encoder = CountEncoder()

In [26]:
# try
pd.DataFrame(
    data = freq_encoder.fit_transform(X_train['location_country']),
    columns = ['location_country']
)

Unnamed: 0,location_country
11718,6233
4652,6233
9647,138
11916,6233
21552,2250
...,...
3135,6233
9141,265
4625,6233
7653,6233


### 3.4 Numeric Scaling

In [10]:
scaler = StandardScaler()

In [11]:
# try
pd.DataFrame(
    data = scaler.fit_transform(X_train[['flesch_score_bin_ft','fk_grade_level_bin_ft', 'text_len']]),
    columns = ['flesch_score_bin_ft','fk_grade_level_bin_ft', 'text_len']
)

Unnamed: 0,flesch_score_bin_ft,fk_grade_level_bin_ft,text_len
0,-3.319066,0.591816,-0.360478
1,0.718542,-0.233150,1.249379
2,-3.319066,0.591816,1.746971
3,0.718542,-0.233150,-0.360478
4,-0.627328,-0.233150,-1.187358
...,...,...,...
16742,-0.627328,-0.233150,0.642024
16743,-0.627328,-0.233150,-0.528781
16744,0.718542,-1.058116,-0.199492
16745,0.718542,-0.233150,-0.558051
