## 1. Import Libraries

In [7]:
import numpy as np
import pandas as pd

import sklearn

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 500)

## 2. Train Test Split

In [8]:
path = "../Raw Data Files/final_job_posting.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,fraudulent,location_country,title_contain_urgent,have_company_profile,have_requirements,have_benefits,full_text,flesch_score_bin_ft,fk_grade_level_bin_ft,high_salary,category,text_len
0,0,1,0,other,internship,bachelors degree,0,US,0,1,1,0,marketing intern food fifty two weve created g...,7,1,0,advertising marketing,267
1,0,1,0,full time,not applicable,bachelors degree,0,NZ,0,1,1,1,customer service cloud video production ninety...,6,2,0,advertising customer marketing service success,567
2,0,1,0,unspecified,not applicable,high school or equivalent,0,US,0,1,1,0,commissioning machinery assistant cma valor se...,7,2,0,unspecified,235
3,0,1,0,full time,mid senior level,bachelors degree,0,US,0,1,1,1,account executive washington dc passion improv...,7,1,0,computer sale software,497
4,0,1,1,full time,mid senior level,bachelors degree,0,US,0,1,1,1,bill review manager spotsource solutions llc g...,7,1,0,care health hospital provider,370


In [4]:
# train test split
from sklearn.model_selection import train_test_split
X = df.drop('fraudulent',axis=1)
y = df['fraudulent']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2023,stratify=y)

## 3. Pipeline

- embeddings: full_text
- frequency encoding: category
- one-hot encoding: employment_type, required_experience, required_education, location_country, 
- scale: flesch_score_bin_ft, fk_grade_level_bin_ft, text_len

### 3.1 Embedding

#### 3.1.1 Pre-trained RoBERTa

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel

In [None]:
# in colab, select "Change runtime type," and choose "GPU" as the hardware accelerator
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
device

In [None]:
model_name = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [None]:
model = model.to(device)

In [None]:
# ~10 mins
# if don't use GPU, much slower!!
feature_list = []

with torch.no_grad():
    i=0
    for batch_idx in range(0,len(df),20):
      batch = df['full_text'][batch_idx:batch_idx+20]
      tokenized = tokenizer.batch_encode_plus(list(batch), add_special_tokens=True, max_length=512, padding='max_length', truncation=True, return_attention_mask=True)
      #check 20 sample each time
      input_ids = torch.tensor(tokenized['input_ids']).to(device)
      used_attention_mask = torch.tensor(tokenized['attention_mask']).to(device)
      last_hidden_states = model(input_ids, attention_mask=used_attention_mask)
      #Get the embeddings for the [CLS] tag (position is 0)
      features = last_hidden_states[0][:,0,:].cpu().numpy()
      feature_list.append(features)
      i = i+1
      if i % 100 == 0:
        print(i*20)

In [None]:
features = np.vstack(feature_list)
features.shape

### 3.2 Encoder

In [None]:
# label encoding v2 for the grades textstat
# from sklearn.preprocessing import LabelEncoder

# label_encoder = LabelEncoder()

# df['flesch_score_bin_ft_encoded'] = label_encoder.fit_transform(df['flesch_score_bin_ft'])
# df['fk_grade_level_bin_ft_encoded'] = label_encoder.fit_transform(df['fk_grade_level_bin_ft'])

# df['flesch_score_bin_cat_encoded'] = label_encoder.fit_transform(df['flesch_score_bin_cat'])
# df['fk_grade_level_bin_cat_encoded'] = label_encoder.fit_transform(df['fk_grade_level_bin_cat'])

# df.head()

Unnamed: 0,title,department,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,location_country,title_contain_urgent,have_company_profile,have_requirements,have_benefits,full_text,flesch_score_bin_ft,fk_grade_level_bin_ft,high_salary,category,flesch_score_cat,fk_grade_level_cat,text_len,flesch_score_bin_ft_encoded,fk_grade_level_bin_ft_encoded
0,Marketing Intern,Marketing,0,1,0,Other,Internship,,Marketing and Advertising,Marketing,0,US,0,1,1,0,marketing intern food fifty two weve created g...,7,1,0,Marketing and Advertising,7,5,267,6,0
1,Customer Service - Cloud Video Production,Success,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0,NZ,0,1,1,1,customer service cloud video production ninety...,6,2,0,Success Marketing and Advertising Customer Ser...,7,2,567,5,1
2,Commissioning Machinery Assistant (CMA),unspecified,0,1,0,,,,unspecified,unspecified,0,US,0,1,1,0,commissioning machinery assistant cma valor se...,7,2,0,unspecified,7,1,235,6,1
3,Account Executive - Washington DC,Sales,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,US,0,1,1,1,account executive washington dc passion improv...,7,1,0,Sales Computer Software,6,3,497,6,0
4,Bill Review Manager,unspecified,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,US,0,1,1,1,bill review manager spotsource solutions llc g...,7,1,0,unspecified Hospital & Health Care Provider,6,3,370,6,0


In [13]:
# one-hot encoding
from sklearn.preprocessing import OneHotEncoder

onehot = OneHotEncoder(
    categories = [
            ['Full-time', 'Contract', 'Part-time','Temporary','Other','unavailable'], # employment_type
            ['unavailable', 'Mid-Senior level', 'Entry level', 'Associate', 'Not Applicable', 'Director', 'Internship', 'Executive'], # required_experience
            ['unavailable',"Bachelor's Degree",'High School or equivalent','Unspecified',"Master's Degree",'Associate Degree','Certification','Some College Coursework Completed','Vocational','Professional','Doctorate','Some High School Coursework','Vocational - Degree','Vocational - HS Diploma','Vocational - Other','None','Vocational - HS Certificate','Vocational - Bachelor','Vocational - Master'], # required_education
            # ['Information Technology and Services', 'Computer Software','Internet','Marketing and Advertising','Education Management','Financial Services','Hospital & Health Care','Consumer Services','Telecommunications','Oil & Energy','Retail','Real Estate','Accounting','Construction'], # Industry
            # ['unavailable','Information Technology','Sales','Engineering','Customer Service','Marketing','Administrative','Design','Health Care Provider','Other','Education','Management','Business Development','Accounting/Auditing','Human Resources','Project Management','Finance','Consulting','Writing/Editing','Art/Creative','Production','Quality Assurance','Data Analyst','Research','Manufacturing','Advertising','Public Relations','Legal','Distribution','Product Management','Strategy/Planning','Purchasing','General Business','Science','Supply Chain','Training','Financial Analyst','Biotech','Inventory','Information Technology','Facilities','QA','Skilled Labor','Business Development','Research Development','Science','Professional Services','Design','Product Management','Strategy/Planning','Purchasing','General Business','Supply Chain','Training','Financial Analyst','Biotech','Inventory','Facilities','QA','Skilled Labor','Research Development','Professional Services','Logistics','Nonprofit','Operations Management','Nurse','Pharmaceutical'], # function
            ['US','GB','GR','CA','DE'] # location_country
     ],
     handle_unknown = 'ignore',  # <- Ignore unknown values (i.e. don't create a column for them)
)

onehot.fit_transform(X_train[[ 'employment_type', 'required_experience','required_education','location_country']])

<18197x38 sparse matrix of type '<class 'numpy.float64'>'
	with 10347 stored elements in Compressed Sparse Row format>

In [15]:
pd.DataFrame.sparse.from_spmatrix(
    data = onehot.fit_transform(X_train[['employment_type', 'required_experience','required_education','location_country']]),  # <- sparse matrix
    columns = onehot.get_feature_names_out(),                                                                                  # <- encoded features
    index = X_train.index
)

Unnamed: 0,employment_type_Full-time,employment_type_Contract,employment_type_Part-time,employment_type_Temporary,employment_type_Other,employment_type_unavailable,required_experience_unavailable,required_experience_Mid-Senior level,required_experience_Entry level,required_experience_Associate,required_experience_Not Applicable,required_experience_Director,required_experience_Internship,required_experience_Executive,required_education_unavailable,required_education_Bachelor's Degree,required_education_High School or equivalent,required_education_Unspecified,required_education_Master's Degree,required_education_Associate Degree,required_education_Certification,required_education_Some College Coursework Completed,required_education_Vocational,required_education_Professional,required_education_Doctorate,required_education_Some High School Coursework,required_education_Vocational - Degree,required_education_Vocational - HS Diploma,required_education_Vocational - Other,required_education_None,required_education_Vocational - HS Certificate,required_education_Vocational - Bachelor,required_education_Vocational - Master,location_country_US,location_country_GB,location_country_GR,location_country_CA,location_country_DE
9990,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8927,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8656,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4259,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
23304,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10273,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
15597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5331,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 3.3 Scaler

### 3.4 Model

In [16]:
# Combine them!
binary_columns = ['telecommuting', 'has_company_logo','has_questions', 'title_contain_urgent', 'have_company_profile', 'have_requirements', 'have_benefits','high_salary']
numerical_columns = ['flesch_score_bin_ft','fk_grade_level_bin_ft', 'text_len']
onehot_columns = ['employment_type', 'required_experience','required_education','location_country']

from sklearn.compose import ColumnTransformer
encoder = ColumnTransformer([
    # (nickname, transformer to apply, columns to apply to)
    ('binary', 'passthrough', binary_columns),
    ('numerical', 'passthrough', numerical_columns),   # <- 'passthrough' says to keep them but don't apply anything
    # ('ordinal', ordinal, ordinal_columns),           # <- apply ordinal encoder to the ordinal_columns
    ('onehot', onehot, onehot_columns)                 # <- apply onehot encoder to the onehot_columns
])