# Overview

In [1]:
from csv import DictReader
from importlib import reload

RAND_SEED=123

with open("./job_postings_training_set.csv") as fin:
    reader = DictReader(fin)
    
    dataset = list(reader)


In [2]:
from utils import size_of
    
# print(len(dataset), dataset[0])
n_samples = len(dataset)
n_positive_samples = size_of(_ for _ in dataset if _['fraudulent'] == '1')
print(f"#sample = {n_samples}, #(fraudulent=1)={n_positive_samples} ({n_positive_samples/n_samples:.2%})")

#sample = 17828, #(fraudulent=1)=862 (4.84%)


## Low-hanging fruit

Let's see if any feature has high correlation to the predition.

In [8]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from itertools import chain


import utils
reload(utils)

interested_fields =  ['telecommuting', 'has_company_logo', 'has_questions', 'required_experience', 'required_education', 'employment_type', 'industry', 'function']


input_enc = OneHotEncoder()

categorical_dataset: list[list[str]] = [
    [d[fn] for fn in interested_fields]
    for d in dataset 
]

X = input_enc.fit_transform(categorical_dataset).toarray()


feature_names:dict[str, str] = dict(
    chain.from_iterable(
        ((f"{fn}__{index}", f"{fn}: {cat}") for index,cat in enumerate(category)) for fn, category in zip(interested_fields, input_enc.categories_)
    )
)

print(f"{feature_names=}")


target_enc = OrdinalEncoder(categories=[['0','1']])
target_set: list[list[str]] = [   [d['fraudulent']] for d in dataset ]

Y = target_enc.fit_transform(target_set)

print(f"{X.shape=}, {Y.shape=}")


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=RAND_SEED)

print(f"{X_train.shape=} {y_train.shape=} | {X_test.shape=} {y_test.shape=}")




feature_names={'telecommuting__0': 'telecommuting: 0', 'telecommuting__1': 'telecommuting: 1', 'has_company_logo__0': 'has_company_logo: 0', 'has_company_logo__1': 'has_company_logo: 1', 'has_questions__0': 'has_questions: 0', 'has_questions__1': 'has_questions: 1', 'required_experience__0': 'required_experience: ', 'required_experience__1': 'required_experience: Associate', 'required_experience__2': 'required_experience: Director', 'required_experience__3': 'required_experience: Entry level', 'required_experience__4': 'required_experience: Executive', 'required_experience__5': 'required_experience: Internship', 'required_experience__6': 'required_experience: Mid-Senior level', 'required_experience__7': 'required_experience: Not Applicable', 'required_education__0': 'required_education: ', 'required_education__1': 'required_education: Associate Degree', 'required_education__2': "required_education: Bachelor's Degree", 'required_education__3': 'required_education: Certification', 'requi

In [9]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.utils.validation import column_or_1d
from lightgbm import LGBMClassifier

import utils
reload(utils)

def solve(reg, **kwargs):
    reg.fit(X_train, column_or_1d(y_train), **kwargs)

    y_pred = reg.predict(X_test)
    res = utils.acc_and_f1(y_test, y_pred)

    print(f"{res!s} {reg!s}")

    return reg



solve(LogisticRegressionCV(cv=3, random_state=RAND_SEED, refit=True, solver='newton-cholesky'))
solve(LogisticRegressionCV(cv=3, random_state=RAND_SEED, refit=True, solver='liblinear'))
best_lr = solve(LogisticRegressionCV(cv=3, random_state=RAND_SEED, refit=True, solver='liblinear', class_weight='balanced'))
solve(LogisticRegressionCV(cv=3, random_state=RAND_SEED, refit=True, solver='liblinear', class_weight={0:1, 1:100}))
best = solve(LGBMClassifier(random_state=RAND_SEED), feature_name=list(feature_names.keys()))






Result(acc=0.9553, f1=0.2866, precision=0.7273, recall=0.1784) LogisticRegressionCV(cv=3, random_state=123, solver='newton-cholesky')
Result(acc=0.9553, f1=0.2866, precision=0.7273, recall=0.1784) LogisticRegressionCV(cv=3, random_state=123, solver='liblinear')
Result(acc=0.8048, f1=0.2927, precision=0.1790, recall=0.8030) LogisticRegressionCV(class_weight='balanced', cv=3, random_state=123,
                     solver='liblinear')
Result(acc=0.6250, f1=0.1970, precision=0.1104, recall=0.9145) LogisticRegressionCV(class_weight={0: 1, 1: 100}, cv=3, random_state=123,
                     solver='liblinear')
Result(acc=0.9729, f1=0.6742, precision=0.8523, recall=0.5576) LGBMClassifier(random_state=123)


In [10]:
#
feature_importances = sorted(zip(best.feature_importances_, best.feature_name_), reverse=True)

for importance, name in feature_importances[:15]:
    print( importance, feature_names[name], )    

104 required_education: High School or equivalent
102 has_questions: 0
101 has_company_logo: 0
98 required_experience: Entry level
91 required_education: 
86 employment_type: Full-time
84 industry: Oil & Energy
71 required_experience: Mid-Senior level
67 required_experience: 
64 required_education: Bachelor's Degree
62 function: Engineering
59 required_experience: Not Applicable
59 employment_type: Part-time
55 function: Administrative
53 function: Sales


In [11]:
coef = best_lr.coef_.ravel()
best_feature_index = coef.ravel().argsort()[::-1]


names= list(feature_names.values())
for index in best_feature_index[:15]:
    print(f"{coef[index]:.04f} {names[index]}")

5.3360 industry: Oil & Energy
5.2076 industry: Media Production
4.7393 industry: Animation
4.6438 industry: Computer Networking
4.2183 industry: Hospitality
4.1158 industry: Design
4.0570 industry: Leisure, Travel & Tourism
4.0001 industry: Accounting
3.9713 function: Distribution
3.7581 industry: Computer & Network Security
3.5490 industry: Biotechnology
3.5076 industry: Health, Wellness and Fitness
3.5034 industry: Hospital & Health Care
3.4176 function: Business Development
3.3749 industry: Information Services


# Deep learning method

In [38]:
import torch
from torch import nn 
from torch.utils.data import DataLoader, TensorDataset

torch.manual_seed(RAND_SEED)

# Get cpu, gpu or mps device for training.
device = ( "cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")

# Define model
class NeuralNetwork(nn.Module):
    def __init__(self, hsize=512):
        super().__init__()
        # self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(204, hsize),
            nn.ReLU(),
            nn.Linear(hsize, hsize),
            nn.ReLU(),
            nn.Linear(hsize, 2)
        )

    def forward(self, x):
        # x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits



train_dataloader = DataLoader(TensorDataset(torch.FloatTensor(X_train), torch.LongTensor(y_train.ravel())), batch_size=64, shuffle=True)
test_dataloader = DataLoader(TensorDataset(torch.FloatTensor(X_test), torch.LongTensor(y_test.ravel())), batch_size=640000000)



def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # print(X, y)

        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    loss, current = loss.item(), (batch + 1) * len(X)
    print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


from torcheval.metrics.functional import (binary_f1_score, binary_accuracy, binary_precision, binary_recall)
from utils import Result 

def test(dataloader, model, loss_fn, name=""):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)

            pred_max_indices = pred.argmax(1)
            test_loss += loss_fn(pred, y).item()
            # correct += (pred_max_indices == y).type(torch.float).sum().item()

            result = Result(
                binary_accuracy(pred_max_indices, y).item(),
                binary_f1_score(pred_max_indices, y).item(),
                binary_precision(pred_max_indices, y).item(),
                binary_recall(pred_max_indices, y).item(),
            )
    test_loss /= num_batches
    # correct /= size

    print(f"Test Error: Avg loss {test_loss:>8f}\n{result} {name}\n")


model = NeuralNetwork(256).to(device)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


def run(loss_fn, epochs=7, name=""):

    for t in range(epochs):
        print(f"Epoch {t+1}\n")
        train(train_dataloader, model, loss_fn, optimizer)
        test(test_dataloader, model, loss_fn, name or repr(loss_fn))


Using cuda device
NeuralNetwork(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=204, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=2, bias=True)
  )
)


In [41]:
run(nn.CrossEntropyLoss(), name = "CrossEntropyLoss")

Epoch 1

loss: 0.041175  [12285/12479]
Test Error: Avg loss 0.093124
Result(acc=0.9748, f1=0.6966, precision=0.8807, recall=0.5762) CrossEntropyLoss

Epoch 2

loss: 0.031357  [12285/12479]
Test Error: Avg loss 0.103372
Result(acc=0.9746, f1=0.6881, precision=0.8982, recall=0.5576) CrossEntropyLoss

Epoch 3

loss: 0.047696  [12285/12479]
Test Error: Avg loss 0.097866
Result(acc=0.9731, f1=0.6786, precision=0.8492, recall=0.5651) CrossEntropyLoss

Epoch 4

loss: 0.114220  [12285/12479]
Test Error: Avg loss 0.107139
Result(acc=0.9727, f1=0.6894, precision=0.8060, recall=0.6022) CrossEntropyLoss

Epoch 5

loss: 0.044064  [12285/12479]
Test Error: Avg loss 0.102599
Result(acc=0.9744, f1=0.6949, precision=0.8667, recall=0.5799) CrossEntropyLoss

Epoch 6

loss: 0.032117  [12285/12479]
Test Error: Avg loss 0.100742
Result(acc=0.9736, f1=0.6928, precision=0.8368, recall=0.5911) CrossEntropyLoss

Epoch 7

loss: 0.013574  [12285/12479]
Test Error: Avg loss 0.102879
Result(acc=0.9748, f1=0.7020, p

In [35]:
run(nn.CrossEntropyLoss(weight=torch.Tensor([1, 20]).to(device)), name="CrossEntropyLoss(weighed)")

Epoch 1
-------------------------------
loss: 0.267659  [12285/12479]
Test Error: Avg loss 0.366129
Result(acc=0.9123, f1=0.4874, precision=0.3452, recall=0.8290) CrossEntropyLoss(weighed)

Epoch 2
-------------------------------
loss: 0.107637  [12285/12479]
Test Error: Avg loss 0.430486
Result(acc=0.9303, f1=0.5423, precision=0.4048, recall=0.8216) CrossEntropyLoss(weighed)

Epoch 3
-------------------------------
loss: 0.933977  [12285/12479]
Test Error: Avg loss 0.606592
Result(acc=0.9518, f1=0.6055, precision=0.5143, recall=0.7361) CrossEntropyLoss(weighed)

Epoch 4
-------------------------------
loss: 0.288457  [12285/12479]
Test Error: Avg loss 0.445586
Result(acc=0.8815, f1=0.4215, precision=0.2793, recall=0.8587) CrossEntropyLoss(weighed)

Epoch 5
-------------------------------
loss: 0.074084  [12285/12479]
Test Error: Avg loss 0.506686
Result(acc=0.9306, f1=0.5437, precision=0.4062, recall=0.8216) CrossEntropyLoss(weighed)

Epoch 6
-------------------------------
loss: 0.18

In [40]:
run(nn.CrossEntropyLoss(label_smoothing =0.05), name="CrossEntropyLoss(label_smoothing)")

Epoch 1

loss: 0.169387  [12285/12479]
Test Error: Avg loss 0.180617
Result(acc=0.9740, f1=0.7036, precision=0.8250, recall=0.6134) CrossEntropyLoss(label_smoothing)

Epoch 2

loss: 0.149342  [12285/12479]
Test Error: Avg loss 0.183062
Result(acc=0.9733, f1=0.6772, precision=0.8621, recall=0.5576) CrossEntropyLoss(label_smoothing)

Epoch 3

loss: 0.178648  [12285/12479]
Test Error: Avg loss 0.182981
Result(acc=0.9723, f1=0.6636, precision=0.8538, recall=0.5428) CrossEntropyLoss(label_smoothing)

Epoch 4

loss: 0.226543  [12285/12479]
Test Error: Avg loss 0.178986
Result(acc=0.9740, f1=0.6918, precision=0.8571, recall=0.5799) CrossEntropyLoss(label_smoothing)

Epoch 5

loss: 0.145692  [12285/12479]
Test Error: Avg loss 0.180950
Result(acc=0.9738, f1=0.6943, precision=0.8413, recall=0.5911) CrossEntropyLoss(label_smoothing)

Epoch 6

loss: 0.173156  [12285/12479]
Test Error: Avg loss 0.184273
Result(acc=0.9710, f1=0.6764, precision=0.7714, recall=0.6022) CrossEntropyLoss(label_smoothing)