In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('../../data/interim/cleaned_fake_job_dataset.csv')

print(df.shape)
df.head()

(17589, 17)


Unnamed: 0,title,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,country,state,city
0,Marketing Intern,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,No benefits listed,0,1,0,Other,Internship,Unknown,Unknown,Marketing,0,US,NY,New York
1,Customer Service - Cloud Video Production,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,Unknown,Marketing and Advertising,Customer Service,0,NZ,Unknown,Auckland
2,Commissioning Machinery Assistant (CMA),Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,No benefits listed,0,1,0,Unknown,Unknown,Unknown,Unknown,Unknown,0,US,IA,Wever
3,Account Executive - Washington DC,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,US,DC,Washington
4,Bill Review Manager,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,US,FL,Fort Worth


In [3]:
df.isnull().sum()

title                  0
company_profile        0
description            0
requirements           0
benefits               0
telecommuting          0
has_company_logo       0
has_questions          0
employment_type        0
required_experience    0
required_education     0
industry               0
function               0
fraudulent             0
country                0
state                  0
city                   0
dtype: int64

In [4]:
df['fraudulent'].value_counts()

fraudulent
0    16734
1      855
Name: count, dtype: int64

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Combine relevant text columns
df['combined_text'] = (
    df['title'] + ' ' +
    df['company_profile'] + ' ' +
    df['description'] + ' ' +
    df['requirements'] + ' ' +
    df['benefits']
)


In [6]:
df.nunique()

title                  11231
company_profile         1710
description            14801
requirements           11968
benefits                6205
telecommuting              2
has_company_logo           2
has_questions              2
employment_type            6
required_experience        8
required_education        14
industry                 132
function                  38
fraudulent                 2
country                   91
state                    325
city                    2388
combined_text          16065
dtype: int64

In [7]:
categorical_columns = ['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country', 'state', 'city']
for col in categorical_columns:
    print()
    print(df[col].value_counts())


employment_type
Full-time    11414
Unknown       3427
Contract      1514
Part-time      772
Temporary      237
Other          225
Name: count, dtype: int64

required_experience
Unknown             6961
Mid-Senior level    3768
Entry level         2631
Associate           2265
Not Applicable      1070
Director             382
Internship           372
Executive            140
Name: count, dtype: int64

required_education
Unknown                              8008
Bachelor's Degree                    5096
High School or equivalent            1987
Unspecified                          1366
Master's Degree                       416
Associate Degree                      263
Certification                         165
Some College Coursework Completed     100
Professional                           73
Vocational                             47
Some High School Coursework            27
Doctorate                              26
Vocational - HS Diploma                 9
Vocational - Degree           

In [8]:
df['industry'].value_counts()

industry
Unknown                                4836
Information Technology and Services    1708
Computer Software                      1362
Internet                               1054
Education Management                    819
                                       ... 
Sporting Goods                            1
Museums and Institutions                  1
Shipbuilding                              1
Alternative Dispute Resolution            1
Ranching                                  1
Name: count, Length: 132, dtype: int64

In [9]:
# Step 1: Group rare categories
industry_counts = df['industry'].value_counts()
rare_industries = industry_counts[industry_counts < 50].index
df['industry_grouped'] = df['industry'].replace(rare_industries, 'Other')

In [10]:
industry_pivot = (
    df.groupby('industry_grouped')['fraudulent']
      .agg(['count', 'sum', 'mean'])  # count=total postings, sum=# fraud, mean=fraud rate
      .sort_values('mean', ascending=False)
      .reset_index()
)

industry_pivot.rename(columns={
    'count': 'total_jobs',
    'sum': 'fraud_jobs',
    'mean': 'fraud_rate'
}, inplace=True)

print(industry_pivot)


                       industry_grouped  total_jobs  fraud_jobs  fraud_rate
0                          Oil & Energy         286         108    0.377622
1                            Accounting         159          57    0.358491
2             Leisure, Travel & Tourism          72          20    0.277778
3                           Hospitality          87          14    0.160920
4                           Real Estate         166          23    0.138554
5          Health, Wellness and Fitness         123          15    0.121951
6                Hospital & Health Care         483          49    0.101449
7                    Telecommunications         324          26    0.080247
8                     Consumer Services         347          24    0.069164
9                         Entertainment          73           5    0.068493
10              Staffing and Recruiting         127           8    0.062992
11                      Human Resources         102           6    0.058824
12     Trans

In [11]:
df['function'].value_counts()

function
Unknown                   6362
Information Technology    1730
Sales                     1446
Engineering               1339
Customer Service          1176
Marketing                  815
Administrative             613
Design                     336
Health Care Provider       326
Other                      325
Education                  325
Management                 306
Business Development       226
Accounting/Auditing        210
Human Resources            201
Project Management         183
Finance                    165
Consulting                 138
Writing/Editing            131
Art/Creative               131
Production                 115
Product Management         113
Quality Assurance          110
Advertising                 90
Business Analyst            83
Data Analyst                82
Public Relations            76
Manufacturing               73
General Business            68
Research                    50
Strategy/Planning           46
Legal                       44

In [12]:
function_counts = df['function'].value_counts()
rare_functions = function_counts[function_counts < 50].index
df['function_grouped'] = df['function'].replace(rare_functions, 'Other')

In [13]:
df['country'].value_counts()

country
US    10451
GB     2329
GR      938
CA      450
DE      382
      ...  
HR        1
SV        1
JM        1
KZ        1
KH        1
Name: count, Length: 91, dtype: int64

In [14]:
country_counts = df['country'].value_counts()
rare_countries = country_counts[country_counts < 50].index
df['country_grouped'] = df['country'].replace(rare_countries, 'Other')

In [15]:
df['state'].value_counts()

state
Unknown    2540
CA         2010
NY         1228
LND         991
TX          955
           ... 
RM            1
AA            1
OV            1
AN            1
NLE           1
Name: count, Length: 325, dtype: int64

In [16]:
state_counts = df['state'].value_counts()
rare_states = state_counts[state_counts < 10].index
df['state_grouped'] = df['state'].replace(rare_states, 'Other')

In [17]:
df['city'].value_counts()

city
Unknown                 2051
London                  1052
New York                 655
Athens                   540
San Francisco            471
                        ... 
Central NJ Preferred       1
TULSA                      1
Kalivia Thorikou           1
Arcadia                    1
DRAPER                     1
Name: count, Length: 2388, dtype: int64

In [18]:
city_counts = df['city'].value_counts()
rare_cities = city_counts[city_counts < 10].index
df['city_grouped'] = df['city'].replace(rare_cities, 'Other')

In [19]:
df.head()

Unnamed: 0,title,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,...,fraudulent,country,state,city,combined_text,industry_grouped,function_grouped,country_grouped,state_grouped,city_grouped
0,Marketing Intern,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,No benefits listed,0,1,0,Other,Internship,...,0,US,NY,New York,"Marketing Intern We're Food52, and we've creat...",Unknown,Marketing,US,NY,New York
1,Customer Service - Cloud Video Production,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,...,0,NZ,Unknown,Auckland,Customer Service - Cloud Video Production 90 S...,Marketing and Advertising,Customer Service,NZ,Unknown,Auckland
2,Commissioning Machinery Assistant (CMA),Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,No benefits listed,0,1,0,Unknown,Unknown,...,0,US,IA,Wever,Commissioning Machinery Assistant (CMA) Valor ...,Unknown,Unknown,US,IA,Other
3,Account Executive - Washington DC,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,...,0,US,DC,Washington,Account Executive - Washington DC Our passion ...,Computer Software,Sales,US,DC,Washington
4,Bill Review Manager,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,...,0,US,FL,Fort Worth,Bill Review Manager SpotSource Solutions LLC i...,Hospital & Health Care,Health Care Provider,US,FL,Fort Worth


In [20]:
df.columns

Index(['title', 'company_profile', 'description', 'requirements', 'benefits',
       'telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent', 'country', 'state', 'city', 'combined_text',
       'industry_grouped', 'function_grouped', 'country_grouped',
       'state_grouped', 'city_grouped'],
      dtype='object')

In [21]:
# save data to optimize further augmentation
df.to_csv('../../data/processed/preprocessed_job_dataset.csv', index=False)

In [22]:
df_meta = df[['telecommuting','has_company_logo', 'has_questions', 'employment_type', 'required_experience', 'required_education', 'industry_grouped', 'function_grouped', 'country_grouped', 'state_grouped', 'city_grouped']]
df_meta.head()

Unnamed: 0,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry_grouped,function_grouped,country_grouped,state_grouped,city_grouped
0,0,1,0,Other,Internship,Unknown,Unknown,Marketing,US,NY,New York
1,0,1,0,Full-time,Not Applicable,Unknown,Marketing and Advertising,Customer Service,NZ,Unknown,Auckland
2,0,1,0,Unknown,Unknown,Unknown,Unknown,Unknown,US,IA,Other
3,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,US,DC,Washington
4,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,US,FL,Fort Worth


In [23]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Custom order for ordinal features
experience_order = [
    'Internship',
    'Entry level',
    'Associate',
    'Mid-Senior level',
    'Director',
    'Executive',
    'Not Applicable',
    'Unknown'
]

education_order = [
    'Some High School Coursework',
    'High School or equivalent',
    'Vocational - HS Diploma',
    'Some College Coursework Completed',
    'Associate Degree',
    'Vocational',
    'Vocational - Degree',
    'Certification',
    "Bachelor's Degree",
    "Master's Degree",
    'Professional',
    'Doctorate',
    'Unspecified',
    'Unknown'
]

preprocessor = ColumnTransformer(
    transformers=[
        # Ordinal encoding for ordered columns
        ('ord', OrdinalEncoder(categories=[experience_order, education_order]), 
         ['required_experience', 'required_education']),
        
        # One-hot encoding for nominal columns
        ('ohe', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False),
         ['employment_type','industry_grouped','function_grouped', 'country_grouped', 'state_grouped', 'city_grouped'])
    ],
    remainder='passthrough'  # Keep other features 
)


In [24]:
y = df['fraudulent']
X_meta = preprocessor.fit_transform(df_meta)

In [25]:
import re
from bs4 import BeautifulSoup
import html

def clean_text(text):
    t = BeautifulSoup(text, "html.parser").get_text()
    t = html.unescape(t)
    t = re.sub(r"#URL_[^#]+#", "", t)
    t = re.sub(r"\s+", " ", t).strip()
    
    return t

df['cleaned_text'] = df['combined_text'].apply(lambda x: clean_text(x))

In [26]:
# Vectorize Using TFIDF
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
X_text_tfidf = tfidf.fit_transform(df['cleaned_text'])

In [27]:
from scipy.sparse import hstack

X_final = hstack([X_text_tfidf, X_meta])  

### Modelling

In [28]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42, stratify=y)

In [30]:
clf_lr = LogisticRegression(class_weight='balanced', max_iter=1000)

clf_lr.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [31]:
y_pred = clf_lr.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.97      0.98      3347
           1       0.57      0.91      0.70       171

    accuracy                           0.96      3518
   macro avg       0.78      0.94      0.84      3518
weighted avg       0.97      0.96      0.97      3518



In [32]:
probs = clf_lr.predict_proba(X_test)[:, 1]

for t in [0.5, 0.6, 0.7, 0.8]:
    preds = (probs >= t).astype(int)
    p, r, f1, _ = precision_recall_fscore_support(y_test, preds, average='binary')
    print(f"Threshold: {t:.2f} → Precision: {p:.2f}, Recall: {r:.2f}, F1: {f1:.2f}")


Threshold: 0.50 → Precision: 0.57, Recall: 0.91, F1: 0.70
Threshold: 0.60 → Precision: 0.66, Recall: 0.90, F1: 0.76
Threshold: 0.70 → Precision: 0.74, Recall: 0.87, F1: 0.80
Threshold: 0.80 → Precision: 0.83, Recall: 0.79, F1: 0.81


In [33]:
clf_rf = RandomForestClassifier()
clf_rf.fit(X_train, y_train)

y_pred = clf_rf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3347
           1       0.99      0.62      0.76       171

    accuracy                           0.98      3518
   macro avg       0.99      0.81      0.88      3518
weighted avg       0.98      0.98      0.98      3518



In [34]:
clf_sgd = SGDClassifier()
clf_sgd.fit(X_train, y_train)

y_pred = clf_sgd.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3347
           1       0.75      0.82      0.78       171

    accuracy                           0.98      3518
   macro avg       0.87      0.90      0.89      3518
weighted avg       0.98      0.98      0.98      3518



In [35]:
ratio = 16734/855
ratio

19.571929824561405

In [36]:
clf_xgb = XGBClassifier(scale_pos_weight=ratio)
clf_xgb.fit(X_train, y_train)

y_pred = clf_xgb.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3347
           1       0.86      0.81      0.83       171

    accuracy                           0.98      3518
   macro avg       0.92      0.90      0.91      3518
weighted avg       0.98      0.98      0.98      3518



## Track Experiments using MLFlow

In [37]:
models = [
    (
        "Logistic Regression", 
        LogisticRegression(class_weight='balanced', max_iter=1000), 
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "Logistic Regression with Threshold Tuning", 
        LogisticRegression(class_weight='balanced', max_iter=1000), 
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "Random Forest", 
        RandomForestClassifier(), 
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "SGDClassifier",
        SGDClassifier(), 
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "XGBClassifier",
        XGBClassifier(scale_pos_weight=16734/855), 
        (X_train, y_train),
        (X_test, y_test)
    )
]

In [38]:
reports = []

for model_name, model, train_set, test_set in models:
    X_train = train_set[0]
    y_train = train_set[1]
    X_test = test_set[0]
    y_test = test_set[1]
    
    model.fit(X_train, y_train)
    
    if model_name == "Logistic Regression with Threshold Tuning":
        probs = model.predict_proba(X_test)[:, 1]
        y_pred = (probs >= 0.8).astype(int)
    else:
        y_pred = model.predict(X_test)
        
    report = classification_report(y_test, y_pred, output_dict=True)
    reports.append(report)

In [39]:
reports[0]

{'0': {'precision': 0.995072374499538,
  'recall': 0.9653420974006574,
  'f1-score': 0.9799818016378526,
  'support': 3347.0},
 '1': {'precision': 0.5719557195571956,
  'recall': 0.9064327485380117,
  'f1-score': 0.7013574660633484,
  'support': 171.0},
 'accuracy': 0.9624786810687891,
 'macro avg': {'precision': 0.7835140470283668,
  'recall': 0.9358874229693346,
  'f1-score': 0.8406696338506006,
  'support': 3518.0},
 'weighted avg': {'precision': 0.9745058742166669,
  'recall': 0.9624786810687891,
  'f1-score': 0.9664386630979891,
  'support': 3518.0}}

In [40]:
import mlflow
import mlflow.sklearn
import mlflow.xgboost

In [43]:
# Initialize MLflow

from mlflow.tracking import MlflowClient

client = MlflowClient()
experiment = client.get_experiment_by_name("Job Classifier")

mlflow.set_experiment("Fake Job Detection")
mlflow.set_tracking_uri("http://127.0.0.1:5000/")

for i, element in enumerate(models):
    model_name = element[0]
    model = element[1]
    report = reports[i]
    
    with mlflow.start_run(run_name=model_name):        
        mlflow.log_param("model", model_name)
        mlflow.log_metric('accuracy', report['accuracy'])
        mlflow.log_metric('recall_class_1', report['1']['recall'])
        mlflow.log_metric('recall_class_0', report['0']['recall'])
        mlflow.log_metric('f1_score_macro', report['macro avg']['f1-score'])  
        mlflow.log_metric('precision_class_1', report['1']['precision'])
        mlflow.log_metric('precision_class_0', report['0']['precision'])
        mlflow.log_metric('f1_score_class_1', report['1']['f1-score'])
        mlflow.log_metric('f1_score_class_0', report['0']['f1-score'])
        mlflow.log_metric('precision_macro', report['macro avg']['precision'])
        mlflow.log_metric('recall_macro', report['macro avg']['recall'])

        
        if "XGB" in model_name:
            mlflow.xgboost.log_model(model, "model")
        elif "CatBoost" in model_name:
            mlflow.catboost.log_model(model, "model")
        else:
            mlflow.sklearn.log_model(model, "model")  



🏃 View run Logistic Regression at: http://127.0.0.1:5000/#/experiments/218890209698209928/runs/b4eb788d0418415aabcf7f6cf2ec0633
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/218890209698209928




🏃 View run Logistic Regression with Threshold Tuning at: http://127.0.0.1:5000/#/experiments/218890209698209928/runs/80cae8ce93f54760bf57941fb5ff13a5
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/218890209698209928




🏃 View run Random Forest at: http://127.0.0.1:5000/#/experiments/218890209698209928/runs/8dc5b8d16cb641ccb4461e785c3fee42
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/218890209698209928




🏃 View run SGDClassifier at: http://127.0.0.1:5000/#/experiments/218890209698209928/runs/405c9f4d4b81464198ba26a68acc6716
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/218890209698209928


  self.get_booster().save_model(fname)


🏃 View run XGBClassifier at: http://127.0.0.1:5000/#/experiments/218890209698209928/runs/357a2bead50c45bcac583b6d44f1e5c5
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/218890209698209928


In [45]:
model_name = 'XGB-weight-scale'
run_id=input('Please type RunID')
model_uri = f"runs:/{run_id}/model"

mlflow.register_model(model_uri=model_uri, name=model_name)

Please type RunID 357a2bead50c45bcac583b6d44f1e5c5


Registered model 'XGB-weight-scale' already exists. Creating a new version of this model...
2025/08/31 16:12:20 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGB-weight-scale, version 3
Created version '3' of model 'XGB-weight-scale'.


<ModelVersion: aliases=[], creation_timestamp=1756631540297, current_stage='None', deployment_job_state=<ModelVersionDeploymentJobState: current_task_name='', job_id='', job_state='DEPLOYMENT_JOB_CONNECTION_STATE_UNSPECIFIED', run_id='', run_state='DEPLOYMENT_JOB_RUN_STATE_UNSPECIFIED'>, description='', last_updated_timestamp=1756631540297, metrics=None, model_id=None, name='XGB-weight-scale', params=None, run_id='357a2bead50c45bcac583b6d44f1e5c5', run_link='', source='models:/m-5ce2d0cfbfb2477798d9159caa696114', status='READY', status_message=None, tags={}, user_id='', version='3'>