In [None]:
import pandas as pd

# Load data
df = pd.read_csv("fake_job_postings.csv")
df.head()


Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


PHASE 2: Data Preprocessing

In [None]:
# Drop ID
df.drop(columns=['job_id'], inplace=True)


In [None]:
# Text columns
text_cols = [
    'title', 'company_profile', 'description',
    'requirements', 'benefits'
]

df[text_cols] = df[text_cols].fillna('')


In [None]:
# Categorical columns
cat_cols = [
    'location', 'department', 'employment_type',
    'required_experience', 'required_education',
    'industry', 'function'
]

df[cat_cols] = df[cat_cols].fillna('Unknown')


In [None]:
# Combine text
df['combined_text'] = df['title'] + ' ' + df['company_profile'] + ' ' + df['description'] + ' ' + df['requirements'] + ' ' + df['benefits']


Unnamed: 0,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,combined_text
0,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,Unknown,Unknown,Marketing,0,"Marketing Intern We're Food52, and we've creat..."
1,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,Unknown,Marketing and Advertising,Customer Service,0,Customer Service - Cloud Video Production 90 S...
2,Commissioning Machinery Assistant (CMA),"US, IA, Wever",Unknown,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,Unknown,Unknown,Unknown,Unknown,Unknown,0,Commissioning Machinery Assistant (CMA) Valor ...
3,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,Account Executive - Washington DC Our passion ...
4,Bill Review Manager,"US, FL, Fort Worth",Unknown,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,Bill Review Manager SpotSource Solutions LLC i...


PHASE 3: Feature Engineering

In [61]:
X_text = df['combined_text']
X_meta = df[['telecommuting', 'has_company_logo', 'has_questions']]
y = df['fraudulent']


In [None]:
#splitting
from sklearn.model_selection import train_test_split

X_text_train, X_text_test, X_meta_train, X_meta_test, y_train, y_test = train_test_split(
    X_text, X_meta, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


PHASE 4: NLP Processing (Core ML Part)

In [None]:
# Improved TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=7000,
    ngram_range=(1, 2)
)

X_text_train_tfidf = tfidf.fit_transform(X_text_train)
X_text_test_tfidf = tfidf.transform(X_text_test)


In [64]:
from scipy.sparse import hstack

X_train_final = hstack([X_text_train_tfidf, X_meta_train])
X_test_final = hstack([X_text_test_tfidf, X_meta_test])



PHASE 5: Model Training

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

model = LogisticRegression( # lbfgs, optimiser by default, signmoid activation function by default as well
    max_iter=2000,
    C=0.5,
    solver='liblinear',
    class_weight={0: 1, 1: 2.5}
)
model.fit(X_train_final, y_train)
    

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.5
,fit_intercept,True
,intercept_scaling,1
,class_weight,"{0: 1, 1: 2.5}"
,random_state,
,solver,'liblinear'
,max_iter,2000


PHASE 6: Evaluation (CRITICAL)

In [74]:
# Evaluation with threshold tuning
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

y_prob = model.predict_proba(X_test_final)[:, 1]

# Precision-focused threshold
THRESHOLD = 0.40

y_pred = (y_prob >= THRESHOLD).astype(int)


# Metrics
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))


Confusion Matrix:
[[3373   30]
 [  52  121]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      3403
           1       0.80      0.70      0.75       173

    accuracy                           0.98      3576
   macro avg       0.89      0.85      0.87      3576
weighted avg       0.98      0.98      0.98      3576

ROC-AUC: 0.9835201513795206


PHASE 7: Model Saving (for Web App)

In [76]:
import joblib

joblib.dump(model, "job_fraud_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")


['tfidf_vectorizer.pkl']