In [4]:
import pandas as pd
from google.colab import drive

# Mount Google Drive so the dataset stored there is readable from this runtime.
drive.mount('/content/drive')

# Read the raw job-postings table into memory.
file_path = '/content/drive/My Drive/Fake_Job_Posting Project/fake_job_postings.csv'
df = pd.read_csv(file_path)

# A notebook cell only renders its final expression, so the preview has to be
# displayed explicitly; `display` is the built-in that Jupyter/Colab kernels
# inject into the notebook namespace.
display(df.head())
df.shape


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


(17880, 18)

In [5]:

# Share of missing values in each column, expressed as a percentage.
missing_percentage = df.isna().mean().mul(100)
missing_percentage

Unnamed: 0,0
job_id,0.0
title,0.0
location,1.935123
department,64.580537
salary_range,83.959732
company_profile,18.501119
description,0.005593
requirements,15.0783
benefits,40.33557
telecommuting,0.0


In [7]:
# Text columns whose gaps are replaced by the literal string "None".
none_fill_cols = ['department', 'company_profile', 'requirements', 'benefits', 'description']

# Columns whose gaps are replaced by the literal string "Unknown".
unknown_fill_cols = [
    'location', 'salary_range', 'employment_type', 'required_experience',
    'required_education', 'industry', 'function',
]

# Fill each group of columns in one vectorised assignment.
df[none_fill_cols] = df[none_fill_cols].fillna("None")
df[unknown_fill_cols] = df[unknown_fill_cols].fillna("Unknown")

# 0/1 indicator columns derived from the sentinel values written above.
# NOTE(review): a posting whose text is literally "None"/"Unknown" is flagged
# the same way as a genuinely missing value.
df['benefits_missing'] = (df['benefits'] == "None").astype(int)
df['salary_range_missing'] = (df['salary_range'] == "Unknown").astype(int)

# Sanity check: remaining null count per column.
print(df.isnull().sum())

job_id                  0
title                   0
location                0
department              0
salary_range            0
company_profile         0
description             0
requirements            0
benefits                0
telecommuting           0
has_company_logo        0
has_questions           0
employment_type         0
required_experience     0
required_education      0
industry                0
function                0
fraudulent              0
benefits_missing        0
salary_range_missing    0
dtype: int64


In [8]:
# Number of rows per value of the target column.
fraud_counts = df["fraudulent"].value_counts()
print(fraud_counts)

fraudulent
0    17014
1      866
Name: count, dtype: int64


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE


# Text features: TF-IDF over the raw description column, with the vocabulary
# capped at 500 terms. (Swap in a pre-cleaned text column here if one exists.)
# NOTE(review): the vectorizer is fitted on every row of df, i.e. before the
# train/test split that happens later in the notebook, so the learned
# vocabulary and IDF weights also see the rows that end up in the test set.
tfidf = TfidfVectorizer(max_features=500)
X_text = tfidf.fit_transform(df["description"])

# Dense, labelled copy of the sparse TF-IDF matrix: one column per term.
term_names = tfidf.get_feature_names_out()
X_text_df = pd.DataFrame(X_text.toarray(), columns=term_names)

# Missing-value indicator flags used as extra numeric inputs.
numeric_features = df[["benefits_missing", "salary_range_missing"]]

# Design matrix = term columns followed by the flags. The flags' index is
# reset so both frames line up row-by-row on a fresh 0..n-1 index.
flag_columns = numeric_features.reset_index(drop=True)
X = pd.concat([X_text_df, flag_columns], axis=1)

# Target labels.
y = df["fraudulent"]


In [10]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both partitions; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report partition sizes and the class counts inside the training partition.
print("Training shape:", X_train.shape)
print("Test shape:", X_test.shape)

print(y_train.value_counts())

Training shape: (14304, 502)
Test shape: (3576, 502)
fraudulent
0    13611
1      693
Name: count, dtype: int64


In [11]:
# Oversample with SMOTE using a fixed seed. Only the training partition is
# passed in, so the held-out test rows are left as they are.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled

# Class counts after resampling.
print(y_train_resampled.value_counts())


fraudulent
0    13611
1    13611
Name: count, dtype: int64


In [12]:
# Random forest with 100 trees and a fixed seed, trained on the resampled
# training data. The fit call is left as the cell's final expression.
rf_params = {"n_estimators": 100, "random_state": 42}
clf = RandomForestClassifier(**rf_params)

clf.fit(X_train_resampled, y_train_resampled)

In [13]:
# Hard class labels for the held-out rows, using the model's own decision rule.
y_pred = clf.predict(X_test)

# Score per row taken from column index 1 of predict_proba
# (presumably the fraudulent == 1 class, given the 0/1 target labels).
fraud_column = 1
y_prob = clf.predict_proba(X_test)[:, fraud_column]


In [14]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Compute the three summaries first, then print them in the same order:
# per-class report, confusion matrix, and ROC-AUC from the probability scores.
baseline_report = classification_report(y_test, y_pred)
baseline_cm = confusion_matrix(y_test, y_pred)
baseline_auc = roc_auc_score(y_test, y_prob)

print(baseline_report)
print(baseline_cm)
print("ROC-AUC:", baseline_auc)


              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3403
           1       0.95      0.60      0.73       173

    accuracy                           0.98      3576
   macro avg       0.96      0.80      0.86      3576
weighted avg       0.98      0.98      0.98      3576

[[3397    6]
 [  69  104]]
ROC-AUC: 0.9554023226700684


In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Reuse the forest that was already fitted earlier in the notebook. Re-fitting
# a RandomForestClassifier with the same hyper-parameters, the same seed and
# the same resampled training data reproduces the same model, so a second fit
# only costs time.
y_prob = clf.predict_proba(X_test)[:, 1]

# Decision threshold applied to the probability scores instead of the
# classifier's built-in decision rule.
# NOTE(review): this threshold is evaluated on the same test partition it is
# reported on; a separate validation split would give a less optimistic read.
threshold = 0.3

print(f"\n== Evaluation at threshold {threshold} ==")
y_pred_new = (y_prob >= threshold).astype(int)

# ROC-AUC is computed from the scores, so it does not depend on the threshold.
print(classification_report(y_test, y_pred_new))
print(confusion_matrix(y_test, y_pred_new))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))



== Evaluation at threshold 0.3 ==
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      3403
           1       0.70      0.73      0.71       173

    accuracy                           0.97      3576
   macro avg       0.84      0.86      0.85      3576
weighted avg       0.97      0.97      0.97      3576

[[3348   55]
 [  47  126]]
ROC-AUC: 0.9554023226700684


In [None]:
# This cell is disabled: its whole body is a single triple-quoted string, so
# none of the statements inside it run and the string is just the cell's value.
# The text inside would mount Drive and write `clf` and `tfidf` to two .pkl
# files with joblib; the loading cell further down reads those same two paths,
# so the files must already exist on Drive for that cell to succeed.
''' # To save the model
from google.colab import drive
import os
import joblib


drive.mount('/content/drive')

joblib.dump(clf, "/content/drive/My Drive/Fake_Job_Posting Project/fake_job_detector_model.pkl")
joblib.dump(tfidf, "/content/drive/My Drive/Fake_Job_Posting Project/tfidf_vectorizer.pkl")
'''

Mounted at /content/drive


['/content/drive/My Drive/Fake_Job_Posting Project/tfidf_vectorizer.pkl']

In [20]:
from google.colab import drive
import joblib

# Make sure Drive is mounted before reading the saved artifacts.
drive.mount('/content/drive')

# Locations of the persisted classifier and its TF-IDF vectorizer.
model_path = "/content/drive/My Drive/Fake_Job_Posting Project/fake_job_detector_model.pkl"
vectorizer_path = "/content/drive/My Drive/Fake_Job_Posting Project/tfidf_vectorizer.pkl"

# Rebind `clf` and `tfidf` to the objects stored on disk; whatever those names
# held in memory before this cell is replaced.
# NOTE(review): joblib files are pickle-based, so only load files you trust.
clf = joblib.load(model_path)
tfidf = joblib.load(vectorizer_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
import pandas as pd

def predict_fraudulent_job(description, benefits_missing, salary_missing, model, vectorizer, threshold=0.3):
    """Score a single job posting and label it as fake or genuine.

    Parameters
    ----------
    description : str or None
        Free-text job description. ``None``, NaN/NA, other falsy values, and
        empty or whitespace-only strings are replaced by the placeholder
        ``"None"`` (the same sentinel this notebook writes into missing
        descriptions). Any other non-string value is converted with ``str``.
    benefits_missing, salary_missing : int or bool
        Indicator flags for a missing benefits / salary-range field. They are
        cast to ``int`` before being handed to the model.
    model : object
        Fitted classifier exposing ``predict_proba``.
    vectorizer : object
        Fitted text vectorizer exposing ``transform`` and
        ``get_feature_names_out``.
    threshold : float, default 0.3
        Minimum positive-class probability for a posting to be called fake.

    Returns
    -------
    dict
        ``"prediction"`` (0 or 1), ``"probability"`` (plain ``float`` rounded
        to 4 decimals) and ``"verdict"`` (``"Fake"`` or ``"Genuine"``).
    """
    # Normalise the text input. A bare truthiness test is not enough here:
    # float NaN is truthy and would reach the vectorizer as a non-string.
    if not isinstance(description, str):
        is_na = description is None or (
            pd.api.types.is_scalar(description) and pd.isna(description)
        )
        description = "" if is_na or not description else str(description)
    if not description.strip():
        description = "None"

    # Vectorise the text into one labelled row.
    text_vec = vectorizer.transform([description])
    text_df = pd.DataFrame(text_vec.toarray(), columns=vectorizer.get_feature_names_out())

    # Append the two indicator flags after the text columns.
    extra_features = pd.DataFrame(
        [[int(benefits_missing), int(salary_missing)]],
        columns=["benefits_missing", "salary_range_missing"],
    )
    full_input = pd.concat([text_df, extra_features], axis=1)

    # Column 1 of predict_proba is used as the "fake" score; float() turns a
    # numpy scalar into a plain Python float for the returned dict.
    prob = float(model.predict_proba(full_input)[0][1])
    prediction = int(prob >= threshold)

    return {
        "prediction": prediction,
        "probability": round(prob, 4),
        "verdict": "Fake" if prediction == 1 else "Genuine"
    }


In [22]:
# Turn the stored probability scores into 0/1 labels at the 0.3 cut-off.
y_pred_final = (y_prob >= 0.3).astype(int)

# Test-set features with the true label, predicted label and score appended.
# Both the features and the labels get a fresh 0..n-1 index so the rows pair
# up by position.
df_results = X_test.reset_index(drop=True).assign(
    actual=y_test.reset_index(drop=True),
    predicted=y_pred_final,
    probability=y_prob,
)


df_results

Unnamed: 0,12,200,30,ability,able,about,account,accounts,achieve,across,...,written,year,years,you,your,benefits_missing,salary_range_missing,actual,predicted,probability
0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.220292,0.083201,0.053888,1,1,0,0,0.110000
1,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.084106,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0,1,0,0,0.000000
2,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.043280,0.056063,1,1,0,0,0.060000
3,0.0,0.0,0.000000,0.070371,0.071117,0.000000,0.085505,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0,1,0,0,0.030000
4,0.0,0.0,0.000000,0.074498,0.075288,0.127824,0.000000,0.0,0.094594,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0,1,0,0,0.100000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3571,0.0,0.0,0.067168,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.055287,0.000000,0.000000,0.000000,0,1,0,1,0.602361
3572,0.0,0.0,0.000000,0.000000,0.163353,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,1,1,0,0,0.000000
3573,0.0,0.0,0.000000,0.000000,0.000000,0.054239,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.241705,0.000000,1,1,0,0,0.030000
3574,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.101292,0.131211,1,1,0,0,0.180000


In [None]:
# This cell is disabled: its whole body is a single triple-quoted string, so
# none of the statements inside it run. The text inside would build a compact
# per-posting summary (description, the two missing flags, actual label,
# thresholded prediction, probability), write it to
# "job_prediction_summary.csv" and preview the first rows.
# NOTE(review): the description lookup uses df.loc[X_test.index, ...], which
# presumably relies on df keeping its default 0..n-1 index - confirm.
'''
# summary DataFrame
df_summary = pd.DataFrame({
    "description": df.loc[X_test.index, "description"].values,
    "benefits_missing": X_test["benefits_missing"].values,
    "salary_range_missing": X_test["salary_range_missing"].values,
    "actual": y_test.values,
    "predicted": y_pred_new,
    "probability": y_prob
})

df_summary.to_csv("job_prediction_summary.csv", index=False)
df_summary.head()
'''

Unnamed: 0,description,benefits_missing,salary_range_missing,actual,predicted,probability
0,"Our client, located in Urban, IL, is looking f...",1,1,0,0,0.11
1,Other agencies may call this job “Project Mana...,0,1,0,0,0.0
2,Squiz is an Australian owned and now multinati...,1,1,0,0,0.06
3,The Regional Sales Director SA will help deriv...,0,1,0,0,0.03
4,About the CompanyThis is an amazing job opport...,0,1,0,0,0.1
