# In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

from google.colab import drive

# The CSV is read from a Google Drive path, so Drive has to be mounted first.
drive.mount('/content/drive')

# Load the raw job-postings dataset into a DataFrame.
dataset_path = "/content/drive/MyDrive/Colab Notebooks/On-Going Projects/Fake vs Real Job/fake_job_postings.csv"
df = pd.read_csv(dataset_path)

# Report the (rows, columns) shape, then preview the first rows; the bare
# expression is only rendered when it ends a notebook cell.
print(df.shape)
df.head()


# Keep only the four free-text fields and the `fraudulent` label column.
use_cols = ["title", "description", "requirements", "company_profile", "fraudulent"]

df = df.loc[:, use_cols]

# Print dtypes and non-null counts for the retained columns.
df.info()


# Free-text columns whose missing values are replaced by the empty string.
text_cols = ["title", "description", "requirements", "company_profile"]

# One fill value per text column; columns not listed are left untouched.
df = df.fillna({name: "" for name in text_cols})


# Remaining null count per column.
df.isnull().sum()


def combine_text(row):
    """Merge a posting's text fields into one labelled string.

    `row` is indexed by the keys "title", "description", "requirements" and
    "company_profile"; each value must be a string. Every field is prefixed
    with an upper-case tag and the tagged fields are joined by single spaces.
    """
    parts = [
        "JOB_TITLE: " + row["title"],
        "DESCRIPTION: " + row["description"],
        "REQUIREMENTS: " + row["requirements"],
        "COMPANY_PROFILE: " + row["company_profile"],
    ]
    return " ".join(parts)

# Build one labelled text string per posting (row-wise apply).
df["combined_text"] = df.apply(combine_text, axis=1)


# The combined text is the only model input; `fraudulent` is the target.
X, y = df["combined_text"], df["fraudulent"]

# Label distribution as proportions rather than raw counts.
y.value_counts(normalize=True)


# Hold out 20% of the postings for evaluation. The split is stratified on the
# label and uses a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


# TF-IDF over unigrams and bigrams with English stop words removed; terms
# appearing in fewer than 3 documents are dropped and the vocabulary is
# capped at 30,000 features.
tfidf = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=3,
    max_features=30000,
)

# Fit on the training text only; the test text is transformed with the
# already-fitted vectorizer.
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)


# Logistic regression with class weights balanced by label frequency and a
# raised iteration cap.
# NOTE(review): n_jobs presumably has no effect for a two-class target —
# confirm against the installed scikit-learn version.
model = LogisticRegression(class_weight="balanced", max_iter=1000, n_jobs=-1)

model.fit(X_train_vec, y_train)


# Column 1 of predict_proba is the second entry of model.classes_
# (presumably label 1, i.e. fraudulent — confirm).
y_prob = model.predict_proba(X_test_vec)[:, 1]
y_pred = model.predict(X_test_vec)

# Per-class precision / recall / F1 on the held-out split.
report = classification_report(y_test, y_pred)
print(report)


# ROC-AUC is computed from the probabilities, not the predicted labels.
auc = roc_auc_score(y_test, y_prob)
print("ROC-AUC:", auc)
confusion_matrix(y_test, y_pred)


import joblib

# Persist the fitted classifier and the fitted vectorizer as two separate
# files in the current working directory.
joblib.dump(value=model, filename="job_shield_model.joblib")
joblib.dump(value=tfidf, filename="job_shield_vectorizer.joblib")








import joblib

# Restore the classifier and the vectorizer from the two joblib files in the
# current working directory.
artifact_files = ("job_shield_model.joblib", "job_shield_vectorizer.joblib")
model, vectorizer = map(joblib.load, artifact_files)

print("Model and vectorizer loaded successfully")


import pandas as pd

# Read the job-postings CSV from the mounted Drive path.
df = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/On-Going Projects/Fake vs Real Job/fake_job_postings.csv"
)

# Keep the four text fields followed by the label column.
feature_cols = ["title", "description", "requirements", "company_profile"]
df = df[feature_cols + ["fraudulent"]]

# Replace missing text with the empty string; the label column is untouched.
df = df.fillna({name: "" for name in feature_cols})


def combine_text(row):
    """Return the posting's text fields as one tagged, space-separated string.

    `row` must provide string values under "title", "description",
    "requirements" and "company_profile". Each value is prefixed with its
    upper-case tag; the tagged pieces are joined with a single space.
    """
    tagged_fields = (
        ("JOB_TITLE: ", "title"),
        ("DESCRIPTION: ", "description"),
        ("REQUIREMENTS: ", "requirements"),
        ("COMPANY_PROFILE: ", "company_profile"),
    )
    return " ".join(tag + row[key] for tag, key in tagged_fields)

# Build one tagged text string per posting.
df["combined_text"] = df.apply(combine_text, axis=1)


# Spot-check: score five postings labelled 0, drawn with a fixed seed.
# NOTE(review): rows come from the full CSV, which presumably includes
# postings the saved model was trained on — treat this as a sanity check,
# not a held-out evaluation.
is_real = df["fraudulent"] == 0
real_samples = df[is_real].sample(n=5, random_state=42)

X_real = vectorizer.transform(real_samples["combined_text"])
real_probs = model.predict_proba(X_real)[:, 1]

for rank, prob in enumerate(real_probs, start=1):
    print(f"Real Job {rank} → Fake probability: {prob:.4f}")


# Spot-check: score five postings labelled 1, drawn with a fixed seed.
# NOTE(review): rows come from the full dataframe, which presumably includes
# postings the model was trained on — a sanity check, not a held-out test.
is_fake = df["fraudulent"] == 1
fake_samples = df[is_fake].sample(n=5, random_state=42)

X_fake = vectorizer.transform(fake_samples["combined_text"])
fake_probs = model.predict_proba(X_fake)[:, 1]

for rank, prob in enumerate(fake_probs, start=1):
    print(f"Fake Job {rank} → Fake probability: {prob:.4f}")


# Score every posting in the dataframe and store the probability from
# column 1 of predict_proba.
# NOTE(review): this presumably covers training rows as well — confirm
# before reading the scores as generalisation performance.
all_features = vectorizer.transform(df["combined_text"])
df["fake_prob"] = model.predict_proba(all_features)[:, 1]

# Jobs that confuse the model: probability strictly between 0.45 and 0.55.
near_threshold = df["fake_prob"].gt(0.45) & df["fake_prob"].lt(0.55)
borderline = df[near_threshold]

borderline[["title", "fake_prob", "fraudulent"]].head(10)


# Look at the first sampled fraudulent posting: its title, the stored
# probability and the true label.
idx = fake_samples.index[0]

for label, column in (
    ("TITLE:", "title"),
    ("\nPREDICTED FAKE PROB:", "fake_prob"),
    ("\nACTUAL LABEL:", "fraudulent"),
):
    print(label, df.loc[idx, column])


