In [None]:
!pip install -q pandas numpy scikit-learn matplotlib seaborn joblib

In [None]:
# Download to /content/fake_job_postings.csv.
# NOTE(review): the URL looks like the Kaggle dataset landing page rather than
# a direct file link, so this probably saves an HTML page under the .csv name.
# Confirm; presumably the manual upload + unzip cells supply the real CSV.
!wget -O /content/fake_job_postings.csv "https://www.kaggle.com/datasets/srisaisuhassanisetty/fake-job-postings"

In [None]:
from google.colab import files
# files.upload() returns a {filename: bytes} mapping. Leaving the call as the
# cell's last expression would echo the raw file contents into the notebook
# output, so keep the mapping in a variable and show only the file names.
uploaded = files.upload()
print("Uploaded:", list(uploaded))

In [None]:
!unzip "archive (1).zip"

In [None]:
import pandas as pd
# Read the postings CSV from the Colab workspace, report its dimensions,
# and preview the first rows.
df = pd.read_csv('/content/fake_job_postings.csv')
dataset_shape = df.shape
print("Shape of dataset:", dataset_shape)
df.head()

In [None]:
# Just for checking which columns the loaded frame contains.
df.columns

In [None]:
# Free-text fields that are merged into one document per posting, in order.
TEXT_COLUMNS = ['title', 'company_profile', 'description', 'requirements', 'benefits']


def _text_column(frame, name):
    """Return column `name` of `frame` as strings, with missing values as ''.

    If the column does not exist, an all-empty Series on the same index is
    returned. The earlier `df.get(name, '')` fallback produced a plain str,
    which has no `.fillna`, so a missing column raised AttributeError instead
    of being treated as empty text.
    """
    if name in frame.columns:
        return frame[name].fillna('').astype(str)
    return pd.Series('', index=frame.index, dtype=object)


# Concatenate the fields with single spaces between them.
_parts = [_text_column(df, column) for column in TEXT_COLUMNS]
_combined = _parts[0]
for _part in _parts[1:]:
    _combined = _combined + ' ' + _part

df['text'] = _combined
df['text'].head()

In [None]:
import re
def clean_text(text):
    """Normalise raw posting text to lowercase, letters-only, single-spaced form.

    Non-string input is converted with str() first. URLs ("http" up to the
    next whitespace) and "<...>" tags are deleted, every character that is not
    a letter or a space becomes a space, and whitespace runs collapse to a
    single space with both ends trimmed.
    """
    cleaned = text if isinstance(text, str) else str(text)
    cleaned = cleaned.lower()
    # Order matters: URLs and tags are removed before punctuation is blanked,
    # otherwise their delimiters would already be gone.
    for pattern, replacement in (
        (r'http\S+', ''),
        (r'<.*?>', ''),
        (r'[^a-zA-Z ]', ' '),
        (r'\s+', ' '),
    ):
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Run the cleaner over every combined posting, then preview the result so the
# difference from the raw text is visible.
df['text'] = df['text'].map(clean_text)
df['text'].head()

In [None]:
# Model input is the cleaned posting text; the target is the `fraudulent` column.
X, y = df['text'], df['fraudulent']

# Show how many postings carry each label value.
label_counts = y.value_counts()
print(label_counts)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the postings for testing. The split is stratified on the
# label and seeded so it is the same on every run.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Train size:", len(X_train))
print("Test size:", len(X_test))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams with English stop words removed, capped at
# 5000 features.
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=5000)

# The vectorizer is fitted on the training text only; the test text is
# transformed with that already-fitted vectorizer.
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

(X_train_vec.shape, X_test_vec.shape)

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM on the TF-IDF features.
# - class_weight='balanced' weights classes inversely to their frequency in
#   y_train, compensating for the label imbalance.
# - random_state pins the solver's internal shuffling so that retraining
#   reproduces the same model (it was previously left unseeded).
svm_model = LinearSVC(max_iter=5000, class_weight='balanced', random_state=42)
svm_model.fit(X_train_vec, y_train)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Score the held-out split and print the per-class metrics.
y_pred = svm_model.predict(X_test_vec)
report = classification_report(y_test, y_pred)
print(report)

# Confusion matrix drawn as an annotated heatmap on an explicit Axes.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d', ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
def predict_job(text):
    """Classify one job-posting text as "Fake Job" or "Real Job".

    The text is cleaned with `clean_text`, vectorised with the notebook's
    fitted `tfidf`, and scored by `svm_model`; a predicted label of 1 is
    reported as "Fake Job", anything else as "Real Job".
    """
    features = tfidf.transform([clean_text(text)])
    label = svm_model.predict(features)[0]
    if label == 1:
        return "Fake Job"
    return "Real Job"

# Smoke test with an obviously suspicious posting. The rupee sign was stored
# as mis-decoded UTF-8 ("â‚¹") and is restored here; clean_text strips
# non-letters, so the prediction itself is unaffected.
print(predict_job("Earn ₹5000/day from home, no experience needed, click now"))

In [None]:
import joblib, os

# Persist the trained classifier and the fitted vectorizer under model_files/.
os.makedirs("model_files", exist_ok=True)

for artifact, target in (
    (svm_model, "model_files/svm_model.pkl"),
    (tfidf, "model_files/tfidf.pkl"),
):
    saved = joblib.dump(artifact, target)

# Display the result of the final dump as the cell output.
saved

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive.
drive.mount("/content/drive")

# Create the Drive target folder if it does not exist yet (-p), then copy
# everything from model_files/ into it.
!mkdir -p /content/drive/MyDrive/fake_job_model
!cp model_files/* /content/drive/MyDrive/fake_job_model/

In [None]:
# Copy the saved model and vectorizer from Drive back into /content.
# NOTE(review): requires Drive to be mounted at /content/drive already.
!cp /content/drive/MyDrive/fake_job_model/svm_model.pkl /content/
!cp /content/drive/MyDrive/fake_job_model/tfidf.pkl /content/

In [None]:
%%writefile predict.py
import joblib, re

def clean_text(text):
    """Normalise posting text to lowercase, letters-only, single-spaced form.

    Args:
        text: The raw posting text. Non-string values (None, numbers, NaN)
            are converted with str() rather than failing on `.lower()`, which
            keeps this in step with the cleaning used when the model was
            trained.

    Returns:
        The text lowercased, with URLs and "<...>" tags removed, every
        non-letter replaced by a space, and whitespace collapsed and trimmed.
    """
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    text = re.sub(r"http\S+", "", text)        # drop URLs
    text = re.sub(r"<.*?>", "", text)          # drop <...> tags
    text = re.sub(r"[^a-zA-Z ]", " ", text)    # blank out everything but letters/spaces
    text = re.sub(r"\s+", " ", text).strip()   # collapse whitespace
    return text

# NOTE: the .pkl files are loaded from local disk; by default they are looked
# up in the current working directory.
def load_model(model_path="svm_model.pkl", vectorizer_path="tfidf.pkl"):
    """Load the trained classifier and its fitted TF-IDF vectorizer.

    Args:
        model_path: Path to the saved classifier. Defaults to "svm_model.pkl"
            in the current working directory.
        vectorizer_path: Path to the saved vectorizer. Defaults to "tfidf.pkl"
            in the current working directory.

    Returns:
        A `(svm_model, tfidf)` tuple.

    Security:
        joblib.load is pickle-based and can execute arbitrary code while
        loading. Only load files that come from a trusted source.
    """
    svm_model = joblib.load(model_path)
    tfidf = joblib.load(vectorizer_path)
    return svm_model, tfidf

def predict(text, model, vectorizer):
    """Classify one posting text as "Fake Job" or "Real Job".

    The text is cleaned with `clean_text`, turned into features with
    `vectorizer.transform`, and scored with `model.predict`; a predicted label
    of 1 is reported as "Fake Job", anything else as "Real Job".
    """
    features = vectorizer.transform([clean_text(text)])
    label = model.predict(features)[0]
    if label == 1:
        return "Fake Job"
    return "Real Job"

In [None]:
from google.colab import files
# Send the predict.py file from the Colab workspace to the browser as a download.
files.download("predict.py")