In [15]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
import nltk
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import joblib


In [16]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jaswanth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
df = pd.read_csv("fake_job_postings.csv")
df = df[['title', 'location', 'department', 'salary_range', 'description', 'requirements', 'fraudulent']]
df.fillna('', inplace=True)

# Combine relevant text fields
df['text'] = df['title'] + ' ' + df['location'] + ' ' + df['department'] + ' ' + df['salary_range'] + ' ' + df['description'] + ' ' + df['requirements']


In [None]:
# 🧹 Step 5: Clean Text Function
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)  # remove symbols
    text = re.sub(r'\s+', ' ', text)  # remove extra spaces
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

df['clean_text'] = df['text'].apply(clean_text)

# ✅ Step 6: Manually add suspicious fake examples
extra_fake = pd.DataFrame({
    'clean_text': [
        'earn money quickly no skills required register by paying 200',
        'work from home and earn 50000 weekly limited openings apply fast',
        'click the link to register and pay to start job from tomorrow',
        'form filling jobs with daily payment no interview',
        'get rich quick scheme work online no qualification'
    ],
    'fraudulent': [1, 1, 1, 1, 1]
})

df = pd.concat([df[['clean_text', 'fraudulent']], extra_fake], ignore_index=True)

# 🧪 Step 7: Features and Labels
X = df['clean_text']
y = df['fraudulent']

# 📊 Step 8: TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_vec = vectorizer.fit_transform(X)

# 🔀 Step 9: Split Dataset
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

# 🚀 Step 10: Train Model (Random Forest)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 📈 Step 11: Evaluate
y_pred = model.predict(X_test)
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))

# 💾 Step 12: Save Model and Vectorizer
joblib.dump(model, 'fake_job_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')






In [None]:

def predict_job_post(text):
    # suspicious words
    suspicious_keywords = ['₹', 'earn', 'form filling', 'registration fee', 'click here', 'no experience', 'work from home', 'pay to start']

    # Clean input
    cleaned = ' '.join(text.lower().split())

    # Rule-based check
    if any(keyword in cleaned for keyword in suspicious_keywords):
        print("⚠️ Suspicious keywords found: High chance of being FAKE")

    # ML Prediction
    vectorized = vectorizer.transform([cleaned])
    prediction = model.predict(vectorized)[0]
    print("🔴 FAKE Job Posting" if prediction == 1 else "🟢 Real Job Posting")

In [21]:
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))



✅ Accuracy: 0.9751188146491473

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99      3400
           1       0.99      0.50      0.67       177

    accuracy                           0.98      3577
   macro avg       0.98      0.75      0.83      3577
weighted avg       0.98      0.98      0.97      3577



In [22]:
def predict_job_post(text):
    # Optional: basic rule-based check
    suspicious_keywords = ['₹', 'earn', 'form filling', 'registration fee', 'click here', 'no experience', 'work from home', 'pay to start']

    # Clean input
    cleaned = ' '.join(text.lower().split())

    if any(keyword in cleaned for keyword in suspicious_keywords):
        print("⚠️ Suspicious keywords found: High chance of being FAKE")

    # Vectorize and Predict
    vectorized = vectorizer.transform([cleaned])
    prediction = model.predict(vectorized)[0]
    
    # Output result
    if prediction == 1:
        print("🔴 FAKE Job Posting")
    else:
        print("🟢 Real Job Posting")

# 🔎 Example test:
predict_job_post("Earn ₹1 lakh/month working from home. No experience required. Just pay ₹200 to register.")



⚠️ Suspicious keywords found: High chance of being FAKE
🔴 FAKE Job Posting
