# Fake Job Posting Prediction Pipeline

This notebook demonstrates an end-to-end pipeline for detecting fake job postings using HuggingFace Transformers and AutoML techniques.

## 1. Setup and Dependencies

First, let's install the necessary packages.

In [None]:
!pip install transformers datasets pandas scikit-learn torch kagglehub tpot[sklearnex]



## 2. Data Ingestion

Download the job postings dataset from Kaggle and load it into a pandas DataFrame.

In [None]:
import os

import kagglehub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tpot
from kagglehub import KaggleDatasetAdapter
from sklearn.model_selection import train_test_split

# Load the dataset. Sources are tried in order until one yields a DataFrame:
#   1. kagglehub's pandas adapter,
#   2. a raw download of the dataset, parsing the CSV with several candidate encodings,
#   3. a copy of the CSV in the current working directory.
DATASET_HANDLE = "shivamb/real-or-fake-fake-jobposting-prediction"
DATASET_FILE = "fake_job_postings.csv"

df = None
try:
  df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,  # alternative adapter: KaggleDatasetAdapter.HUGGING_FACE
    DATASET_HANDLE,
    DATASET_FILE,
  )
except Exception as e:
  print(f"Error loading dataset: {e}")

if df is None:
  try:
    path = kagglehub.dataset_download(DATASET_HANDLE)
  except Exception as e:
    # A failed download must not abort the cell: the local-file fallback below still applies.
    path = None
    print(f"Error downloading dataset: {e}")

  if path is not None:
    # os.path.join picks the right separator; a hard-coded "\\" only works on Windows.
    csv_path = os.path.join(path, DATASET_FILE)
    # Try different encodings
    encodings = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']
    for encoding in encodings:
      try:
        df = pd.read_csv(csv_path, encoding=encoding)
        print(f"Successfully loaded with {encoding} encoding")
        break
      except UnicodeDecodeError:
        # Wrong encoding guess: move on to the next candidate.
        continue
      except Exception as e:
        print(f"Error with {encoding} encoding: {e}")
        continue

# Last resort: a CSV sitting next to the notebook.
if df is None:
  df = pd.read_csv(DATASET_FILE)

df.columns







### Data Cleaning

In [7]:
# Handle missing values
print(f"Missing values before cleaning:\n{df.isnull().sum()}\n")

# Fill missing text fields with empty strings so the string concatenation below
# never propagates NaN into the consolidated text.
text_columns = ['title', 'location', 'department', 'company_profile', 'description', 
               'requirements', 'benefits', 'employment_type', 'required_experience', 
               'required_education', 'industry', 'function']
for col in text_columns:
    if col in df.columns:
        df[col] = df[col].fillna('')

# Remove duplicates.
# Rows are compared on their content only: if the frame carries a per-row identifier
# column (`job_id`), including it would make every row unique and no duplicate posting
# would ever be dropped. When no such column exists this is the same as comparing all columns.
df_size_before = len(df)
content_columns = [col for col in df.columns if col != 'job_id']
df = df.drop_duplicates(subset=content_columns)
print(f"Removed {df_size_before - len(df)} duplicate entries")

# Create a consolidated text field for modeling
df['text'] = df['title'] + ' ' + df['location'] + ' ' + df['description'] + ' ' + \
            df['requirements'] + ' ' + df['benefits'] + ' ' + df['company_profile']
# Basic text cleaning
import re
from bs4 import BeautifulSoup

def clean_text(text):
    """Return `text` as plain text: markup stripped, punctuation blanked, whitespace collapsed."""
    # Parse as HTML and keep only the text content of the document
    plain = BeautifulSoup(text, "html.parser").get_text()
    # Anything that is neither a word character nor whitespace is replaced by a space
    without_symbols = re.sub(r'[^\w\s]', ' ', plain)
    # Collapse each whitespace run into one space and trim both ends
    return re.sub(r'\s+', ' ', without_symbols).strip()

# Apply text cleaning to the first 100 rows as a smoke test (full cleaning in feature engineering).
# Rows are chosen by position: a label slice such as .loc[:100] also includes label 100,
# and after duplicates have been dropped it no longer maps to a fixed number of rows.
sample_index = df.index[:100]
df.loc[sample_index, 'text'] = df.loc[sample_index, 'text'].apply(clean_text)

# Convert target to numeric
df['fraudulent'] = df['fraudulent'].astype(int)

print(f"Missing values after cleaning:\n{df.isnull().sum()}\n")
print(f"Dataset shape: {df.shape}")
print(f"Fraud distribution:\n{df['fraudulent'].value_counts(normalize=True)}")



### Data Exploration

In [8]:
# Hold out 20% of the rows for testing; stratifying on the label keeps the
# fraud ratio the same in both partitions, and the fixed seed makes the split repeatable.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df['fraudulent'],
    random_state=42,
)

# Basic statistics and class distribution
train_class_share = df_train['fraudulent'].value_counts(normalize=True)
test_class_share = df_test['fraudulent'].value_counts(normalize=True)

print(f"Training set shape: {df_train.shape}")
print(f"Test set shape: {df_test.shape}")
print("\nClass distribution:")
print(f"Training: {train_class_share}")
print(f"Testing: {test_class_share}")

# Visualization of class distribution: absolute counts (left) and shares (right).
class_names = {0: 'Real', 1: 'Fake'}
fig, (ax_count, ax_share) = plt.subplots(1, 2, figsize=(12, 5))

df['fraudulent'].value_counts().plot(kind='bar', ax=ax_count,
                                     title='Count of Job Postings by Class')
ax_count.set_ylabel('Count')
ax_count.set_xlabel('Fraudulent (1) vs Real (0)')

# value_counts orders by frequency, so a fixed ['Real', 'Fake'] label list is only right
# while real postings are the majority. Sort by class value and derive each wedge label
# from the class it actually represents.
class_share = df['fraudulent'].value_counts(normalize=True).sort_index()
class_share.plot(kind='pie',
                 ax=ax_share,
                 autopct='%1.1f%%',
                 labels=[class_names.get(label, str(label)) for label in class_share.index],
                 title='Percentage of Real vs Fake Job Postings')
fig.tight_layout()
plt.show()

# Analysis of text length vs fraud: overlay the character-count distributions of both classes
df['text_length'] = df['text'].str.len()

real_mask = df['fraudulent'] == 0
fake_mask = df['fraudulent'] == 1
lengths_by_class = [df.loc[real_mask, 'text_length'], df.loc[fake_mask, 'text_length']]

plt.figure(figsize=(10, 6))
plt.hist(lengths_by_class, bins=50, alpha=0.7, label=['Real', 'Fake'])
plt.legend()
plt.title('Distribution of Text Length by Job Posting Type')
plt.xlabel('Text Length')
plt.ylabel('Count')
plt.show()

# Analyze job requirements by fraud status: mean character count of the
# `requirements` field within each class
plt.figure(figsize=(10, 6))
requirement_chars = df['requirements'].str.len()
req_length = requirement_chars.groupby(df['fraudulent']).mean()
req_length.plot(kind='bar', title='Average Length of Requirements by Job Type')
plt.xlabel('Fraudulent (1) vs Real (0)')
plt.ylabel('Avg Length (characters)')
# groupby sorts its keys, so bar 0 is class 0 and bar 1 is class 1
plt.xticks([0, 1], ['Real', 'Fake'])
plt.show()

# Top locations for real vs fake jobs (ten most frequent `location` values per class).
# This section only prints tables, so no figure is opened here: a plt.figure() call
# with nothing drawn on it just renders as an empty plot.
top_real_locs = df[df['fraudulent']==0]['location'].value_counts().head(10)
top_fake_locs = df[df['fraudulent']==1]['location'].value_counts().head(10)

print("Top locations for real job postings:")
print(top_real_locs)
print("\nTop locations for fake job postings:")
print(top_fake_locs)













In [None]:
"""
## NOTE: TODO & TEST this Implementation.

# Train the Model using TPOT
from tpot import TPOTClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

model = TPOTClassifier(verbosity=2, generations=5, population_size=20, random_state=42, config_dict='TPOT sparse')
model.fit(df_train['text'], df_train['fraudulent'])

print(f"Best pipeline accuracy: {accuracy_score(df_test['fraudulent'], model.predict(df_test['text']))}")
print(classification_report(df_test['fraudulent'], model.predict(df_test['text'])))
"""