In [4]:
#  Data Preparation Libraries
import pandas as pd               # For data loading and manipulation (CSV, DataFrame)
import numpy as np                # For numerical operations and arrays
import re                         # For regular expressions (text cleaning)
import string                     # For text cleaning (punctuation removal)
import warnings
warnings.filterwarnings('ignore')

# Text Preprocessing and Feature Extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer  # For text vectorization
from sklearn.preprocessing import LabelEncoder, OneHotEncoder                 # For encoding categorical variables
from sklearn.model_selection import train_test_split                          # For splitting data into train/test
from sklearn.impute import SimpleImputer                                      # For handling missing values
from sklearn.utils import shuffle                                             # For shuffling the dataset

# Natural Language Processing (for advanced text cleaning)
import nltk                             # Natural Language Toolkit (tokenization, stopwords)
from nltk.corpus import stopwords       # For removing stopwords
from nltk.stem import PorterStemmer     # For stemming words
from nltk.tokenize import word_tokenize # For tokenization

# Evaluation and Visualization 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, classification_report
import matplotlib.pyplot as plt         # For plotting loss curves, confusion matrix, etc.
import seaborn as sns                   # For better visualizations

print("✅ All libraries imported successfully!")
print("📊 Ready to work with the real Fake Job Postings dataset!")

✅ All libraries imported successfully!
📊 Ready to work with the real Fake Job Postings dataset!


#                                                          LOADING THE DATASET 

In [68]:
def load_dataset(file_path='fake_job_postings.csv'): 
    try:
        df=pd.read_csv(file_path)
        print("✅ Real dataset loaded successfully!")
        
        print(f"📊 Dataset shape: {df.shape}")
        
        print(f"\n📋 All columns in the dataset:")
        for i,col in enumerate (df.columns, 1):
            print(f"{i:2d}.{col}")

        print(f"\n🎯 Target variable distribution:")
        if 'fraudulent' in df.columns:
            target_dist=df['fraudulent'].value_counts()
        print(f"  Real Jobs(0): {target_dist[0]:,} ({target_dist[0]/len(df)*100:.1f}%)")
        print(f"  Fake Jobs(1): {target_dist[1]:,} ({target_dist[1]/len(df)*100:.1f}%)")


        print(f"\n❓ Missing values analysis: ")
        
    
    except FileNotFoundError:
        print("❌ Error: Dataset file not found!")
        print("📥 Please download the dataset from:")
        print("   https://www.kaggle.com/datasets/shivamb/real-or-fake-fake-jobposting-prediction")
        print("   Save it as 'fake_job_postings.csv' in your working directory")
        return None
        
    except Exception as e:
        print(f"❌ Error loading dataset: {str(e)}")
        return None

df=load_dataset()

✅ Real dataset loaded successfully!
📊 Dataset shape: (17880, 18)

📋 All columns in the dataset:
 1.job_id
 2.title
 3.location
 4.department
 5.salary_range
 6.company_profile
 7.description
 8.requirements
 9.benefits
10.telecommuting
11.has_company_logo
12.has_questions
13.employment_type
14.required_experience
15.required_education
16.industry
17.function
18.fraudulent

🎯 Target variable distribution:
  Real Jobs(0): 17,014 (95.2%)
  Fake Jobs(1): 866 (4.8%)

❓ Missing values analysis: 
