In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
df = pd.read_csv('fake_job_postings.csv')
print(df)

       job_id                                              title  \
0           1                                   Marketing Intern   
1           2          Customer Service - Cloud Video Production   
2           3            Commissioning Machinery Assistant (CMA)   
3           4                  Account Executive - Washington DC   
4           5                                Bill Review Manager   
...       ...                                                ...   
17875   17876                   Account Director - Distribution    
17876   17877                                 Payroll Accountant   
17877   17878  Project Cost Control Staff Engineer - Cost Con...   
17878   17879                                   Graphic Designer   
17879   17880                         Web Application Developers   

                   location   department salary_range  \
0          US, NY, New York    Marketing          NaN   
1            NZ, , Auckland      Success          NaN   
2           

In [3]:
df = df.drop(columns = ["job_id","title","location","department","salary_range"])
print(df)

                                         company_profile  \
0      We're Food52, and we've created a groundbreaki...   
1      90 Seconds, the worlds Cloud Video Production ...   
2      Valor Services provides Workforce Solutions th...   
3      Our passion for improving quality of life thro...   
4      SpotSource Solutions LLC is a Global Human Cap...   
...                                                  ...   
17875  Vend is looking for some awesome new talent to...   
17876  WebLinc is the e-commerce platform and service...   
17877  We Provide Full Time Permanent Positions for m...   
17878                                                NaN   
17879  Vend is looking for some awesome new talent to...   

                                             description  \
0      Food52, a fast-growing, James Beard Award-winn...   
1      Organised - Focused - Vibrant - Awesome!Do you...   
2      Our client, located in Houston, is actively se...   
3      THE COMPANY: ESRI – Environmenta

In [4]:
df.isna().any()

company_profile         True
description             True
requirements            True
benefits                True
telecommuting          False
has_company_logo       False
has_questions          False
employment_type         True
required_experience     True
required_education      True
industry                True
function                True
fraudulent             False
dtype: bool

In [5]:
df['employment_type'] = df['employment_type'].fillna('Other')
df['required_experience'] = df['required_experience'].fillna('Not Applicable')
df["required_education"] = df["required_education"].fillna('Unspecified')
df['industry'] = df['industry'].fillna("Other")
df['function'] = df['function'].fillna("Other")

In [6]:
df.isna().any()

company_profile         True
description             True
requirements            True
benefits                True
telecommuting          False
has_company_logo       False
has_questions          False
employment_type        False
required_experience    False
required_education     False
industry               False
function               False
fraudulent             False
dtype: bool

In [7]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to C:\Users\Ritvik
[nltk_data]     Bhardwaj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
df.isna().any()

company_profile         True
description             True
requirements            True
benefits                True
telecommuting          False
has_company_logo       False
has_questions          False
employment_type        False
required_experience    False
required_education     False
industry               False
function               False
fraudulent             False
dtype: bool

In [9]:
df['function'].value_counts()

function
Other                     6780
Information Technology    1749
Sales                     1468
Engineering               1348
Customer Service          1229
Marketing                  830
Administrative             630
Design                     340
Health Care Provider       338
Education                  325
Management                 317
Business Development       228
Accounting/Auditing        212
Human Resources            205
Project Management         183
Finance                    172
Consulting                 144
Writing/Editing            132
Art/Creative               132
Production                 116
Product Management         114
Quality Assurance          111
Advertising                 90
Business Analyst            84
Data Analyst                82
Public Relations            76
Manufacturing               74
General Business            68
Research                    50
Legal                       47
Strategy/Planning           46
Training                    38

In [10]:
df['industry'].value_counts()

industry
Other                                  4903
Information Technology and Services    1734
Computer Software                      1376
Internet                               1062
Marketing and Advertising               828
                                       ... 
Shipbuilding                              1
Sporting Goods                            1
Museums and Institutions                  1
Wine and Spirits                          1
Ranching                                  1
Name: count, Length: 132, dtype: int64

In [11]:
df['employment_type'].value_counts()

employment_type
Full-time    11620
Other         3698
Contract      1524
Part-time      797
Temporary      241
Name: count, dtype: int64

In [12]:
df['required_experience'].value_counts()

required_experience
Not Applicable      8166
Mid-Senior level    3809
Entry level         2697
Associate           2297
Director             389
Internship           381
Executive            141
Name: count, dtype: int64

In [13]:
df['required_education'].value_counts()

required_education
Unspecified                          9502
Bachelor's Degree                    5145
High School or equivalent            2080
Master's Degree                       416
Associate Degree                      274
Certification                         170
Some College Coursework Completed     102
Professional                           74
Vocational                             49
Some High School Coursework            27
Doctorate                              26
Vocational - HS Diploma                 9
Vocational - Degree                     6
Name: count, dtype: int64

In [14]:
df['company_profile'] = df['company_profile'].fillna("Unknown")
df['description'] = df['description'].fillna("Unknown")
df['requirements'] = df['requirements'].fillna("Unknown")
df['benefits'] = df['benefits'].fillna("None")

In [15]:
df.isna().any()

company_profile        False
description            False
requirements           False
benefits               False
telecommuting          False
has_company_logo       False
has_questions          False
employment_type        False
required_experience    False
required_education     False
industry               False
function               False
fraudulent             False
dtype: bool

In [16]:
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


nltk.download('wordnet')


# Fill missing values
df['company_profile'] = df['company_profile'].fillna("")

# Text Preprocessing: Cleaning + Lemmatization
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
stop_words.discard('not')  # Keeping the "not" for sentiment preservation

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text).lower().split()
    text = [lemmatizer.lemmatize(word) for word in text if word not in stop_words]
    return " ".join(text)

df['company_profile'] = df['company_profile'].apply(preprocess_text)

# Tokenization
tokenizer_company_profile = Tokenizer(num_words=5000, oov_token="<OOV>")  # Keep top 5000 words
tokenizer_company_profile.fit_on_texts(df['company_profile'])
sequences = tokenizer_company_profile.texts_to_sequences(df['company_profile'])

lengths = [len(seq) for seq in sequences]  
max_length = int(np.percentile(lengths, 90))
X_text_company_profile = pad_sequences(sequences, maxlen=max_length, padding='post')
print("Optimal max_length:", max_length)

print("Tokenized and padded text shape:", X_text_company_profile.shape)


[nltk_data] Downloading package wordnet to C:\Users\Ritvik
[nltk_data]     Bhardwaj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Optimal max_length: 128
Tokenized and padded text shape: (17880, 128)


In [17]:
df['company_profile']

0        food created groundbreaking award winning cook...
1        second world cloud video production service se...
2        valor service provides workforce solution meet...
3        passion improving quality life geography heart...
4        spotsource solution llc global human capital m...
                               ...                        
17875    vend looking awesome new talent come join u wo...
17876    weblinc e commerce platform service provider f...
17877    provide full time permanent position many medi...
17878                                              unknown
17879    vend looking awesome new talent come join u wo...
Name: company_profile, Length: 17880, dtype: object

In [18]:
print(X_text_company_profile)

[[ 244  316 2214 ...    0    0    0]
 [ 370  388  113 ... 2411  559    6]
 [ 324    3  203 ...    0    0    0]
 ...
 [  33   61   22 ...    0    0    0]
 [  30    0    0 ...    0    0    0]
 [ 592  483  213 ...   44  556  205]]


In [19]:
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


nltk.download('wordnet')


# Fill missing values
df['description'] = df['description'].fillna("")

# Text Preprocessing: Cleaning + Lemmatization
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
stop_words.discard('not')  # Keep "not" for sentiment preservation



df['description'] = df['description'].apply(preprocess_text)

# Tokenization
tokenizer_description = Tokenizer(num_words=5000, oov_token="<OOV>")  # Keep top 5000 words
tokenizer_description.fit_on_texts(df['description'])
sequences = tokenizer_description.texts_to_sequences(df['description'])

lengths = [len(seq) for seq in sequences]  
max_length = int(np.percentile(lengths, 90))
X_text_description = pad_sequences(sequences, maxlen=max_length, padding='post')
print("Optimal max_length:", max_length)

print("Tokenized and padded text shape:", X_text_description.shape)


[nltk_data] Downloading package wordnet to C:\Users\Ritvik
[nltk_data]     Bhardwaj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Optimal max_length: 219
Tokenized and padded text shape: (17880, 219)


In [20]:
df['description']

0        food fast growing james beard award winning on...
1        organised focused vibrant awesome passion cust...
2        client located houston actively seeking experi...
3        company esri environmental system research ins...
4        job title itemization review managerlocation f...
                               ...                        
17875    case first time visited website vend award win...
17876    payroll accountant focus primarily payroll fun...
17877    experienced project cost control staff enginee...
17878    nemsia studio looking experienced visual graph...
17879    vend award winning web based point sale softwa...
Name: description, Length: 17880, dtype: object

In [21]:
print(X_text_description)

[[ 406  120   69 ...    0    0    0]
 [2733  438 1818 ...    0    0    0]
 [  10  417 1701 ...    0    0    0]
 ...
 [ 194   17  179 ...    0    0    0]
 [   1 1106   18 ...    0    0    0]
 [1159  591  669 ...    0    0    0]]


In [22]:
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


nltk.download('wordnet')


# Fill missing values
df['requirements'] = df['requirements'].fillna("")

# Text Preprocessing: Cleaning + Lemmatization
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
stop_words.discard('not')  # Keep "not" for sentiment preservation


df['requirements'] = df['requirements'].apply(preprocess_text)

# Tokenization
tokenizer_requirements = Tokenizer(num_words=5000, oov_token="<OOV>")  # Keep top 5000 words
tokenizer_requirements.fit_on_texts(df['requirements'])
sequences = tokenizer_requirements.texts_to_sequences(df['requirements'])

lengths = [len(seq) for seq in sequences]  
max_length = int(np.percentile(lengths, 90))
X_text_requirements = pad_sequences(sequences, maxlen=max_length, padding='post')
print("Optimal max_length:", max_length)

print("Tokenized and padded text shape:", X_text_requirements.shape)


[nltk_data] Downloading package wordnet to C:\Users\Ritvik
[nltk_data]     Bhardwaj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Optimal max_length: 122
Tokenized and padded text shape: (17880, 122)


In [23]:
df['requirements']

0        experience content management system major plu...
1        expect key responsibility communicate client s...
2        implement pre commissioning commissioning proc...
3        education bachelor master gi business administ...
4        qualification rn license state texasdiploma ba...
                               ...                        
17875    ace role eat comprehensive statement work brea...
17876    b b accounting desire fun love genuine passion...
17877    least year professional experience ability wor...
17878    must fluent latest version corel amp adobe cc ...
17879    want hear depth understanding oo programmingyo...
Name: requirements, Length: 17880, dtype: object

In [24]:
print(X_text_requirements)

[[   2  179   11 ...    0    0    0]
 [ 290  186   65 ...  491  196 2354]
 [ 281 3465 3465 ...  663   23 1895]
 ...
 [  69    5   72 ...    0    0    0]
 [  12  453  721 ...    0    0    0]
 [ 330 1390  620 ...    0    0    0]]


In [25]:
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


nltk.download('wordnet')


# Fill missing values
df['benefits'] = df['benefits'].fillna("")

# Text Preprocessing: Cleaning + Lemmatization
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
stop_words.discard('not')  # Keep "not" for sentiment preservation


df['benefits'] = df['benefits'].apply(preprocess_text)

# Tokenization
tokenizer_benefits = Tokenizer(num_words=5000, oov_token="<OOV>")  # Keep top 5000 words
tokenizer_benefits.fit_on_texts(df['benefits'])
sequences = tokenizer_benefits.texts_to_sequences(df['benefits'])

lengths = [len(seq) for seq in sequences]  
max_length = int(np.percentile(lengths, 90))
X_text_benefits = pad_sequences(sequences, maxlen=max_length, padding='post')
print("Optimal max_length:", max_length)

print("Tokenized and padded text shape:", X_text_benefits.shape)


[nltk_data] Downloading package wordnet to C:\Users\Ritvik
[nltk_data]     Bhardwaj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Optimal max_length: 62
Tokenized and padded text shape: (17880, 62)


In [26]:
df['benefits']

0                                                     none
1        get usthrough part second team gain experience...
2                                                     none
3        culture anything corporate collaborative creat...
4                                     full benefit offered
                               ...                        
17875    expect u open culture openly share result inpu...
17876    health amp wellnessmedical planprescription dr...
17877                                                 none
17878    competitive salary compensation based experien...
17879                                                 none
Name: benefits, Length: 17880, dtype: object

In [27]:
print(X_text_benefits)

[[   2    0    0 ...    0    0    0]
 [ 515 3060   30 ...  126 2024  106]
 [   2    0    0 ...    0    0    0]
 ...
 [   2    0    0 ...    0    0    0]
 [   6    9   54 ...    0    0    0]
 [   2    0    0 ...    0    0    0]]


In [28]:
categorical_cols = ['employment_type', 'required_experience', 'required_education', 'industry', 'function']
print(df[categorical_cols].shape)  # Should be (17880, 5)


(17880, 5)


In [29]:
from sklearn.preprocessing import OneHotEncoder

one_hot_enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Transform categorical features
X_categorical = one_hot_enc.fit_transform(df[categorical_cols])

# Check shape after encoding
print("After One-Hot Encoding:", X_categorical.shape)  # Should be (17880, N)


After One-Hot Encoding: (17880, 194)


In [30]:
print(X_categorical)

[[0. 0. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]


In [31]:
print("Before reshaping:", X_categorical.shape)


Before reshaping: (17880, 194)


In [32]:
X_text_company_profile = np.array(X_text_company_profile)
X_text_description = np.array(X_text_description)
X_text_requirements = np.array(X_text_requirements)
X_text_benefits = np.array(X_text_benefits)
X_categorical = np.array(X_categorical)

In [33]:
# Select boolean features
boolean_features = ['has_company_logo', 'telecommuting', 'has_questions']

# Convert to numpy array
X_boolean = df[boolean_features].values
print("Boolean Features Shape:", X_boolean.shape)


Boolean Features Shape: (17880, 3)


In [34]:
X_final = np.hstack([
    X_text_company_profile, 
    X_text_description, 
    X_text_requirements, 
    X_text_benefits, 
    X_categorical,
    X_boolean
])


In [35]:
print(X_final)

[[2.440e+02 3.160e+02 2.214e+03 ... 1.000e+00 0.000e+00 0.000e+00]
 [3.700e+02 3.880e+02 1.130e+02 ... 1.000e+00 0.000e+00 0.000e+00]
 [3.240e+02 3.000e+00 2.030e+02 ... 1.000e+00 0.000e+00 0.000e+00]
 ...
 [3.300e+01 6.100e+01 2.200e+01 ... 0.000e+00 0.000e+00 0.000e+00]
 [3.000e+01 0.000e+00 0.000e+00 ... 0.000e+00 0.000e+00 1.000e+00]
 [5.920e+02 4.830e+02 2.130e+02 ... 1.000e+00 0.000e+00 1.000e+00]]


In [36]:
y = df['fraudulent'].values
print(y)

[0 0 0 ... 0 0 0]


In [37]:
pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [38]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE
smote = SMOTE(sampling_strategy=0.5, random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_final, y)

# Check new class distribution
from collections import Counter
print("Class distribution after SMOTE:", Counter(y_balanced))




Class distribution after SMOTE: Counter({0: 17014, 1: 8507})


In [39]:
print(y_balanced)

[0 0 0 ... 1 1 1]


In [40]:
print(X_balanced)

[[2.44000000e+02 3.16000000e+02 2.21400000e+03 ... 1.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [3.70000000e+02 3.88000000e+02 1.13000000e+02 ... 1.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [3.24000000e+02 3.00000000e+00 2.03000000e+02 ... 1.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [3.91504176e+01 2.61440502e-01 2.09152402e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [8.68932106e+00 4.48259109e+01 2.20455299e+00 ... 7.34850998e-01
  0.00000000e+00 7.34850998e-01]
 [3.00000000e+01 0.00000000e+00 0.00000000e+00 ... 1.68933551e-02
  0.00000000e+00 0.00000000e+00]]


In [41]:
X_balanced.shape

(25521, 728)

In [42]:
y_balanced.shape

(25521,)

In [43]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_balanced,y_balanced,test_size = 0.2)


In [44]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.optimizers import Adam

# Define the MLP model
model = Sequential([
    Input(shape=(X_train.shape[1],)),  # Input layer matching feature size
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')  # Binary classification
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.0005), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))


Epoch 1/20
[1m638/638[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.7999 - loss: 0.4072 - val_accuracy: 0.9365 - val_loss: 0.1560
Epoch 2/20
[1m638/638[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9410 - loss: 0.1500 - val_accuracy: 0.9628 - val_loss: 0.0939
Epoch 3/20
[1m638/638[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.9642 - loss: 0.0954 - val_accuracy: 0.9661 - val_loss: 0.0873
Epoch 4/20
[1m638/638[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.9738 - loss: 0.0736 - val_accuracy: 0.9769 - val_loss: 0.0647
Epoch 5/20
[1m638/638[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9753 - loss: 0.0662 - val_accuracy: 0.9773 - val_loss: 0.0567
Epoch 6/20
[1m638/638[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.9782 - loss: 0.0575 - val_accuracy: 0.9771 - val_loss: 0.0603
Epoch 7/20
[1m638/638[0m 

In [45]:
print("Input shape for prediction:", X_test.shape)


Input shape for prediction: (5105, 728)


In [46]:
# Evaluate on Test Set
test_loss, test_accuracy = model.evaluate(X_test, y_test)

print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Generate Classification Report
from sklearn.metrics import classification_report

y_pred = (model.predict(X_test) > 0.5).astype("int32")
print(y_pred)
print("Classification Report:\n", classification_report(y_test, y_pred))


[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9905 - loss: 0.0328
Test Accuracy: 98.84%
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[[0]
 [1]
 [1]
 ...
 [0]
 [0]
 [1]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99      3412
           1       0.97      1.00      0.98      1693

    accuracy                           0.99      5105
   macro avg       0.98      0.99      0.99      5105
weighted avg       0.99      0.99      0.99      5105



In [47]:
model.save("fake_job_detection.h5")
print("Model saved successfully!")



Model saved successfully!


In [48]:
import joblib

# Save the tokenizer
joblib.dump(tokenizer_company_profile, "tokenizer_company_profile.pkl")
joblib.dump(tokenizer_description, "tokenizer_description.pkl")
joblib.dump(tokenizer_requirements, "tokenizer_requirements.pkl")
joblib.dump(tokenizer_benefits, "tokenizer_benefits.pkl")

print("Tokenizer saved successfully!")
joblib.dump(one_hot_enc, "one_hot_encoder.pkl")
print("OneHotEncoder saved successfully!")


Tokenizer saved successfully!
OneHotEncoder saved successfully!


In [49]:
print("Original class distribution:", Counter(y))
print("After SMOTE:", Counter(y_balanced))


Original class distribution: Counter({0: 17014, 1: 866})
After SMOTE: Counter({0: 17014, 1: 8507})


In [50]:
import numpy as np
import tensorflow as tf

model = tf.keras.models.load_model("fake_job_detection.h5")

# Create a random input of the expected shape
random_input = np.random.rand(1, 728)

# Run prediction
output = model.predict(random_input)
print(output)  # This should print a valid prediction between 0 and 1




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step
[[0.9978559]]
