# Installing Libraries and Downloading CSV

In [69]:
%pip install --upgrade pandas
%pip install --upgrade nltk
%pip install --upgrade contractions

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [70]:
import pandas as pd
from collections import Counter
import itertools 
import re
import contractions

import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('brown')
nltk.download('treebank')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/leomoore/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/leomoore/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/leomoore/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package brown to /Users/leomoore/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package treebank to
[nltk_data]     /Users/leomoore/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/leomoore/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [71]:
df_a = pd.read_csv("fake_job_postings_A.csv")
df_b = pd.read_csv("fake_job_postings_B.csv")

df = pd.concat([df_a, df_b], ignore_index=True)

df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


# Cleaning Dataframe

In [None]:
# It appears the tokenizing was already done for us, although the format is not very good for analysis
def textwash(col):
    df[col] = df[col].str.replace(r"http\w+", " url_token", regex=True) # urls
    df[col] = df[col].str.replace(r"https?://\w+", " url_token", regex=True) # urls
    df[col] = df[col].str.replace(r"#URL_\w+", " url_token", regex=True) # urls

    df[col] = df[col].str.replace(r"#EMAIL_\w+", " email_token", regex=True) # email

    df[col] = df[col].str.replace(r"$\d+", " money_token", regex=True) # money

    df[col] = df[col].str.replace("\xa0", " ") # Non-breaking space
    df[col] = df[col].str.replace("&amp", " ") # Ampersand

    df[col] = df[col].str.replace(r"[^\w\s]", " ", regex=True) # remove nonword characters generally
    df[col] = df[col].str.replace(r"\s{2,}", " ", regex=True) # reduce repeat spaces
    df[col] = df[col].str.lower()

In [87]:
# Tokenize urls, emails, and money, remove nonword characters
textwash("company_profile")
textwash("description")
textwash("requirements")
textwash("benefits")

df["description"].head()

0    food52 a fast growing james beard award winnin...
1    organised focused vibrant awesome do you have ...
2    our client located in houston is actively seek...
3    the company esri environmental systems researc...
4    job title itemization review managerlocation f...
Name: description, dtype: object

# EDA, Non-text Data Analysis, Topic Analysis

In [123]:
# Summary Statistics
import numpy as np
from scipy import stats

fraud_lens = [len(desc) for desc in df[df["fraudulent"] == 1]["description"] if not isinstance(desc, float)]
legit_lens = [len(desc) for desc in df[df["fraudulent"] == 0]["description"] if not isinstance(desc, float)]

x = stats.ttest_ind(fraud_lens, legit_lens, alternative='less')

print(f"Mean length of fraudulent description: {np.mean(fraud_lens)}\n\
      Mean length of legitimate description: {np.mean(legit_lens)}\n\
      P-value of one-sided t-test: {x.pvalue}")

Mean length of fraudulent description: 1103.6369942196532
      Mean length of legitimate description: 1170.2603174603175
      P-value of one-sided t-test: 0.012026635379512315


In [None]:
# Set parameters for stop words and min word length, to be used later

global_stopwords = nltk.corpus.stopwords.words("english")
THRESHOLD = 3

In [9]:
# Get list of words in the fraudulent descriptions

fraudulent_description = df[df["fraudulent"] == 1]["description"]

fraud_words = [nltk.word_tokenize(s) for s in fraudulent_description if isinstance(s, str)]
fraud_words = list(itertools.chain.from_iterable(fraud_words))
fraud_words = [w for w in fraud_words if len(w)> THRESHOLD and w not in global_stopwords]

fraud_words[0:5]

['icampe', 'technician', 'bakersfield', 'posoprincipal', 'duties']

In [10]:
# Get most common words in fraudulent descriptions

fraud_counts = Counter(fraud_words)
fraud_counts.most_common(20)
fraud_freq = pd.DataFrame(fraud_counts.most_common(), columns=["Fraud_Term", "Frequency"])

fraud_freq.head(10)

Unnamed: 0,Fraud_Term,Frequency
0,work,950
1,team,475
2,position,452
3,management,446
4,project,441
5,experience,435
6,business,427
7,customer,423
8,company,415
9,looking,409


In [11]:
# Get list of words in the legitimate descriptions

legitimate_description = df[df["fraudulent"] == 0]["description"]

legit_words = [nltk.word_tokenize(s) for s in legitimate_description if isinstance(s, str)]
legit_words = list(itertools.chain.from_iterable(legit_words))
legit_words = [w for w in legit_words if len(w) > THRESHOLD and w not in global_stopwords]

legit_words[0:5]

['food52', 'fastgrowing', 'james', 'beard', 'awardwinning']

In [12]:
# Get most common words in legitimate descriptions

legit_counts = Counter(legit_words)
legit_counts.most_common(20)
legit_freq = pd.DataFrame(legit_counts.most_common(), columns=["Legit_Term", "Frequency"])
legit_freq.head(10)

Unnamed: 0,Legit_Term,Frequency
0,team,16575
1,work,12989
2,business,9886
3,experience,8814
4,sales,8781
5,customer,8752
6,looking,8275
7,company,7943
8,development,7592
9,product,6817


# Sentiment Analysis

In [None]:
%pip install textblob nltk
%pip install nltk #ensuring nltk installation

333.06s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


338.78s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
import nltk
nltk.download('vader_lexicon')  # VADER lexicon download/reinstall

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/elminaheder/nltk_data...


True

In [None]:
%pip install pandas

import pandas as pd

df_a = pd.read_csv("fake_job_postings_A.csv")
df_b = pd.read_csv("fake_job_postings_B.csv")
df_combined = pd.concat([df_a, df_b], ignore_index=True)  # New variable name to avoid overwriting 'df'

572.49s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Downloading pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl (11.3 MB)
[K     |████████████████████████████████| 11.3 MB 7.7 MB/s eta 0:00:01
[?25hCollecting numpy>=1.22.4
  Downloading numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 18.3 MB/s eta 0:00:01
Collecting tzdata>=2022.7
  Downloading tzdata-2025.2-py2.py3-none-any.whl (347 kB)
[K     |████████████████████████████████| 347 kB 11.6 MB/s eta 0:00:01
[?25hCollecting pytz>=2020.1
  Downloading pytz-2025.2-py2.py3-none-any.whl (509 kB)
[K     |████████████████████████████████| 509 kB 16.0 MB/s eta 0:00:01
Installing collected packages: tzdata, pytz, numpy, pandas
Successfully installed numpy-2.0.2 pandas-2.2.3 pytz-2025.2 tzdata-2025.2
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you ma

In [None]:
# Sentiment Analysis 
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

nltk.download('vader_lexicon')

# Polarity score creation         # Range -1, 1
def textblob_sentiment(text):
    return TextBlob(str(text)).sentiment.polarity 

df_combined['textblob_sentiment'] = df_combined['description'].apply(textblob_sentiment)

# Compound score
sid = SentimentIntensityAnalyzer()

def vader_sentiment(text):
    return sid.polarity_scores(str(text))['compound'] # -1, 1

df_combined['vader_sentiment'] = df_combined['description'].apply(vader_sentiment)

# Labeling (**Can be changed later if decided not to be used**)...
def label_sentiment(score):
    if score > 0.05:
        return 'Positive'
    elif score < -0.05:
        return 'Negative'
    else:
        return 'Neutral'

df_combined['vader_label'] = df_combined['vader_sentiment'].apply(label_sentiment)


df_combined[['description', 'textblob_sentiment', 'vader_sentiment', 'vader_label']].head()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/elminaheder/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,description,textblob_sentiment,vader_sentiment,vader_label
0,"Food52, a fast-growing, James Beard Award-winn...",0.093636,0.6486,Positive
1,Organised - Focused - Vibrant - Awesome!Do you...,0.251323,0.9951,Positive
2,"Our client, located in Houston, is actively se...",0.486667,0.9509,Positive
3,THE COMPANY: ESRI – Environmental Systems Rese...,0.232955,0.9957,Positive
4,JOB TITLE: Itemization Review ManagerLOCATION:...,0.118636,0.9426,Positive


# Readibility Analysis

In [None]:
# TBA