# Spot the Scam — Data Exploration

Quick EDA for raw job postings. Run from project root with the virtualenv activated.

In [None]:
import sys
from pathlib import Path
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def _find_project_root(start: Path) -> Path:
    """Return the nearest directory at or above *start* containing ``src/spot_scam``.

    Falls back to *start* itself when no such directory is found, which keeps
    the original "run from the project root" behaviour.
    """
    for candidate in (start, *start.parents):
        if (candidate / "src" / "spot_scam").is_dir():
            return candidate
    return start


# The kernel may be started from a sub-folder (e.g. the notebook's own
# directory), so search upwards instead of assuming cwd is the project root.
ROOT = _find_project_root(Path.cwd())
SRC = ROOT / "src"
# Make the in-repo package importable without installing it.
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

In [None]:
from spot_scam.config.loader import load_config
from spot_scam.data.ingest import load_raw_dataset

# Load the project configuration and hand it to the raw-dataset loader.
config = load_config()
raw_df = load_raw_dataset(config)

# Preview the first five rows.
raw_df.head(n=5)

## Missingness Overview

In [None]:
# Share of missing values per column, largest first.
missing = (
    raw_df.isnull()
    .mean()
    .sort_values(ascending=False)
)
# Show the ten columns with the highest missing share.
missing.iloc[:10]

In [None]:
# Read the label column from the config so this plot stays in sync with the
# later cells, which already use config['data']['target_column'].
target_col = config['data']['target_column']

plt.figure(figsize=(8, 4))
# discrete=True centres one bar on each label value.
sns.histplot(raw_df[target_col], discrete=True)
plt.title('Class distribution (raw)')

## Text Length Distribution

In [None]:
from spot_scam.data.preprocess import preprocess_dataframe

# Only the processed frame is used here; the second return value is discarded.
processed_df, _ = preprocess_dataframe(raw_df, config)
# Length in characters of each row's text_all value.
processed_df['text_len'] = processed_df['text_all'].str.len()

label_col = config['data']['target_column']

plt.figure(figsize=(10, 4))
# One histogram per label value, overlaid on the same axes.
ax = sns.histplot(
    data=processed_df,
    x='text_len',
    hue=label_col,
    bins=60,
    kde=False,
)
ax.set_title('Combined text length distribution')
# Clip the x-axis at 5000 characters.
ax.set_xlim(0, 5000)

## Top Terms (TF-IDF) — Preview

Reuse the project's TF-IDF vectorizer, fitted on fraudulent postings only, to list the 20 terms (n-grams) with the highest summed TF-IDF weight.

In [None]:
import numpy as np

from spot_scam.features.text import build_tfidf_vectorizer

target_col = config['data']['target_column']

# Only the vectorizer is used here; the second return value is discarded.
vectorizer, _ = build_tfidf_vectorizer(config)
# Select fraud rows with a boolean mask rather than an f-string query, so a
# target column name that is not a valid Python identifier still works.
fraud_text = processed_df.loc[processed_df[target_col] == 1, 'text_all']
tfidf = vectorizer.fit_transform(fraud_text)
terms = vectorizer.get_feature_names_out()
# Sum each term's weight over all fraud documents. np.asarray(...).ravel()
# works whether the sum comes back as np.matrix or ndarray, unlike `.A1`.
freq = np.asarray(tfidf.sum(axis=0)).ravel()
# Indices of the 20 terms with the largest summed weight, descending.
top_idx = freq.argsort()[::-1][:20]
pd.DataFrame({'term': terms[top_idx], 'tfidf_sum': freq[top_idx]})

## Duplicate Detection

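Compare the number of processed rows with the combined size of the train/val/test splits. If the counts differ, rows were dropped (e.g. as duplicates); check a sample of them to validate the checksum logic.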

In [None]:
from spot_scam.data.split import create_splits

# NOTE(review): persist=False presumably keeps the splits in memory only — confirm.
splits = create_splits(processed_df, config, persist=False)

# Rows going in versus rows spread across the three splits.
rows_in = len(processed_df)
rows_out = sum(len(part) for part in (splits.train, splits.val, splits.test))
rows_in, rows_out

## Next Steps
- Dive deeper into industry/function slices.
- Use `experiments/tables/token_frequency_analysis.csv` for final insights.
- Hand off to modeling notebook once satisfied with the data quality.