In [0]:
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
import pyspark.sql.types as T
import math

from pyspark.sql.types import DoubleType
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    RegexTokenizer,
    StopWordsRemover,
    NGram,
    CountVectorizer,
    IDF,
    VectorAssembler,
    StringIndexer,
    HashingTF,
    IndexToString,
    ChiSqSelector
)
from pyspark.ml.classification import (
    LogisticRegression,
    LinearSVC,
    RandomForestClassifier,
    OneVsRest
)
from pyspark.ml.evaluation import (
    BinaryClassificationEvaluator,
    MulticlassClassificationEvaluator
)
from pyspark.ml.linalg import (
    SparseVector,
    DenseVector,
    Vectors,
    Vector
)
from pyspark.ml.tuning import (
    CrossValidator,
    ParamGridBuilder
)

In [0]:
# Read the cleaned silver-layer table, report its size, and preview the first rows.
source_table = "census_bureau_capstone.silver.census_product_cleaned"
df = spark.read.table(source_table)

row_total = df.count()
print(row_total)
display(df.limit(100))

In [0]:
def normalize_text(colname="text"):
    """Return a Column expression holding a normalized copy of `colname`.

    The expression lowercases the column, replaces every run of whitespace
    with a single space, and trims leading/trailing whitespace.

    Args:
        colname: Name of the string column to normalize.

    Returns:
        A Spark Column expression (nothing is evaluated here).
    """
    lowered = F.lower(F.col(colname))
    single_spaced = F.regexp_replace(lowered, r"\s+", " ")
    return F.trim(single_spaced)

# Add the normalized text column, then keep only rows whose normalized text
# is longer than 100 characters.
df = (
    df.withColumn("text_norm", normalize_text("content"))
      .filter(F.length("text_norm") > 100)
)

In [0]:
# --- Census-related term patterns ---
# Maps a signal name to a regex. In a later cell each entry becomes a boolean
# column named after its key, set by applying the regex to `text_norm` with
# `rlike`. `text_norm` is lowercased, so every pattern is written in lowercase.
# NOTE(review): the short acronym alternatives (cbp, bds, lbd, cps, ahs, ...)
# match any standalone token with those letters, not only Census products.
census_terms = {
    # Bureau name and generic mentions of "census"
    "census_bureau":         r"\b(u\.?\s*s\.?\s*)?census\s*bureau\b",
    "us_census":             r"\bu\.?\s*s\.?\s*census\b",
    "census_generic":        r"\b(the\s+)?census(?!\s*of\s*agriculture)\b",  # negative lookahead skips "census of agriculture"
    "census_gov_domain":     r"\b(census\.gov|data\.census\.gov|api\.census\.gov|factfinder\.census\.gov)\b",

    # Decennial Census (Population & Housing) and its data products
    "decennial_census":      r"\b(decennial\s+census|census\s+20\d{2}|population\s+and\s+housing\s+census)\b",
    "sf_summary_files":      r"\b(summary\s+file\s*(?:1|2)|sf(?:1|2)\b)",
    "redistricting_pl94":    r"\b(p\.?l\.?\s*94[-–]171|redistricting\s+data)\b",
    "pums":                  r"\b(public\s+use\s+microdata\s+sample|pums)\b",
    "short_long_form":       r"\b(short\s+form|long\s+form)\b",

    # Economic programs & products (full name or acronym)
    "economic_census":       r"\beconomic\s+census\b",
    "census_of_govts":       r"\bcensus\s+of\s+governments\b",
    "cbp":                   r"\b(county\s+business\s+patterns|cbp)\b",
    "bds":                   r"\b(business\s+dynamics\s+statistics|bds)\b",
    "lbd":                   r"\b(longitudinal\s+business\s+database|lbd)\b",
    "qwi_led_onthemap":      r"\b(qwi|lehd|lodes|on\s*the\s*map|onthemap)\b",

    # Selected household surveys (full name or acronym)
    "cps":                   r"\b(current\s+population\s+survey|cps)\b",
    "sipp":                  r"\b(survey\s+of\s+income\s+and\s+program\s+participation|sipp)\b",
    "ahs":                   r"\b(american\s+housing\s+survey|ahs)\b",
    # NOTE: the American Community Survey (ACS) has no pattern in this dict;
    # the original note says it is kept separate — TODO confirm where it is handled.

    # Geography / reference files
    "tiger_line":            r"\b(tiger(?:/line)?\s*(?:shapefiles?)?|tiger/line|tigerline)\b",
    "gazetteer":             r"\bgazetteer\b",
    "fips":                  r"\bfips\s*(?:codes?)?\b",
    "tract_block_group":     r"\b(census\s*tracts?|block\s*groups?|census\s*blocks?)\b",
    "places_counties_mcd":   r"\b(incorporated\s*places?|counties|minor\s*civil\s*divisions?|mcds?)\b",
    "puma":                  r"\b(puma|public\s+use\s+microdata\s+area[s]?)\b",

    # Data portals / APIs / legacy brands
    # (data.census.gov and api.census.gov also appear in "census_gov_domain")
    "data_portals":          r"\b(data\.census\.gov|api\.census\.gov|factfinder|american\s+factfinder)\b",
    "microdata_api":         r"\b(microdata\s*api|pums\s*api)\b",

    # Explicit attribution phrases: "source: ...", "according to ...", "from ..."
    # followed by "(the) U.S. Census Bureau"
    "attribution_1":         r"(source\s*:\s*(?:the\s+)?u\.?\s*s\.?\s*census\s*bureau)",
    "attribution_2":         r"(according\s+to\s+(?:the\s+)?u\.?\s*s\.?\s*census\s*bureau)",
    "attribution_3":         r"(from\s+(?:the\s+)?u\.?\s*s\.?\s*census\s*bureau)",
}


In [0]:
# One boolean flag column per entry of census_terms: True when text_norm
# matches that entry's regex.
for term_name, term_regex in census_terms.items():
    df = df.withColumn(term_name, F.col("text_norm").rlike(term_regex))

# has_any_census: True when at least one per-pattern flag is set
# (flags are cast to 0/1 and the maximum is compared with 1).
signal_cols = list(census_terms.keys())
signal_ints = [F.col(name).cast("int") for name in signal_cols]
df = df.withColumn("has_any_census", F.array_max(F.array(*signal_ints)) == 1)

# num_density: number of digit characters divided by the length of text_norm.
# The denominator falls back to 1.0 for empty text to avoid dividing by zero.
text_len = F.length(F.col("text_norm"))
digits_len = F.length(F.regexp_replace(F.col("text_norm"), r"[^0-9]", ""))
safe_len = F.when(text_len > 0, text_len).otherwise(F.lit(1.0))
df = df.withColumn("num_density", digits_len / safe_len)

# Roll-up signals built from the flags above.
# has_explicit_attrib: any of the three attribution phrases matched.
df = df.withColumn(
    "has_explicit_attrib",
    F.col("attribution_1") | F.col("attribution_2") | F.col("attribution_3"),
)
# has_census_domain: any Census domain / portal / microdata-API pattern matched.
df = df.withColumn(
    "has_census_domain",
    F.col("census_gov_domain") | F.col("data_portals") | F.col("microdata_api"),
)
# repackage_heuristic: digit-heavy text with neither attribution nor Census domain.
df = df.withColumn(
    "repackage_heuristic",
    (F.col("num_density") > 0.12)
    & (~F.col("has_explicit_attrib"))
    & (~F.col("has_census_domain")),
)

In [0]:
# Define label signals.
# "cites": the text carries an explicit Census Bureau attribution phrase or
# references a Census domain / data portal.
cites_signal = F.col("has_explicit_attrib") | F.col("has_census_domain")

# "repackages": reuse the `repackage_heuristic` column computed with the other
# features (num_density > 0.12, no attribution, no Census domain) instead of
# re-implementing it here, so the threshold is defined in exactly one place.
repack_signal = F.col("repackage_heuristic")

# Assign labels. "cites" takes precedence because it is the first branch;
# rows matching neither signal get a null label and are dropped.
df_lab = (
    df.withColumn(
        "label",
        F.when(cites_signal, "cites")
         .when(repack_signal, "repackages")
    )
    .filter(F.col("label").isNotNull())
)

# Persist the labelled rows (overwriting the gold table), then show totals.
df_lab.write.mode("overwrite").saveAsTable("census_bureau_capstone.gold.census_repackaged_enriched")
print(df_lab.count())
print("Label counts:")
display(df_lab.groupBy("label").count())

##### Graphing results

In [0]:
# Per-label row counts, sorted by label, pulled to pandas for plotting.
label_counts_pdf = df_lab.groupBy("label").count().orderBy("label").toPandas()

# Bar chart of the label distribution.
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(label_counts_pdf["label"], label_counts_pdf["count"])
ax.set_title("Appendix A: Label distribution")
ax.set_xlabel("Label")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

In [0]:
# Preview up to 100 labelled rows for a manual spot check.
display(df_lab.limit(100))

In [0]:
# Class distribution: collect one Row (label, count) per label to the driver.
per_label = df_lab.groupBy("label").count()
label_counts = per_label.collect()
label_counts

In [0]:
# Turn the collected rows into a {label: row count} dictionary.
counts = {}
for row in label_counts:
    counts[row["label"]] = row["count"]

# The largest class is the reference point: it gets weight 1.0.
max_count = max(counts.values())

# Weight each class by (largest class size / its own size), so rarer
# classes receive proportionally larger weights.
class_weights = {}
for label, count in counts.items():
    class_weights[label] = float(max_count) / count

class_weights

In [0]:
# Attach a class_weight column by translating class_weights into a chained
# CASE WHEN expression on the label column.
from pyspark.sql import functions as F

label_col = F.col("label")
weight_expr = None
for lbl, w in class_weights.items():
    # The first branch starts the chain with F.when; later ones extend it.
    add_branch = F.when if weight_expr is None else weight_expr.when
    weight_expr = add_branch(label_col == lbl, w)
# Labels missing from class_weights fall back to a weight of 1.0.
weight_expr = weight_expr.otherwise(1.0)

df_w = df_lab.withColumn("class_weight", weight_expr)
display(df_w)

In [0]:
# 1) Label indexer: maps the string `label` column to a numeric `label_idx`.
#    With StringIndexer's default ordering the most frequent label gets index 0,
#    so the index of a given label depends on the data.
label_indexer = StringIndexer(inputCol="label", outputCol="label_idx")

# 2) Text stages: split text_norm on non-word characters, drop stop words,
#    then hash the remaining tokens into a 2**12-dimensional term-frequency vector.
tokenizer = RegexTokenizer(inputCol="text_norm", outputCol="tokens", pattern="\\W")
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered")
hashing = HashingTF(inputCol="filtered", outputCol="tf_raw", numFeatures=2**12)

# 3) Heuristic columns: cast to double so they could be fed to ML stages.
#    They are NOT part of the feature vector assembled below.
heuristic_cols = ["num_density", "has_any_census"]
for heuristic in heuristic_cols:
    df_w = df_w.withColumn(heuristic, F.col(heuristic).cast("double"))

# 4) Feature assembly: the model input is the hashed text vector only;
#    extend inputCols to add the heuristic columns if needed.
assembler = VectorAssembler(inputCols=["tf_raw"], outputCol="features")

### SVM Model - Might be too heavy to run on the free version of Databricks

In [0]:
# 5) Linear SVM on the assembled features; class_weight is used as the
#    per-row weight to counter class imbalance.
svm_params = dict(
    featuresCol="features",
    labelCol="label_idx",
    weightCol="class_weight",
    maxIter=50,
    regParam=0.1,
)
svm = LinearSVC(**svm_params)

# Full pipeline: label indexing -> text featurization -> assembly -> SVM.
svm_stages = [label_indexer, tokenizer, remover, hashing, assembler, svm]
pipeline = Pipeline(stages=svm_stages)

In [0]:
# 80/20 train/test split with a fixed seed, then fit the SVM pipeline on the
# training portion.
splits = df_w.randomSplit([0.8, 0.2], seed=42)
train_df, test_df = splits

svm_model = pipeline.fit(train_df)

In [0]:
# Score the held-out split with the fitted SVM pipeline.
pred = svm_model.transform(test_df)

# Area under the ROC curve, computed from the raw prediction scores.
auc_eval = BinaryClassificationEvaluator(
    labelCol="label_idx", rawPredictionCol="rawPrediction", metricName="areaUnderROC"
)
# F1 as reported by MulticlassClassificationEvaluator on the hard predictions.
f1_eval = MulticlassClassificationEvaluator(
    labelCol="label_idx", predictionCol="prediction", metricName="f1"
)

auc = auc_eval.evaluate(pred)
f1 = f1_eval.evaluate(pred)

print("AUC:", auc)
print("F1:", f1)

# Confusion matrix: original string label vs. predicted index.
confusion = pred.groupBy("label", "prediction").count().orderBy("label", "prediction")
display(confusion)

In [0]:
# Precision / recall for the "repackages" class (SVM predictions).
#
# The numeric index of "repackages" is not hard-coded: StringIndexer orders
# labels by descending frequency by default, so "repackages" is index 0
# whenever it is the majority class. Correctness is instead decided by
# comparing `prediction` with the row's own `label_idx`. With exactly two
# labels, a wrong prediction on one class is a prediction of the other class.
is_correct = F.col("prediction") == F.col("label_idx")

# "repackages" rows: correct -> true positive, wrong -> false negative.
pred_repack = pred.filter(F.col("label") == "repackages")
tp = pred_repack.filter(is_correct).count()
fn = pred_repack.filter(~is_correct).count()

# "cites" rows: correct -> true negative, wrong (predicted "repackages") -> false positive.
pred_cites = pred.filter(F.col("label") == "cites")
tn = pred_cites.filter(is_correct).count()
fp = pred_cites.filter(~is_correct).count()

# Precision and recall for "repackages"; None when the denominator is zero.
precision_repack = tp / float(tp + fp) if (tp + fp) > 0 else None
recall_repack = tp / float(tp + fn) if (tp + fn) > 0 else None

print("Repackages precision:", precision_repack)
print("Repackages recall   :", recall_repack)

### Logistic Regression 

SVM is a heavy model: the more data that is ingested, the less likely it is to run on the free version of Databricks. Logistic regression is trained here as a lighter alternative.

In [0]:
# Logistic regression on the same features, labels and per-row weights as the SVM.
lr_params = dict(
    featuresCol="features",
    labelCol="label_idx",
    weightCol="class_weight",
    maxIter=30,
    regParam=0.1,
)
lr = LogisticRegression(**lr_params)

# Same preprocessing stages as the SVM pipeline, with LR as the final estimator.
lr_stages = [label_indexer, tokenizer, remover, hashing, assembler, lr]
pipeline_lr = Pipeline(stages=lr_stages)

In [0]:
# Fit the logistic-regression pipeline on the same training split used for the SVM.
lr_model = pipeline_lr.fit(train_df)

In [0]:
# Score the held-out split with the fitted logistic-regression pipeline.
pred_lr = lr_model.transform(test_df)

# Area under the ROC curve, computed from the raw prediction scores.
auc_eval_lr = BinaryClassificationEvaluator(
    labelCol="label_idx", rawPredictionCol="rawPrediction", metricName="areaUnderROC"
)
# F1 as reported by MulticlassClassificationEvaluator on the hard predictions.
f1_eval_lr = MulticlassClassificationEvaluator(
    labelCol="label_idx", predictionCol="prediction", metricName="f1"
)

auc_lr = auc_eval_lr.evaluate(pred_lr)
f1_lr = f1_eval_lr.evaluate(pred_lr)

print("AUC:", auc_lr)
print("F1:", f1_lr)

# Confusion matrix: original string label vs. predicted index.
confusion_lr = pred_lr.groupBy("label", "prediction").count().orderBy("label", "prediction")
display(confusion_lr)

In [0]:
# Quick sanity check: precision / recall for the "repackages" class
# (logistic-regression predictions).
#
# The numeric index of "repackages" is not hard-coded: StringIndexer orders
# labels by descending frequency by default, so "repackages" is index 0
# whenever it is the majority class. Correctness is instead decided by
# comparing `prediction` with the row's own `label_idx`. With exactly two
# labels, a wrong prediction on one class is a prediction of the other class.
is_correct_lr = F.col("prediction") == F.col("label_idx")

# "repackages" rows: correct -> true positive, wrong -> false negative.
pred_lr_repack = pred_lr.filter(F.col("label") == "repackages")
tp_lr = pred_lr_repack.filter(is_correct_lr).count()
fn_lr = pred_lr_repack.filter(~is_correct_lr).count()

# "cites" rows: correct -> true negative, wrong (predicted "repackages") -> false positive.
pred_lr_cites = pred_lr.filter(F.col("label") == "cites")
tn_lr = pred_lr_cites.filter(is_correct_lr).count()
fp_lr = pred_lr_cites.filter(~is_correct_lr).count()

# Precision and recall for "repackages"; None when the denominator is zero.
precision_lr_repack = (
    tp_lr / float(tp_lr + fp_lr) if (tp_lr + fp_lr) > 0 else None
)
recall_lr_repack = (
    tp_lr / float(tp_lr + fn_lr) if (tp_lr + fn_lr) > 0 else None
)

print("Repackages precision:", precision_lr_repack)
print("Repackages recall   :", recall_lr_repack)

****

### Cosine Similarity (CS)

In [0]:
# Keep exactly one row per Target-URI: the one with the longest text_norm.

from pyspark.sql import Window

# Rank rows inside each URI partition, longest normalized text first.
longest_first = F.desc(F.length("text_norm"))
window = Window.partitionBy("Target-URI").orderBy(longest_first)

# Row number 1 is the longest text for its URI; the helper column is dropped afterwards.
ranked = df_w.withColumn("rn", F.row_number().over(window))
df_unique = ranked.filter(F.col("rn") == 1).drop("rn")

In [0]:
# Text-only pipeline: tokenize -> remove stop words -> hashed term frequencies.
text_stages = [tokenizer, remover, hashing]
text_vec_pipeline = Pipeline(stages=text_stages)

# Fit on the de-duplicated rows and vectorize them; the hashed vector column
# (tf_raw) is exposed as "features" for the cosine-similarity step.
text_vec_model = text_vec_pipeline.fit(df_unique)
df_vec = text_vec_model.transform(df_unique).withColumnRenamed("tf_raw", "features")

##### Setting up "anchor" URI with clear repackaged data

In [0]:
# 1) URI already identified as containing repackaged data; used as the reference document.
anchor_uri = "https://www.schooldigger.com/go/IN/schools/0345000465/school.aspx"

# Look up the anchor's row (at most one) and bring it to the driver.
anchor_matches = df_vec.filter(F.col("Target-URI") == anchor_uri)
anchor_row = anchor_matches.select("Target-URI", "label", "features").limit(1).collect()

# Stop early if the anchor is not present in the vectorized data.
if len(anchor_row) == 0:
    raise ValueError(f"Anchor URI {anchor_uri} not found in df_vec")

anchor_hit = anchor_row[0]
anchor_vec = anchor_hit["features"]
anchor_label = anchor_hit["label"]

print("Anchor label:", anchor_label)

##### Defining CS UDF

In [0]:
def cosine_sim_py(v: Vector, anchor: Vector) -> float:
    """Cosine similarity between two vectors.

    Args:
        v: Vector to compare; must expose ``.dot``.
        anchor: Reference vector; must expose ``.dot``.

    Returns:
        ``v·anchor / (|v| * |anchor|)`` as a float, or ``None`` when either
        argument is ``None`` or either vector has zero norm.
    """
    if v is None:
        return None
    if anchor is None:
        return None

    numerator = float(v.dot(anchor))
    v_norm = math.sqrt(float(v.dot(v)))
    anchor_norm = math.sqrt(float(anchor.dot(anchor)))

    # Similarity is undefined for a zero-length vector.
    if 0.0 in (v_norm, anchor_norm):
        return None
    return numerator / (v_norm * anchor_norm)

def _cosine_to_anchor(v):
    """Cosine similarity of `v` to the module-level `anchor_vec`, or None.

    `cosine_sim_py` returns None for null or zero-norm vectors. That None is
    passed through so Spark stores a SQL NULL; wrapping it in `float(...)`
    would raise TypeError and fail the whole job.
    """
    similarity = cosine_sim_py(v, anchor_vec)
    return None if similarity is None else float(similarity)


# UDF taking a vector column and returning its cosine similarity to the anchor
# as a nullable double.
cosine_sim_udf = F.udf(_cosine_to_anchor, DoubleType())

In [0]:
from urllib.parse import urlparse

# Host part of the anchor URI, used to exclude the anchor's own site.
domain = urlparse(anchor_uri).netloc

# Score every document against the anchor, then drop rows whose URI contains
# the anchor's domain.
scored = df_vec.withColumn("cosine_to_anchor", cosine_sim_udf("features"))
df_sim = scored.filter(~F.col("Target-URI").contains(domain))

# Top 20 most similar rows, highest similarity first.
report_cols = [
    "Target-URI",
    "label",
    "cosine_to_anchor",
    "num_density",
    "has_any_census",
]
top_matches = df_sim.select(*report_cols).orderBy(F.desc("cosine_to_anchor")).limit(20)
display(top_matches)