In [0]:
# Load the iris CSV from DBFS: the first row supplies the column names and
# Spark infers each column's type from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('dbfs:/FileStore/BDA_Datasets/iris.csv')
)

# Quick sanity check: first five rows, then the inferred schema.
df.show(n=5)
df.printSchema()


In [0]:
from pyspark.ml.feature import VectorAssembler

# The four iris measurement columns used as clustering inputs.
feature_cols = [
    "Sepal_Length",
    "Sepal_Width",
    "Petal_Length",
    "Petal_Width",
]

# Pack the measurement columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(df)

# Show the first assembled vectors without truncating them.
data.select("features").show(n=5, truncate=False)


In [0]:
from pyspark.ml.clustering import KMeans

# Number of clusters to fit.
k = 3

# Configure KMeans via its setters; the fixed seed keeps initialisation
# repeatable, and cluster ids are written to the "cluster" column.
kmeans = (
    KMeans()
    .setK(k)
    .setSeed(42)
    .setFeaturesCol("features")
    .setPredictionCol("cluster")
)

# Fit on the assembled iris vectors, then label every row with its cluster.
model = kmeans.fit(data)
predictions = model.transform(data)
predictions.show(n=5)


In [0]:
from pyspark.ml.evaluation import ClusteringEvaluator

# Score the clustering with the silhouette metric, reading the feature vectors
# from "features" and the assigned cluster ids from "cluster".
evaluator = ClusteringEvaluator(
    metricName="silhouette", featuresCol="features", predictionCol="cluster"
)
silhouette = evaluator.evaluate(predictions)

print("Silhouette Score =", silhouette)


In [0]:
# Print one line per fitted cluster centre, prefixed with its cluster index.
centers = model.clusterCenters()
for idx in range(len(centers)):
    print(f"Cluster {idx} center: {centers[idx]}")


In [0]:
import pandas as pd
import matplotlib.pyplot as plt

# Bring only the two plotted measurements and the cluster id to the driver.
plot_columns = ["Sepal_Length", "Sepal_Width", "cluster"]
pdf = predictions.select(*plot_columns).toPandas()

# Sepal length vs. width, coloured by assigned cluster.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(
    pdf["Sepal_Length"],
    pdf["Sepal_Width"],
    c=pdf["cluster"],
    cmap="viridis",
)
ax.set_xlabel("Sepal Length")
ax.set_ylabel("Sepal Width")
ax.set_title("KMeans Clusters (k=3)")
plt.show()


In [0]:
# Read the Darknet CSV from DBFS: header row supplies column names, types are
# inferred, the literal string "N/A" is read as NULL, and the empty string is
# the representation of an empty value.
dark = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("nullValue", "N/A")
    .option("emptyValue", "")
    .csv("dbfs:/FileStore/darknet_group_project/Darknet.CSV")
)

In [0]:
# Read the Darknet CSV with explicit missing-value handling:
#   nullValue="N/A"      -> the literal string "N/A" is read as NULL
#   emptyValue=""        -> the string that represents an empty value
#                           (this option is not a NULL marker)
#   nanValue="Infinity"  -> the literal string "Infinity" is read as NaN
#                           (a NaN, not a NULL)
dark = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        nullValue="N/A",
        emptyValue="",
        nanValue="Infinity",
    )
    .csv("dbfs:/FileStore/darknet_group_project/Darknet.CSV")
)


In [0]:
# Print the column names and inferred types of the Darknet DataFrame.
dark.printSchema()

In [0]:
# Render the raw Darknet DataFrame with the notebook's `display` helper.
display(dark)

In [0]:
# List the column names of the raw Darknet DataFrame (shown as the cell's
# output because it is the last expression).
dark.columns

In [0]:
# Columns removed before modelling. NOTE(review): these look like flow
# identifiers / connection metadata rather than features -- confirm.
cols_to_drop = [
    "Flow ID",
    "Src IP",
    "Dst IP",
    "Src Port",
    "Dst Port",
    "Timestamp",
    "Protocol",  # optional
]

# `drop` returns a new DataFrame; `dark` itself is left unchanged.
dark_clean = dark.drop(*cols_to_drop)

display(dark_clean)

In [0]:
from pyspark.sql.functions import col, isnan, when, count, lit


# Per-column data-quality counts for `dark_clean`:
#   nan_counts[name]  -> number of rows where the column is NaN
#   zero_counts[name] -> number of rows where the column equals 0 and is
#                        neither NULL nor NaN
#
# How it works: `when(cond, lit(1))` yields 1 where `cond` holds and NULL
# otherwise, and `count()` counts only the non-NULL results.
#
# All counts are computed in ONE aggregation (a single Spark job) instead of
# launching two separate jobs for every column.
nan_counts = {}
zero_counts = {}

print("--- NaN Counts per Column ---")

column_names = dark_clean.columns

# `agg()` needs at least one expression, so skip it for a column-less frame.
if column_names:
    nan_exprs = [
        count(when(isnan(col(name)), lit(1))).alias(f"nan_{pos}")
        for pos, name in enumerate(column_names)
    ]
    zero_exprs = [
        count(when(
            (col(name) == 0) & col(name).isNotNull() & ~isnan(col(name)),
            lit(1),
        )).alias(f"zero_{pos}")
        for pos, name in enumerate(column_names)
    ]

    # One row comes back: NaN counts first, then zero counts, both in
    # `column_names` order, so results are mapped back by position.
    counts_row = list(dark_clean.agg(*nan_exprs, *zero_exprs).first())
    n_columns = len(column_names)

    nan_counts = dict(zip(column_names, counts_row[:n_columns]))
    zero_counts = dict(zip(column_names, counts_row[n_columns:]))

print("---------------------------")

In [0]:
# Select columns in which more than half of the rows are genuine zeros
# (per `zero_counts`); they are queued for removal in `drop_cols_further`.
#
# The row count is computed once up front: `DataFrame.count()` runs a Spark
# job, so calling it inside the loop would launch one job per column.
total_rows = dark_clean.count()

drop_cols_further = []

# Guard against an empty frame, where the zero fraction is undefined.
if total_rows > 0:
    drop_cols_further = [
        name
        for name, zeros in zero_counts.items()
        if zeros / total_rows > .5
    ]



In [0]:
# Drop the zero-heavy columns, reporting the column count before and after.
n_cols_before = len(dark_clean.columns)
dark_clean = dark_clean.drop(*drop_cols_further)
n_cols_after = len(dark_clean.columns)
print(n_cols_before, n_cols_after)

In [0]:
# Print how many columns `dark_clean` has at this point.
print(len(dark_clean.columns))


In [0]:
from pyspark.ml.feature import Imputer
# Median-impute every numeric column of `dark_clean`; each input column `c`
# gets a companion output column named `c + "_imputed"`.
#
# The filter matches the type strings reported by `DataFrame.dtypes`. Spark
# reports LongType as 'bigint' (never 'long'), so 'bigint' must be listed or
# long columns are silently left out of imputation.
numeric_dtypes = {"int", "bigint", "double", "float"}
numeric_cols = [
    name for name, dtype in dark_clean.dtypes if dtype in numeric_dtypes
]

imputer = Imputer(
    inputCols=numeric_cols,
    outputCols=[name + "_imputed" for name in numeric_cols],
    strategy="median",  # replace missing values with the column median
)

imputed_df = imputer.fit(dark_clean).transform(dark_clean)

In [0]:
# Render the imputed DataFrame (original columns plus the "*_imputed" ones).
display(imputed_df)

In [0]:
from pyspark.ml.feature import VectorAssembler

# Assemble the imputed columns into one "features" vector.
#
# The inputs are taken from the "*_imputed" columns that actually exist on
# `imputed_df`, rather than re-deriving them with a second dtype filter that
# would have to stay in lockstep with the imputation step.
imputed_feature_cols = [
    name for name in imputed_df.columns if name.endswith("_imputed")
]

assembler = VectorAssembler(
    inputCols=imputed_feature_cols,
    outputCol="features",
    handleInvalid="skip",  # rows with an invalid (null/NaN) input are dropped
)

data = assembler.transform(imputed_df)


In [0]:
# Print the number of rows in the assembled DataFrame `data`.
print(data.count())

In [0]:
from pyspark.ml.feature import StandardScaler

# Standardise the assembled feature vectors into "scaled_features".
# withMean=True centres each feature (recommended for KMeans) and
# withStd=True applies standard scaling.
scaler = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol="features",
    outputCol="scaled_features",
)

# Fit the scaler on `data`, then apply it to the same rows.
scaler_model = scaler.fit(data)
scaled_data = scaler_model.transform(data)


In [0]:
# Render the DataFrame that carries the "scaled_features" column.
display(scaled_data)

In [0]:
# Row count of `scaled_data` (shown as the cell's output because it is the
# last expression).
scaled_data.count()

In [0]:
from pyspark.sql.functions import col, lower, regexp_replace, initcap

# Normalise the casing of the "Label84" values so that variants differing
# only in letter case collapse into one label: lower-case everything, then
# let `initcap` capitalise the first letter of each word.
#
# Hyphens are left untouched by both functions. The previous
# `regexp_replace` calls replaced "audio-streaming" and "file-transfer" with
# themselves, which had no effect, so they are omitted here.
data = data.withColumn("Label84", initcap(lower(col("Label84"))))



In [0]:
# Show how many rows carry each distinct "Label84" value.
data.groupby("Label84").count().show()

In [0]:
from pyspark.ml.clustering import KMeans

# Cluster the Darknet rows on the STANDARDISED features.
#
# Training on the raw "features" column would ignore the fitted
# StandardScaler and let large-magnitude columns dominate the distance.
# The already-fitted `scaler_model` is re-applied to the current `data`
# (rather than reusing an earlier scaled frame) so the rows keep the
# normalised "Label84" values.
kmeans_input = scaler_model.transform(data)

kmeans = KMeans(
    k=8,
    seed=42,  # fixed seed keeps initialisation repeatable
    featuresCol="scaled_features",
    predictionCol="cluster",
)

model = kmeans.fit(kmeans_input)
predictions = model.transform(kmeans_input)


In [0]:
# Render the rows together with their assigned "cluster" id.
display(predictions)

In [0]:
# Show the row count for every (cluster, Label84) combination, in long form.
predictions.groupby(["cluster", "Label84"]).count().show()

In [0]:
# Cluster-by-label contingency table, sorted by the "cluster_Label84" column
# and printed without truncating cell contents.
# NOTE(review): "cluster_Label84" is presumably the key column crosstab
# generates from the two input names -- confirm against the Spark docs.
predictions.crosstab("cluster", "Label84").orderBy("cluster_Label84").show(truncate=False)
