In [120]:
import kagglehub
import os
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, count,when
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [107]:
# Fetch the most recent published version of the dataset; `path` is the value
# returned by kagglehub and is reused by the loading cell below.
path = kagglehub.dataset_download("saurabhbadole/breast-cancer-wisconsin-state")
print("Path to dataset files:", path)

# List the entries found under `path` so the data file name can be confirmed.
dataset_files = os.listdir(path)
print("Files in dataset path:", dataset_files)

Path to dataset files: /home/lee/.cache/kagglehub/datasets/saurabhbadole/breast-cancer-wisconsin-state/versions/1
Files in dataset path: ['breast-cancer-wisconsin.data']


In [108]:
# Create the Spark session for this notebook (getOrCreate reuses an existing
# session if one is already running in the kernel).
spark = (
    SparkSession.builder
    .appName("Breast Cancer Data Processing")
    .config("spark.sql.shuffle.partitions", "200")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

# Display the session summary as the cell output.
spark

In [109]:
# Read the header-less CSV data found at `path`, let Spark infer the column
# types, and assign the eleven column names in one chained expression.
df = spark.read.csv(path, header=False, inferSchema=True).toDF(
    "id_number",
    "Clump_Thickness",
    "Uniformity_of_Cell_Size",
    "Uniformity_of_Cell_Shape",
    "Marginal_Adhesion",
    "Single_Epithelial_Cell_Size",
    "Bare_Nuclei",
    "Bland_Chromatin",
    "Normal_Nucleoli",
    "Mitoses",
    "Class",
)

# Sanity check: inferred schema followed by the first five rows.
df.printSchema()
df.show(5)

root
 |-- id_number: integer (nullable = true)
 |-- Clump_Thickness: integer (nullable = true)
 |-- Uniformity_of_Cell_Size: integer (nullable = true)
 |-- Uniformity_of_Cell_Shape: integer (nullable = true)
 |-- Marginal_Adhesion: integer (nullable = true)
 |-- Single_Epithelial_Cell_Size: integer (nullable = true)
 |-- Bare_Nuclei: string (nullable = true)
 |-- Bland_Chromatin: integer (nullable = true)
 |-- Normal_Nucleoli: integer (nullable = true)
 |-- Mitoses: integer (nullable = true)
 |-- Class: integer (nullable = true)

+---------+---------------+-----------------------+------------------------+-----------------+---------------------------+-----------+---------------+---------------+-------+-----+
|id_number|Clump_Thickness|Uniformity_of_Cell_Size|Uniformity_of_Cell_Shape|Marginal_Adhesion|Single_Epithelial_Cell_Size|Bare_Nuclei|Bland_Chromatin|Normal_Nucleoli|Mitoses|Class|
+---------+---------------+-----------------------+------------------------+-----------------+--------

In [110]:
# Bare_Nuclei was inferred as a string column, so list its distinct values to
# find out which entries are not numeric.
bare_nuclei_values = df.select("Bare_Nuclei").distinct()
bare_nuclei_values.show()

+-----------+
|Bare_Nuclei|
+-----------+
|          7|
|          3|
|          8|
|          5|
|          6|
|          9|
|          1|
|         10|
|          4|
|          ?|
|          2|
+-----------+



In [111]:
# Convert Bare_Nuclei to integer, treating the "?" placeholder as missing.
#
# The placeholder is mapped to NULL explicitly *before* the cast rather than
# relying on the cast to coerce it: a bare cast of "?" only yields NULL when
# ANSI mode is off and raises an error when spark.sql.ansi.enabled is true.
# The comparison is done on a string view of the column so that re-running
# this cell after the column is already integer is still valid.
bare_nuclei_text = col("Bare_Nuclei").cast("string")
df = df.withColumn(
    "Bare_Nuclei",
    when(bare_nuclei_text == "?", None).otherwise(bare_nuclei_text).cast("int"),
)
df.printSchema()  # Bare_Nuclei should now be reported as integer

root
 |-- id_number: integer (nullable = true)
 |-- Clump_Thickness: integer (nullable = true)
 |-- Uniformity_of_Cell_Size: integer (nullable = true)
 |-- Uniformity_of_Cell_Shape: integer (nullable = true)
 |-- Marginal_Adhesion: integer (nullable = true)
 |-- Single_Epithelial_Cell_Size: integer (nullable = true)
 |-- Bare_Nuclei: integer (nullable = true)
 |-- Bland_Chromatin: integer (nullable = true)
 |-- Normal_Nucleoli: integer (nullable = true)
 |-- Mitoses: integer (nullable = true)
 |-- Class: integer (nullable = true)



In [112]:
# Walk through every column and display its distinct values (show() prints at
# most 20 rows per column by default).
for name in df.columns:
    distinct_values = df.select(name).distinct()
    print(f"Unique values in column: {name}")
    distinct_values.show()

Unique values in column: id_number
+---------+
|id_number|
+---------+
|  1047630|
|  1205579|
|   822829|
|   428598|
|  1231853|
|  1145420|
|  1371920|
|   534555|
|  1156948|
|  1180194|
|  1075123|
|  1126417|
|   657753|
|  1315506|
|  1171710|
|   740492|
|   805448|
|   608157|
|  1170420|
|  1184241|
+---------+
only showing top 20 rows

Unique values in column: Clump_Thickness
+---------------+
|Clump_Thickness|
+---------------+
|              1|
|              6|
|              3|
|              5|
|              9|
|              4|
|              8|
|              7|
|             10|
|              2|
+---------------+

Unique values in column: Uniformity_of_Cell_Size
+-----------------------+
|Uniformity_of_Cell_Size|
+-----------------------+
|                      1|
|                      6|
|                      3|
|                      5|
|                      9|
|                      4|
|                      8|
|                      7|
|                     

In [113]:
# Binarize the label: raw code 4 -> 1, any other value -> 0.
# An existing 1 is also accepted as positive so the cell is idempotent; with
# the plain `== 4` test, running the cell a second time would find no 4s left
# and overwrite every label with 0.
# NOTE(review): assumes the raw Class column never uses 1 as a code - confirm.
df = df.withColumn("Class", when(col("Class").isin(4, 1), 1).otherwise(0))

# Map the "?" placeholder in Bare_Nuclei to NULL and store the column as
# integer. The comparison runs on a string view of the column, so it is valid
# whether Bare_Nuclei is still a string or has already been cast to integer
# (comparing an integer column with "?" is a no-op at best and an error when
# ANSI mode is enabled).
bare_nuclei_text = col("Bare_Nuclei").cast("string")
df = df.withColumn(
    "Bare_Nuclei",
    when(bare_nuclei_text == "?", None).otherwise(bare_nuclei_text).cast("int"),
)

# Verify both transformations on the first ten rows.
df.select("Bare_Nuclei", "Class").show(10)

+-----------+-----+
|Bare_Nuclei|Class|
+-----------+-----+
|          1|    0|
|         10|    0|
|          2|    0|
|          4|    0|
|          1|    0|
|         10|    1|
|         10|    0|
|          1|    0|
|          1|    0|
|          1|    0|
+-----------+-----+
only showing top 10 rows



In [114]:
# Re-check the distinct Bare_Nuclei values after cleaning: the "?" entry
# should no longer appear, with NULL in its place.
(
    df
    .select("Bare_Nuclei")
    .distinct()
    .show()
)

+-----------+
|Bare_Nuclei|
+-----------+
|          1|
|          6|
|          3|
|          5|
|          9|
|          4|
|          8|
|          7|
|         10|
|          2|
|       NULL|
+-----------+



In [115]:
# Column names for the dataset, listed in positional order.
column_names = [
    "id_number",
    "Clump_Thickness",
    "Uniformity_of_Cell_Size",
    "Uniformity_of_Cell_Shape",
    "Marginal_Adhesion",
    "Single_Epithelial_Cell_Size",
    "Bare_Nuclei",
    "Bland_Chromatin",
    "Normal_Nucleoli",
    "Mitoses",
    "Class",
]

# toDF assigns the names by position; these are the same names the loading
# cell already applied, so the schema is unchanged.
df = df.toDF(*column_names)

# Preview the first five rows.
df.show(5)

+---------+---------------+-----------------------+------------------------+-----------------+---------------------------+-----------+---------------+---------------+-------+-----+
|id_number|Clump_Thickness|Uniformity_of_Cell_Size|Uniformity_of_Cell_Shape|Marginal_Adhesion|Single_Epithelial_Cell_Size|Bare_Nuclei|Bland_Chromatin|Normal_Nucleoli|Mitoses|Class|
+---------+---------------+-----------------------+------------------------+-----------------+---------------------------+-----------+---------------+---------------+-------+-----+
|  1000025|              5|                      1|                       1|                1|                          2|          1|              3|              1|      1|    0|
|  1002945|              5|                      4|                       4|                5|                          7|         10|              3|              2|      1|    0|
|  1015425|              3|                      1|                       1|                1| 

In [116]:
# Count NULL values in each column
null_counts = df.select([sum(col(c).isNull().cast("int")).alias(c) for c in df.columns])
null_counts.show()

+---------+---------------+-----------------------+------------------------+-----------------+---------------------------+-----------+---------------+---------------+-------+-----+
|id_number|Clump_Thickness|Uniformity_of_Cell_Size|Uniformity_of_Cell_Shape|Marginal_Adhesion|Single_Epithelial_Cell_Size|Bare_Nuclei|Bland_Chromatin|Normal_Nucleoli|Mitoses|Class|
+---------+---------------+-----------------------+------------------------+-----------------+---------------------------+-----------+---------------+---------------+-------+-----+
|        0|              0|                      0|                       0|                0|                          0|         16|              0|              0|      0|    0|
+---------+---------------+-----------------------+------------------------+-----------------+---------------------------+-----------+---------------+---------------+-------+-----+



In [117]:
# Median of Bare_Nuclei: approxQuantile returns one value per requested
# probability, so ask for the 0.5 quantile with relativeError 0.0 and take
# the single result.
# NOTE(review): the median is computed on the full dataset, before the
# train/test split - consider deriving it from the training rows only.
quantiles = df.approxQuantile("Bare_Nuclei", [0.5], 0.0)
median_value = quantiles[0]

# Fill the missing Bare_Nuclei entries with that median (na.fill is the same
# operation as fillna).
df = df.na.fill({"Bare_Nuclei": median_value})

# Preview the first five rows of the imputed frame.
df.show(5)

+---------+---------------+-----------------------+------------------------+-----------------+---------------------------+-----------+---------------+---------------+-------+-----+
|id_number|Clump_Thickness|Uniformity_of_Cell_Size|Uniformity_of_Cell_Shape|Marginal_Adhesion|Single_Epithelial_Cell_Size|Bare_Nuclei|Bland_Chromatin|Normal_Nucleoli|Mitoses|Class|
+---------+---------------+-----------------------+------------------------+-----------------+---------------------------+-----------+---------------+---------------+-------+-----+
|  1000025|              5|                      1|                       1|                1|                          2|          1|              3|              1|      1|    0|
|  1002945|              5|                      4|                       4|                5|                          7|         10|              3|              2|      1|    0|
|  1015425|              3|                      1|                       1|                1| 

In [118]:
# Choose the model inputs by name instead of by position: everything except
# the row identifier and the label. A positional slice (columns[1:-1]) only
# works while id_number is first and Class is last; if the column order ever
# changes it could silently feed the label or the id into the feature vector.
non_feature_columns = {"id_number", "Class"}
feature_columns = [name for name in df.columns if name not in non_feature_columns]

# Pack the selected columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

df_transformed = assembler.transform(df)

# Preview the assembled vectors next to the label.
df_transformed.select("features", "Class").show(5)

+--------------------+-----+
|            features|Class|
+--------------------+-----+
|[5.0,1.0,1.0,1.0,...|    0|
|[5.0,4.0,4.0,5.0,...|    0|
|[3.0,1.0,1.0,1.0,...|    0|
|[6.0,8.0,8.0,1.0,...|    0|
|[4.0,1.0,1.0,3.0,...|    0|
+--------------------+-----+
only showing top 5 rows



In [119]:
# 80/20 split with a fixed seed.
splits = df_transformed.randomSplit([0.8, 0.2], seed=42)
train_df, test_df = splits

# Fit a logistic regression on the training portion (at most 10 iterations).
lr = LogisticRegression(maxIter=10, labelCol="Class", featuresCol="features")
lr_model = lr.fit(train_df)

# Score the held-out rows.
predictions = lr_model.transform(test_df)

# Evaluate with the evaluator's default metric and report it as the AUC score.
evaluator = BinaryClassificationEvaluator(labelCol="Class")
auc = evaluator.evaluate(predictions)
print(f"Model AUC Score: {auc:.4f}")

# Show a few predictions beside the true label.
predictions.select("features", "Class", "prediction").show(5)

25/03/19 19:04:16 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


Model AUC Score: 0.9997
+--------------------+-----+----------+
|            features|Class|prediction|
+--------------------+-----+----------+
|[10.0,4.0,7.0,2.0...|    1|       1.0|
|[8.0,10.0,10.0,8....|    1|       1.0|
|[5.0,8.0,8.0,10.0...|    1|       1.0|
|[5.0,3.0,2.0,8.0,...|    1|       1.0|
|[10.0,4.0,4.0,10....|    1|       1.0|
+--------------------+-----+----------+
only showing top 5 rows



In [121]:
# Confusion matrix as a long table: one row per (true label, predicted label)
# pair with the number of test rows in that cell, sorted for stable reading.
conf_matrix = (
    predictions
    .groupBy("Class", "prediction")
    .count()
    .orderBy("Class", "prediction")
)
conf_matrix.show()

+-----+----------+-----+
|Class|prediction|count|
+-----+----------+-----+
|    0|       0.0|   61|
|    0|       1.0|    1|
|    1|       0.0|    1|
|    1|       1.0|   49|
+-----+----------+-----+



In [123]:
# Number of scored rows in the test predictions.
total_predictions = predictions.count()

# Rows where the predicted label equals the true label (TP + TN).
is_correct = col("Class") == col("prediction")
correct_predictions = predictions.where(is_correct).count()

# Accuracy = correct / total.
accuracy = correct_predictions / total_predictions
print(f"Model Accuracy: {accuracy:.4f}")

Model Accuracy: 0.9821
