## Prepare the schema

In [0]:
from pyspark.sql import types as T

# Explicit schema for the dataset CSV. Every column is declared nullable.
# Spark applies a user-supplied CSV schema by position by default, so the
# order of the entries below is significant.
schema = T.StructType([
    T.StructField(column_name, column_type, True)
    for column_name, column_type in (
        # App identity and store metadata
        ("ID", T.LongType()),
        ("pkgname", T.StringType()),
        ("DevRegisteredDomain", T.LongType()),
        ("LenDescription", T.LongType()),
        ("LenWhatsNew", T.LongType()),
        ("ReviewsAverage", T.DoubleType()),
        ("CurrentVersion", T.StringType()),
        ("Genre", T.StringType()),
        ("ContentRating", T.StringType()),
        ("LastUpdated", T.LongType()),
        ("LenTitle", T.LongType()),
        ("AndroidVersion", T.StringType()),
        ("DeveloperCategory", T.StringType()),
        ("isSpamming", T.LongType()),
        # Lower-case feature columns
        ("net", T.LongType()),
        ("intent", T.LongType()),
        ("bluetooth", T.LongType()),
        ("app", T.LongType()),
        ("provider", T.LongType()),
        ("speech", T.LongType()),
        ("nfc", T.LongType()),
        ("media", T.LongType()),
        ("hardware", T.LongType()),
        ("google", T.LongType()),
        ("os", T.LongType()),
        # Upper-case feature columns
        ("CALENDAR", T.LongType()),
        ("CAMERA", T.LongType()),
        ("CONTACTS", T.LongType()),
        ("LOCATION", T.LongType()),
        ("MICROPHONE", T.LongType()),
        ("PHONE", T.LongType()),
        ("SENSORS", T.LongType()),
        ("SMS", T.LongType()),
        ("STORAGE", T.LongType()),
        ("status", T.LongType()),
        # Star-rating columns
        ("FourStarRatings", T.DoubleType()),
        ("ThreeStarRatings", T.DoubleType()),
        ("FiveStarRatings", T.DoubleType()),
        ("OneStarRatings", T.DoubleType()),
        ("TwoStarRatings", T.DoubleType()),
        # Version, pricing, size and developer-contact columns
        ("lowest_android_version", T.StringType()),
        ("highest_android_version", T.StringType()),
        ("paid", T.LongType()),
        ("file_size", T.LongType()),
        ("max_downloads_log", T.DoubleType()),
        ("developer_email", T.LongType()),
        ("privacy_policy_link", T.LongType()),
        ("developer_address", T.LongType()),
        ("developer_website", T.LongType()),
        ("days_since_last_update", T.LongType()),
        # Scan / risk columns
        ("malicious_count", T.LongType()),
        ("undetected_count", T.LongType()),
        ("certificate_life_days", T.LongType()),
        ("file_duration_days", T.LongType()),
        ("times_submitted", T.LongType()),
        ("threat_level", T.StringType()),
        ("aggregated_risk_score", T.LongType()),
        ("weighted_conf_sum", T.DoubleType()),
        ("permission_n", T.LongType()),
        ("avg_weight", T.DoubleType()),
        # Column used as the classification label further down
        ("verdict", T.StringType()),
    )
])


## Read the file

In [0]:
# Load the dataset CSV using the explicit schema; the file's first line is
# treated as a header row rather than data.
df = (
    spark.read
    .schema(schema)
    .option("header", True)
    .csv("dbfs:/FileStore/BDA_Datasets/val_dataset_t1.csv")
)

## Import the necessary libraries and define the preprocessing steps, including normalization

In [0]:
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator
# ---------------------------
# Label
# ---------------------------
label_col = "verdict"

# Encode the string verdict as a numeric "label" column.
# handleInvalid="skip" drops rows whose verdict is null or was not seen while
# fitting. The previous "keep" setting mapped such rows to an extra class
# index, which does not fit the 2-unit output layer configured on the
# classifier, and it is inconsistent with the assembler below, which already
# skips rows with invalid feature values.
label_indexer = StringIndexer(
    inputCol=label_col,
    outputCol="label",
    handleInvalid="skip"
)

# ---------------------------
# Numeric feature list
# ---------------------------
# Columns fed to the model. Identifier and free-text/categorical columns
# (e.g. ID, pkgname, Genre, threat_level) are not listed here.
numeric_features = [
    "DevRegisteredDomain", "LenDescription", "LenWhatsNew", "LenTitle",
    "LastUpdated", "days_since_last_update", "file_size", "max_downloads_log",
    "file_duration_days", "certificate_life_days", "times_submitted",
    "aggregated_risk_score", "weighted_conf_sum", "avg_weight",
    "malicious_count", "undetected_count", "FourStarRatings",
    "ThreeStarRatings", "FiveStarRatings", "OneStarRatings",
    "TwoStarRatings", "paid", "permission_n",

    # Permission features
    "net", "intent", "bluetooth", "app", "provider", "speech", "nfc",
    "media", "hardware", "google", "os", "CALENDAR", "CAMERA",
    "CONTACTS", "LOCATION", "MICROPHONE", "PHONE", "SENSORS",
    "SMS", "STORAGE"
]

# ---------------------------
# Vector Assembler
# ---------------------------
# Pack the numeric columns into one vector column. handleInvalid="skip"
# filters out rows that contain a null/NaN in any of the input columns.
assembler = VectorAssembler(
    inputCols=numeric_features,
    outputCol="features_raw", handleInvalid="skip"
)

# ---------------------------
# Standard Scaler
# ---------------------------
# Scale each feature to unit standard deviation.
scaler = StandardScaler(
    inputCol="features_raw",
    outputCol="features",
    withMean=False,  # scale only; mean-centring would densify sparse vectors (this is not an MLPC requirement)
    withStd=True
)

In [0]:
# ---------------------------
# MLP Classifier
# ---------------------------
# One input unit per assembled feature column.
input_dim = len(numeric_features)

# Two hidden layers of 16 units each and a 2-unit output layer. The output
# size is hard-coded: widen it if the label has more than two classes.
mlp = MultilayerPerceptronClassifier(
    layers=[input_dim, 16, 16, 2],
    maxIter=100,
    blockSize=128,
    seed=42,
    featuresCol="features",
    labelCol="label",
)

# ---------------------------
# Pipelines
# ---------------------------
# End-to-end pipeline: label indexing -> feature assembly -> scaling -> MLP.
pipeline = Pipeline(stages=[label_indexer, assembler, scaler, mlp])

# Preprocessing-only pipeline (every stage except the trailing classifier),
# kept for a later experiment. It shares its stage objects with `pipeline`.
preprocessor_pipeline = Pipeline(stages=pipeline.getStages()[:-1])

# ---------------------------
# Fit and predict
# ---------------------------
# The model is fitted on df and then applied to that same df.
model = pipeline.fit(df)
predictions = model.transform(df)

# Show the original verdict next to its numeric label and the prediction.
(
    predictions
    .select("verdict", "label", "prediction")
    .show(truncate=False)
)


In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy of `predictions` (fraction of rows where prediction == label).
# NOTE: `predictions` was produced by applying the model to the same
# DataFrame it was fitted on, so this figure is the training accuracy; it is
# not an estimate of performance on unseen data.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Training Accuracy = {accuracy:g} ")

# What about hyperparameter tuning through cross-validation (CV)?

In [0]:
from pyspark.ml.tuning import ParamGridBuilder

# Preprocess once, up front, so that cross-validation only has to train the
# MLP. DataFrames are evaluated lazily, so without cache() the CSV read and
# the preprocessing stages would be re-executed each time the data is
# materialised during cross-validation.
# NOTE: the preprocessing stages are fitted on the full DataFrame before it
# is split into folds, so the scaler statistics include the validation rows
# of every fold.
transformedData = preprocessor_pipeline.fit(df).transform(df).cache()

# Size of the MLP output layer (number of label classes).
output_classes = 2

# Candidate network shapes: [input, hidden..., output].
layer_options = [
    # Option 1: One wider hidden layer (32 nodes)
    [input_dim, 32, output_classes],
    # Option 2: Two narrower hidden layers (16 nodes each)
    [input_dim, 16, 16, output_classes],
    # Option 3: Two wider hidden layers (32 nodes each)
    [input_dim, 32, 32, output_classes],
]

# Parameter grid: every layer configuration crossed with every maxIter value
# (3 x 2 = 6 combinations).
paramGrid = (ParamGridBuilder()
    .addGrid(mlp.layers, layer_options)  # Test different layer structures
    .addGrid(mlp.maxIter, [50, 100])     # Test different iteration budgets
    .build()
)

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator # Or BinaryClassificationEvaluator

# Evaluator used to score candidates: the "f1" metric computed from the
# "label" and "prediction" columns.
# NOTE: this rebinds the name `evaluator` used by the accuracy cell above.
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    predictionCol="prediction",
    labelCol="label",
)

# Single-stage pipeline holding only the MLP; the input data is expected to
# be preprocessed already.
# NOTE: this rebinds the name `pipeline` used by the end-to-end pipeline above.
pipeline = Pipeline().setStages([mlp])

In [0]:
from pyspark.ml.tuning import CrossValidator

# 5-fold cross-validation of the MLP-only pipeline over every parameter map
# in paramGrid, scored with `evaluator`; the seed fixes the fold assignment.
cv = CrossValidator(
    estimator=pipeline,
    evaluator=evaluator,
    estimatorParamMaps=paramGrid,
    seed=42,
    numFolds=5,
)

# Run the search on the preprocessed data. One model is trained and scored
# per (parameter map, fold) pair, i.e. len(paramGrid) * numFolds fits.
cvModel = cv.fit(transformedData)

# Model for the best-scoring parameter map, as selected by CrossValidator.
best_model = cvModel.bestModel