# **Classification with Logistic Regression**

## **3.1.1 Structured API Implementation (High-Level)**

### **1. Data preparation**

In [7]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session that backs this notebook.
session_builder = SparkSession.builder.appName("CreditCardFraud")
spark = session_builder.getOrCreate()

# Read the CSV: the first row holds column names, and column types are
# inferred by Spark rather than declared by hand.
csv_reader = spark.read.option("header", True).option("inferSchema", True)
data = csv_reader.csv("creditcard.csv")

### **2. Pre-processing and splitting data into train and test data**

In [8]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col
from pyspark.sql.functions import rand

# One seed shared by every random step in this cell (down-sampling, shuffle,
# train/test split) so the cell's output can be reproduced.
SEED = 2505

# Separate the rows by label value so class 0 can be down-sampled.
class_1_df = data.filter(col("Class") == 1)
class_0_df = data.filter(col("Class") == 0)

# Row counts per class; each count triggers a Spark job, so compute them once.
count_1 = class_1_df.count()
count_0 = class_0_df.count()
if count_0 == 0:
    raise ValueError("No rows with Class == 0; cannot compute a sampling fraction.")

# Down-sample class 0 to a similar number of rows as class 1.
# `sample` draws each row independently, so the resulting count is approximate.
# The fraction is clamped to 1.0 because sampling without replacement cannot
# use a fraction above 1.
sampling_fraction = min(1.0, count_1 / count_0)
balanced_0_df = class_0_df.sample(withReplacement=False, fraction=sampling_fraction, seed=SEED)

# Combine both classes and shuffle the rows. The shuffle is seeded so the
# row order (and therefore the split below) is repeatable for a given input
# and partitioning.
balanced_df = balanced_0_df.union(class_1_df)
balanced_df = balanced_df.orderBy(rand(seed=SEED))

# Every column except the label becomes a model feature.
input_columns = [col_name for col_name in balanced_df.columns if col_name != "Class"]

# Pack the feature columns into a single vector column and keep only the
# columns the model needs.
# NOTE: this rebinds `data` (previously the raw CSV frame). Re-running this
# cell without first re-running the load cell would operate on the assembled
# frame instead of the raw data.
assembler = VectorAssembler(inputCols=input_columns, outputCol="Features")
data = assembler.transform(balanced_df).select("Features", "Class")

# 75% / 25% train/test split.
train_data, test_data = data.randomSplit([0.75, 0.25], seed=SEED)

### **3. Train the Logistic Regression model using MLlib:**

In [9]:
from pyspark.ml.classification import LogisticRegression

# Column wiring for the estimator: assembled feature vector in, label column out.
lr_columns = {"featuresCol": "Features", "labelCol": "Class"}
lr = LogisticRegression(**lr_columns)

# Fit on the training split only.
model = lr.fit(train_data)

# Score the held-out split; the result carries the model's prediction columns
# alongside the original ones.
predictions = model.transform(test_data)



### **4. Evaluate the obtained model**

In [10]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# A single multiclass evaluator is reused; only the metric name changes
# between calls.
multiclass_eval = MulticlassClassificationEvaluator(labelCol="Class")

accuracy = multiclass_eval.setMetricName("accuracy").evaluate(predictions)

# Area under the ROC curve comes from the binary evaluator.
binary_eval = BinaryClassificationEvaluator(labelCol="Class", metricName="areaUnderROC")
auc = binary_eval.evaluate(predictions)

# These are the class-frequency-weighted averages over both labels, not the
# per-class values for Class == 1.
# NOTE(review): weighted recall is equal to accuracy by definition, so the
# "Recall" line below repeats the "Accuracy" figure.
precision = multiclass_eval.setMetricName("weightedPrecision").evaluate(predictions)
recall = multiclass_eval.setMetricName("weightedRecall").evaluate(predictions)

# Report every metric to four decimal places, one per line.
report_lines = [
    f"Accuracy: {accuracy:.4f}",
    f"AUC: {auc:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
]
print("\n".join(report_lines))



Accuracy: 0.9437
AUC: 0.9816
Precision: 0.9438
Recall: 0.9437


                                                                                