# **Classification with Logistic Regression**
## **3.1.3 Low-Level Operations**

### **1, Parse dataset and create RDD**

In [None]:
from pyspark import SparkContext
import math
import numpy as np
import csv
from io import StringIO

# Reuse the running SparkContext if there is one, otherwise start a new one.
spark = SparkContext.getOrCreate()

# Read the CSV as lines of text and drop the header row.
raw_data = spark.textFile("creditcard.csv")
header = raw_data.first()
data = raw_data.filter(lambda row: row != header)

# Parse each line with csv.reader so quoted fields are split correctly.
parsed_data = data.map(lambda line: next(csv.reader(StringIO(line))))

# Each record becomes (label, features): the label is the last column and the
# features are every column between the first and the last one.
# The RDD is cached because every training iteration and the final evaluation
# run actions on it; without caching the file would be re-read and re-parsed
# for each of those actions.
rdd_data = parsed_data.map(lambda fields: (
    float(fields[-1]),  # class
    [float(x) for x in fields[1:-1]]  # features
)).cache()

# Uncomment to inspect one parsed record:
# label, features = rdd_data.first()
# print("\nSample Record:")
# print("Label:", label)
# print("First 5 Features:", features[:5])
# print("Total Number of Features:", len(features))

# Gradient-descent setup: one weight per feature, all starting at zero.
feature_length = len(rdd_data.take(1)[0][1])
weights = [0.0] * feature_length
learning_rate = 0.0001
iterations = 100


def dot_product(w, x):
    """Return the sum of the pairwise products of ``w`` and ``x``.

    Pairs are formed with ``zip``, so any extra trailing elements of the
    longer sequence are ignored.
    """
    pairwise_products = [weight * value for weight, value in zip(w, x)]
    return sum(pairwise_products)

# logistic (sigmoid) function: turns a raw score into a probability
def sigmoid(z):
    """Return ``1 / (1 + exp(-z))`` with ``z`` clipped to ``[-500, 500]``.

    The clipping keeps the argument passed to ``np.exp`` bounded.
    """
    bounded = np.clip(z, -500, 500)
    denominator = 1 + np.exp(-bounded)
    return 1 / denominator

def compute_gradient(label, features, weights):
    """Return the per-record gradient as a list, one entry per feature.

    The prediction is ``sigmoid(dot_product(weights, features))``; each
    gradient entry is ``(prediction - label)`` times the matching feature.
    """
    score = dot_product(weights, features)
    residual = sigmoid(score) - label
    gradient = []
    for value in features:
        gradient.append(residual * value)
    return gradient

# Batch gradient descent: every iteration computes the gradient of each
# record, averages them, and takes one step against the average.

# The number of records does not change between iterations, so count once
# here instead of launching an extra Spark job on every pass.
count = rdd_data.count()

for i in range(iterations):
    # per-record gradients computed with the current weights
    gradients = rdd_data.map(lambda x: compute_gradient(x[0], x[1], weights))

    # average gradients over all records (element-wise sum, then divide)
    total_gradient = gradients.reduce(lambda a, b: [x + y for x, y in zip(a, b)])
    avg_gradient = [g / count for g in total_gradient]

    # weight update: w = w - learning_rate * gradient
    weights = [w - learning_rate * g for w, g in zip(weights, avg_gradient)]

    # Uncomment to trace progress every 10 iterations:
    # if i % 10 == 0:
    #     print(f"Iteration {i}: First 5 Weights = {weights[:5]}")

# turn a feature vector into a hard 0/1 label using the given weights
def predict_label(features, weights):
    """Return 1 when the predicted probability is at least 0.5, else 0.

    The probability is ``sigmoid(dot_product(weights, features))``.
    """
    probability = sigmoid(dot_product(weights, features))
    if probability >= 0.5:
        return 1
    return 0

# compare low-level predictions to true labels: (true label, predicted label)
predictions = rdd_data.map(lambda x: (x[0], predict_label(x[1], weights)))

# evaluate prediction performance with accuracy
# A single aggregate pass yields both the number of correct predictions and
# the total number of records, so the predictions are computed only once
# instead of once per count() call.
correct, total = predictions.aggregate(
    (0, 0),
    lambda acc, pair: (acc[0] + (1 if pair[0] == pair[1] else 0), acc[1] + 1),
    lambda left, right: (left[0] + right[0], left[1] + right[1]),
)
accuracy = correct / total

print(f"\nFinal Accuracy: {accuracy:.4f}")

                                                                                

Iteration 0: First 5 Weights = [np.float64(-8.243472362715679e-07), np.float64(6.26002459972985e-07), np.float64(-1.2149891947561749e-06), np.float64(7.846290011748039e-07), np.float64(-5.443695709306958e-07)]


                                                                                

Iteration 10: First 5 Weights = [np.float64(-4.3636329131608646e-05), np.float64(-0.00010302810510938469), np.float64(-3.172548015546557e-05), np.float64(1.1821190342231535e-05), np.float64(-6.250391452532119e-05)]


                                                                                

Iteration 20: First 5 Weights = [np.float64(-8.345356183566156e-05), np.float64(-0.0002186152784890349), np.float64(-5.7503318323527295e-05), np.float64(1.9473942054053385e-05), np.float64(-0.00013047484349576813)]


                                                                                

Iteration 30: First 5 Weights = [np.float64(-0.00012040859063773709), np.float64(-0.0003357336700380298), np.float64(-8.020200652298208e-05), np.float64(2.5557968131560203e-05), np.float64(-0.000199337867972526)]


                                                                                

Iteration 40: First 5 Weights = [np.float64(-0.00015519229214003863), np.float64(-0.00045281820453789255), np.float64(-0.00010068705562496313), np.float64(3.065613262314166e-05), np.float64(-0.00026825664664858134)]


                                                                                

Iteration 50: First 5 Weights = [np.float64(-0.00018826427222649902), np.float64(-0.0005693873354399155), np.float64(-0.00011945224579988105), np.float64(3.504986169898816e-05), np.float64(-0.0003369597224695631)]


                                                                                

Iteration 60: First 5 Weights = [np.float64(-0.0002199418457007569), np.float64(-0.0006852651071207733), np.float64(-0.00013681807496669483), np.float64(3.8903164454774705e-05), np.float64(-0.00040534335990003486)]


                                                                                

Iteration 70: First 5 Weights = [np.float64(-0.00025045470961157073), np.float64(-0.0008003887905478007), np.float64(-0.00015300975693150425), np.float64(4.2322638599734656e-05), np.float64(-0.0004733682524082145)]


                                                                                

Iteration 80: First 5 Weights = [np.float64(-0.00027997562422709543), np.float64(-0.0009147439271618026), np.float64(-0.00016819420667226162), np.float64(4.538270617694844e-05), np.float64(-0.0005410232956085673)]


                                                                                

Iteration 90: First 5 Weights = [np.float64(-0.00030863852366535107), np.float64(-0.0010283381974567271), np.float64(-0.0001824999627461265), np.float64(4.8138023945245084e-05), np.float64(-0.0006083105968712515)]





Final Accuracy: 0.9967


                                                                                

### **Explanations:**
- Learning rate of 0.0001: deliberately small because the data is high-dimensional and widely scattered (the features are not rescaled before training); larger values may cause the updates to diverge.
- Iteration count of 100: enough iterations to see convergence trends and reach reasonable accuracy without overfitting or excessively long computation time.
### **Challenges:**
- Implementing gradient descent manually with RDD operations requires careful attention to how the current weights reach the executors (broadcasting) and to avoiding driver-only updates.
- Parsing and managing numerical data from text-based CSV format required custom parsing and float conversion.
- Ensuring the sigmoid function didn’t overflow required care with extreme dot product values.