In [1]:
import pandas as pd
import numpy as np

In [2]:
# start Spark Session
from pyspark.sql import SparkSession
app_name = "hw4_notebook"
master = "local[*]"
spark = SparkSession\
        .builder\
        .appName(app_name)\
        .master(master)\
        .getOrCreate()
sc = spark.sparkContext

In [3]:
#make up some fake data. If < 5 then 0 if > 5 then 1
X = np.random.randint(1,10,100)
X.shape

(100,)

In [4]:
X

array([8, 9, 8, 1, 1, 9, 6, 2, 2, 9, 4, 2, 2, 5, 2, 6, 4, 2, 9, 5, 7, 5,
       2, 6, 9, 7, 8, 7, 2, 8, 5, 5, 4, 6, 9, 8, 6, 9, 6, 1, 4, 4, 9, 6,
       7, 5, 4, 9, 3, 3, 1, 8, 4, 7, 9, 6, 4, 4, 7, 3, 7, 8, 5, 9, 2, 6,
       2, 6, 8, 5, 2, 1, 5, 7, 8, 6, 2, 3, 9, 1, 9, 4, 8, 6, 7, 8, 9, 5,
       4, 1, 8, 9, 2, 3, 5, 9, 4, 7, 6, 9])

In [6]:
y = np.where(X < 5,0,1)
y.shape

(100,)

In [8]:
y

array([1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1])

In [20]:
#conbine for spark
combined = np.vstack((X,y)).T

In [21]:
#feed into spark
dummyRDD = sc.parallelize(combined)

In [23]:
# Generate 80/20 (pseudo)random train/test split - RUN THIS CELL AS IS
trainRDD, heldOutRDD = dummyRDD.randomSplit([0.8,0.2], seed = 1)
print(f"... held out {heldOutRDD.count()} records for evaluation and assigned {trainRDD.count()} for training.")

... held out 31 records for evaluation and assigned 69 for training.


In [24]:
#define loss based on sigmoid activation
def logloss(dataRDD, W):
    augmentedData = dataRDD.map(lambda x: (np.append([1.0], x[0]), x[1]))
    loss = (augmentedData.
            map(lambda x: (np.clip(1 / (1 + np.exp(-1*W @ x[0])), 1e-15, 1.0 - 1e-15), x[1])).
            map(lambda x: -1*np.log(x[0]) if x[1]==1.0 else -1*np.log(1-x[0])).mean())
    return loss

In [53]:
def GDUpdate(dataRDD, W, learningRate = 0.05):
    """
    Perform one gradient descent step/update.
    Args:
        dataRDD - records are tuples of (features_array, y)
        W       - (array) model coefficients with bias at index 0
    Returns:
        new_model - (array) updated coefficients, bias at index 0
    """
    # add a bias 'feature' of 1 at index 0
    augmentedData = dataRDD.map(lambda x: (np.append([1.0], x[0]), x[1])).cache()
    
    grad = augmentedData.map(lambda x: sum(x[0]*((1/(1+np.exp(-1*np.dot(W, x[0]))))-x[1]))).mean()
    new_model = W - learningRate * grad 
   
    return new_model

In [35]:
def normalize(dataRDD):
    """
    Scale and center data round mean of each feature.
    Args:
        dataRDD - records are tuples of (features_array, y)
    Returns:
        normedRDD - records are tuples of (features_array, y)
    """
    featureMeans = dataRDD.map(lambda x: x[0]).mean()
    featureStdev = np.sqrt(dataRDD.map(lambda x: x[0]).variance())
    
    ################ YOUR CODE HERE #############
    normedRDD = dataRDD.map(lambda x: (((x[0] - featureMeans)/featureStdev),x[1]))
    ################ FILL IN YOUR CODE HERE #############
    
    return normedRDD

In [36]:
normedRDD = normalize(dummyRDD).cache()

In [51]:
BASELINE = np.array([.5,.5])
logloss(normedRDD,BASELINE)

0.4821362365121533

In [54]:
#iterate updating models
nSteps = 100
model = BASELINE
print(f"BASELINE:  Loss = {logloss(normedRDD,model)}")
for idx in range(nSteps):
    print("----------")
    print(f"STEP: {idx+1}")
    model = GDUpdate(normedRDD, model)
    loss = logloss(normedRDD, model)
    print(f"Loss: {loss}")
    print(f"Model: {[round(w,3) for w in model]}")

BASELINE:  Loss = 0.4821362365121533
----------
STEP: 1
Loss: 0.47734994807407366
Model: [0.516, 0.516]
----------
STEP: 2
Loss: 0.47274905375004195
Model: [0.531, 0.531]
----------
STEP: 3
Loss: 0.4683239447884896
Model: [0.546, 0.546]
----------
STEP: 4
Loss: 0.4640655990000977
Model: [0.56, 0.56]
----------
STEP: 5
Loss: 0.4599655425330241
Model: [0.575, 0.575]
----------
STEP: 6
Loss: 0.4560158139732677
Model: [0.589, 0.589]
----------
STEP: 7
Loss: 0.45220893068932705
Model: [0.603, 0.603]
----------
STEP: 8
Loss: 0.4485378573286763
Model: [0.616, 0.616]
----------
STEP: 9
Loss: 0.4449959763659943
Model: [0.63, 0.63]
----------
STEP: 10
Loss: 0.4415770605987077
Model: [0.643, 0.643]
----------
STEP: 11
Loss: 0.4382752474835591
Model: [0.656, 0.656]
----------
STEP: 12
Loss: 0.43508501520803056
Model: [0.668, 0.668]
----------
STEP: 13
Loss: 0.432001160392051
Model: [0.681, 0.681]
----------
STEP: 14
Loss: 0.4290187773181404
Model: [0.693, 0.693]
----------
STEP: 15
Loss: 0.4261332