# W261 Final Project ETL for Development Sample

### Notebook Set-Up

In [1]:
# imports
import time
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from pyspark.sql import Row
from pyspark.ml.feature import CountVectorizer
from pyspark.sql import DataFrame

In [2]:
%reload_ext autoreload
%autoreload 2

In [3]:
# store path to notebook
PWD = !pwd
PWD = PWD[0]
PWD

'/media/notebooks'

In [4]:
# start Spark Session
from pyspark.sql import SparkSession
app_name = "w261FinalProject"
master = "local[*]"
spark = SparkSession\
        .builder\
        .appName(app_name)\
        .master(master)\
        .getOrCreate()
sc = spark.sparkContext

__`REMINDER:`__ If you are running this notebook on the course docker container, you can monitor the progress of your jobs using the Spark UI at: http://localhost:4040/jobs/

## Load Data

In [5]:
original_trainRDD = sc.textFile('data/train.txt')
original_testRDD = sc.textFile('data/test.txt')

## Transformation

### Sample

In [6]:
#change the seed for a different sample
sampleRDD1, sampleRDD2 = original_trainRDD.randomSplit([0.99995,0.00005], seed = 1)
sampleRDD2.cache()

PythonRDD[4] at RDD at PythonRDD.scala:49

In [7]:
ncol = len(sampleRDD2.take(1)[0].split('\t'))
nrow = sampleRDD2.count()
print("This sample contains", str(nrow), "rows.")

This sample contains 2248 rows.


In [8]:
print("This sample contains", str(ncol), "columns.")

This sample contains 40 columns.


In [9]:
def avgFeatures(line):
    
    count = 0
    feats = line.split('\t')[1:]
    
    for feat in feats:
        if feat != '':
            count += 1

    return count

print("There is an average of", str(round(sampleRDD2.map(avgFeatures).mean(),2)), "populated features per observation.")

There is an average of 33.52 populated features per observation.


# Put in wide, sparse feature format

In [10]:
def parseCV(line):
    """
    Map record_csv_string --> (features, label)
    """

    # start of categorical features
    col_start = 14
    
    raw_values = line.split('\t')
    label = int(raw_values[0])
    
    # ignore numerics to start
    #numerical_values = list(pd.Series(raw_values[1:14]).apply(pd.to_numeric))
    numericals = []
    for idx, value in enumerate(raw_values[1:col_start]):
        if value != '':
            numericals.append('n' + str(idx) + '_' + str(value))
            
    
    categories = []
    for idx, value in enumerate(raw_values[col_start:]):
        if value != '':
            categories.append('c'+ str(idx) + '_' + str(value))

    return Row(label=label, raw=numericals + categories)


def vectorizeCV(DF):
    
    vectorizer = CountVectorizer()
    cv = CountVectorizer(inputCol="raw", outputCol="features", binary=True)
    
    model = cv.fit(DF)
    result = model.transform(DF)
    num_feats = cv.getVocabSize()
    
    return result


parsedDF = sampleRDD2.map(parseCV).toDF().cache()
vectorizedDF = vectorizeCV(parsedDF)

In [11]:
vectorizedDF.show(truncate=True)

+-----+--------------------+--------------------+
|label|                 raw|            features|
+-----+--------------------+--------------------+
|    0|[n1_4, n2_50, n3_...|(18545,[0,1,2,4,5...|
|    0|[n1_12, n2_20, n3...|(18545,[0,1,2,5,1...|
|    0|[n0_8, n1_17, n3_...|(18545,[0,1,4,12,...|
|    0|[n0_0, n1_144, n4...|(18545,[0,2,3,4,5...|
|    0|[n1_0, n2_5, n4_3...|(18545,[0,2,3,6,1...|
|    0|[n1_323, n2_2, n3...|(18545,[1,2,14,16...|
|    0|[n0_0, n1_424, n3...|(18545,[0,1,2,4,6...|
|    0|[n0_0, n1_13, n2_...|(18545,[0,1,2,5,6...|
|    0|[n1_180, n2_6, n3...|(18545,[1,2,8,14,...|
|    0|[n1_21, n2_3, n3_...|(18545,[1,2,6,10,...|
|    0|[n1_3, n2_7, n3_1...|(18545,[0,1,2,5,8...|
|    0|[n1_1, n2_2, n4_3...|(18545,[0,5,9,14,...|
|    0|[n0_0, n1_2, n4_3...|(18545,[0,2,5,6,1...|
|    1|[n1_-1, n4_26295,...|(18545,[0,2,5,9,2...|
|    0|[n0_0, n1_0, n2_1...|(18545,[0,1,5,7,9...|
|    1|[n0_1, n1_196, n3...|(18545,[0,1,2,3,5...|
|    0|[n0_0, n1_-1, n4_...|(18545,[0,1,3,4,5...|


In [12]:
vectorizedRDD = vectorizedDF.select(['label', 'features']).rdd.cache()
toyRDD = vectorizedRDD.take(1)  #for toy SGD loop
toyRDD

[Row(label=0, features=SparseVector(18545, {0: 1.0, 1: 1.0, 2: 1.0, 4: 1.0, 5: 1.0, 7: 1.0, 10: 1.0, 21: 1.0, 33: 1.0, 122: 1.0, 161: 1.0, 178: 1.0, 209: 1.0, 320: 1.0, 454: 1.0, 473: 1.0, 506: 1.0, 592: 1.0, 606: 1.0, 615: 1.0, 1431: 1.0, 1469: 1.0, 1497: 1.0, 1507: 1.0, 2919: 1.0, 6655: 1.0, 7005: 1.0, 7892: 1.0, 10798: 1.0, 14292: 1.0, 15119: 1.0, 15309: 1.0, 15738: 1.0, 16417: 1.0}))]

In [13]:
# feature data struct tester
num_feats = vectorizedRDD.take(1)[0][1].size
print("Number of total expanded features:", num_feats)

Number of total expanded features: 18545


In [14]:
print("Percent of data in the positive class:")
vectorizedRDD.map(lambda x: x[0]).mean()

Percent of data in the positive class:


0.254003558718861

## Develop Probability and Gradient Estimation Function

In [17]:
# initialize weights
np.random.seed(24)

b_br = sc.broadcast(0.0)
w_br = sc.broadcast(np.random.normal(0.0, 0.02, (1, num_feats)))
k_br = sc.broadcast(2)
V_br = sc.broadcast(np.random.normal(0.0, 0.02, (k_br.value, num_feats)))

# tester
#V_br.value[1][[1,2,1000]]
print(w_br.value.shape)
print(V_br.value.shape)

(1, 18545)
(2, 18545)


In [18]:
def predict_grad(pair):
    """
        Compute the predicted probability AND return the gradient (?)
        Args:
            pair - records are in (label, sparse feature set) format
        Broadcast:
            b - bias term (scalar)
            w - linear weight vector (array)
            k - number of factors (def=2)
            V - factor matrix of size (d dimensions, k=2 factors)
        Returns:
            predRDD - pair of ([label, predicted probability], [set of weight vectors in csr_matrix format])
    """
    
    label = pair[0]
    feats = pair[1]
    
    # start with linear weight dot product
    linear_sum = np.dot(w_br.value[0][feats.indices], feats.values)

    # factor matrix interaction sum
    factor_sum = 0.0
    lh_factor = [0.0]*k_br.value
    rh_factor = [0.0]*k_br.value
    
    for f in range(0, k_br.value):
        lh_factor[f] = np.dot(V_br.value[f][feats.indices], feats.values)  #KEY--this is used in v_grad matrix below
        rh_factor[f] = np.dot(V_br.value[f][feats.indices]**2, feats.values**2)
        factor_sum += (lh_factor[f]**2 - rh_factor[f])
    factor_sum = 0.5 * factor_sum
    
    pre_prob = b_br.value + linear_sum + factor_sum
    
    prob = 1.0 / (1 + np.exp(-pre_prob))  #logit transformation
    
    #compute Gradients
    b_grad = prob - label
    
    w_grad = csr_matrix((b_grad*feats.values, feats.indices, np.array([0, feats.indices.size])), (1, w_br.value.shape[0]))
    #print(w_grad[(0,5)])
    
    v_grad = csr_matrix((V_br.value.shape[0], V_br.value.shape[1]))
    for f in range(0, k_br.value):  # WORKING
        for i in feats.indices:
            i = int(i)
            v_grad[(f,i)] = b_grad * (feats[i]*lh_factor[f] - V_br.value[f][i]*(feats[i]**2))
    #print(type(v_grad))
    
    
    return ([label, prob], [b_grad, w_grad, v_grad])

In [2]:
test_csr = csr_matrix(np.array([0,1,0,2,0,6]).reshape(2,3))
test_csr[(1,1)] = 19
print(test_csr)

NameError: name 'csr_matrix' is not defined

In [20]:
# toy example
tester_grads = predict_grad(vectorizedRDD.take(1)[0])
sparse_mtx = tester_grads[1][2]
sparse_mtx[(0,5)]



0.08589947318787691

In [23]:
predRDD = vectorizedRDD.map(predict_grad).cache()
test_pred = predRDD.take(1)
print(test_pred)

[([0, 0.4800103589310691], [0.4800103589310691, <1x1 sparse matrix of type '<class 'numpy.float64'>'
	with 34 stored elements in Compressed Sparse Row format>, <2x18545 sparse matrix of type '<class 'numpy.float64'>'
	with 68 stored elements in Compressed Sparse Row format>])]


## Get log-loss with regularization on the sample set

In [24]:
def logLoss(pair):
    """parallelize log loss"""
    y = pair[0][0]
    
    eps = 1.0e-16
    if pair[0][1] == 0:
        y_hat = eps
    elif pair[0][1] == 1:
        y_hat = 1-eps
    else:
        y_hat = pair[0][1]
    
    return -(y * np.log(y_hat) + (1-y) * np.log(1-y_hat))


In [25]:
useReg = True
regParam = .001

sample_loss = predRDD.map(logLoss).mean() + int(useReg)*(regParam/2)*(np.linalg.norm(w_br.value)**2 + np.linalg.norm(V_br.value)**2)
sample_loss

0.7141455838302341

# Update weight vectors by average gradients with regularization (optional)
### One iteration only

In [1]:
learningRate = 0.1

############## gradient calculation ##############
# calculate average gradient for b
bGrad = predRDD.map(lambda x: x[1][0]).mean()

# calculate average gradient for w
N = predRDD.count()
wGrad = (1/N) * predRDD.map(lambda x: x[1][1]).reduce(lambda a,b: a+b) 
print("wGrad shape:", wGrad.shape)
print(type(wGrad))

# calculate average gradient for V
vGrad = (1/N) * predRDD.map(lambda x: x[1][2]).reduce(lambda a,b: a+b)
print("vGrad shape:", vGrad.shape)
print(type(vGrad))

############## update weights ##############
# first, unpersist broadcasts
b_br.unpersist()
w_br.unpersist()
V_br.unpersist()

print("w_br shape:", w_br.value.shape)
print(type(w_br.value))

# update
b_br = sc.broadcast(b_br.value - learningRate * bGrad)
w_br = sc.broadcast(w_br.value - learningRate * wGrad.toarray())
V_br = sc.broadcast(V_br.value - learningRate * vGrad.toarray())

NameError: name 'predRDD' is not defined

In [None]:
print("Bias:", b_br.value)
print("W shape:", w_br.value.shape)
print("V shape:", V_br.value.shape)

print("W e.g.:", w_br.value[0][10])
print("V e.g.:", V_br.value[1][10])

# Iterate over Model

## Toy example

In [None]:
print(csr_matrix([1,3]))

In [None]:
def predict_toy(pair, b, w, V, k=2):
    """
        Compute the predicted probability AND return the gradient (?)
        Args:
            pair - records are in (label, sparse feature set) format
        Broadcast:
            b - bias term (scalar)
            w - linear weight vector (array)
            k - number of factors (def=2)
            V - factor matrix of size (d dimensions, k=2 factors)
        Returns:
            predRDD - pair of ([label, predicted probability], [set of weight vectors in csr_matrix format])
    """
    
    label = pair[0]
    feats = pair[1]
    
    # start with linear weight dot product
    print("w:", w.shape)
    linear_sum = np.dot(w[0][feats.indices], feats.values)

    # factor matrix interaction sum
    factor_sum = 0.0
    lh_factor = [0.0]*k
    rh_factor = [0.0]*k
    
    for f in range(0, k):
        lh_factor[f] = np.dot(V[f][feats.indices], feats.values)  #KEY--this is used in v_grad matrix below
        rh_factor[f] = np.dot(V[f][feats.indices]**2, feats.values**2)
        factor_sum += (lh_factor[f]**2 - rh_factor[f])
    factor_sum = 0.5 * factor_sum
    
    pre_prob = b + linear_sum + factor_sum
    
    prob = 1.0 / (1 + np.exp(-pre_prob))  #logit transformation
    
    #compute Gradients
    b_grad = prob - label
    
    w_grad = csr_matrix((b_grad*feats.values, feats.indices, np.array([0, feats.indices.size])), (1, w.shape[0]))
    print("w_grad:", w_grad.shape)
    
    print("V:", V.shape)
    v_grad = csr_matrix((V.shape[0], V.shape[1]))
    for f in range(0, k):  # WORKING
        for i in feats.indices:
            i = int(i)
            v_grad[(f,i)] = b_grad * (feats[i]*lh_factor[f] - V[f][i]*(feats[i]**2))
    print(v_grad.shape)
    
    return ([label, prob], [b_grad, w_grad, v_grad])

In [None]:
b = 0.0
w = np.random.normal(0.0, 0.02, (1, num_feats))
k = 2
V = np.random.normal(0.0, 0.02, (k, num_feats))


for i in range(0,k):
    predictions = predict_toy(toyRDD[0], b, w, V, k)
    b_grad = predictions[1][0]
    w_grad = predictions[1][1]
    print("wgrad:", w_grad.shape)
    V_grad = predictions[1][2]
    print(V_grad.shape)

    b=b - b_grad
    w=w - w_grad.toarray()
    V=V - V_grad.toarray()
    print(i, b)
    print(i, w.shape)
    print(i, V.shape)

## Sample RDD

In [None]:
np.random.seed(24)

b_br = sc.broadcast(0.0)
w_br = sc.broadcast(np.random.normal(0.0, 0.02, (1, num_feats)))
k_br = sc.broadcast(2)
V_br = sc.broadcast(np.random.normal(0.0, 0.02, (k_br.value, num_feats)))


nIter = 2
learningRate = 0.1
useReg = False
regParam = .001
losses = []

for i in range(nIter):
    predRDD = vectorizedRDD.map(predict_grad).cache()
    loss = predRDD.map(logLoss).mean() #+ int(useReg)*(regParam/2)*(np.linalg.norm(w_br.value)**2 + np.linalg.norm(V_br.value)**2)
    losses.append(loss)
    print(i, "Current log-loss:", loss)
    
    # calculate average gradient for b
    bGrad = predRDD.map(lambda x: x[1][0]).mean()
    print("Bias:", bGrad)

    # calculate average gradient for w
    N = predRDD.count()
    wGrad = (1/N) * predRDD.map(lambda x: x[1][1]).reduce(lambda a,b: a+b)
    print("wGrad shape:", wGrad.shape)
    print(type(wGrad))

    # calculate average gradient for V
    vGrad = (1/N) * predRDD.map(lambda x: x[1][2]).reduce(lambda a,b: a+b)
    print("vGrad shape:", vGrad.shape)
    print(type(vGrad))

    ############## update weights ##############
    # first, unpersist broadcasts
    #predRDD.unpersist()
    b_br.unpersist()
    w_br.unpersist()
    V_br.unpersist()

    # update
    b_br = sc.broadcast(b_br.value - learningRate * bGrad)
    w_br = sc.broadcast(w_br.value - learningRate * wGrad.toarray())  # add regularization terms here conditional on useReg
    V_br = sc.broadcast(V_br.value - learningRate * vGrad.toarray())  # add regularization terms here conditional on useReg
    
    print(i, "Bias:", b_br.value)
    print(i, "W shape:", w_br.value.shape)
    print(i, "W shape:", type(w_br.value))
    print(i, "V shape:", V_br.value.shape)
    print(i, "V shape:", type(V_br.value))

In [None]:
losses