# W261 Final Project ETL for Development Sample

### Notebook Set-Up

In [1]:
# imports
import time
import numpy as np
import pandas as pd
from pyspark.sql import Row
from pyspark.ml.feature import CountVectorizer
from pyspark.sql import DataFrame

In [2]:
%reload_ext autoreload
%autoreload 2

In [3]:
# store path to notebook
PWD = !pwd
PWD = PWD[0]
PWD

'/media/notebooks'

In [4]:
# start Spark Session
from pyspark.sql import SparkSession
app_name = "w261FinalProject"
master = "local[*]"
spark = SparkSession\
        .builder\
        .appName(app_name)\
        .master(master)\
        .getOrCreate()
sc = spark.sparkContext

__`REMINDER:`__ If you are running this notebook on the course docker container, you can monitor the progress of your jobs using the Spark UI at: http://localhost:4040/jobs/

## Load Data

In [5]:
original_trainRDD = sc.textFile('data/train.txt')
original_testRDD = sc.textFile('data/test.txt')

## Transformation

### Sample

In [6]:
#change the seed for a different sample
sampleRDD1, sampleRDD2 = original_trainRDD.randomSplit([0.99995,0.00005], seed = 1)
sampleRDD2.cache()

PythonRDD[4] at RDD at PythonRDD.scala:49

In [7]:
ncol = len(sampleRDD2.take(1)[0].split('\t'))
nrow = sampleRDD2.count()
print("This sample contains", str(nrow), "rows.")

This sample contains 2248 rows.


In [8]:
print("This sample contains", str(ncol), "columns.")

This sample contains 40 columns.


In [9]:
def avgFeatures(line):
    
    count = 0
    feats = line.split('\t')[1:]
    
    for feat in feats:
        if feat != '':
            count += 1

    return count

print("There is an average of", str(round(sampleRDD2.map(avgFeatures).mean(),2)), "populated features per observation.")
sampleRDD2.map(avgFeatures).mean()

There is an average of 33.52 populated features per observation.


33.52179715302491

# Put in wide, sparse feature format

In [10]:
def parseCV(line):
    """
    Map record_csv_string --> (features, label)
    """

    # start of categorical features
    col_start = 14
    
    raw_values = line.split('\t')
    label = int(raw_values[0])
    
    # ignore numerics to start
    #numerical_values = list(pd.Series(raw_values[1:14]).apply(pd.to_numeric))
    numericals = []
    for idx, value in enumerate(raw_values[1:col_start]):
        if value != '':
            numericals.append('n' + str(idx) + '_' + str(value))
            
    
    categories = []
    for idx, value in enumerate(raw_values[col_start:]):
        if value != '':
            categories.append('c'+ str(idx) + '_' + str(value))

    return Row(label=label, raw=numericals + categories)


def vectorizeCV(DF):
    
    vectorizer = CountVectorizer()
    cv = CountVectorizer(inputCol="raw", outputCol="features", binary=True)
    
    model = cv.fit(DF)
    result = model.transform(DF)
    num_feats = cv.getVocabSize()
    
    return result


parsedDF = sampleRDD2.map(parseCV).toDF().cache()
vectorizedDF = vectorizeCV(parsedDF)

In [11]:
vectorizedDF.show(truncate=True)

+-----+--------------------+--------------------+
|label|                 raw|            features|
+-----+--------------------+--------------------+
|    0|[n1_4, n2_50, n3_...|(18545,[0,1,2,4,5...|
|    0|[n1_12, n2_20, n3...|(18545,[0,1,2,5,1...|
|    0|[n0_8, n1_17, n3_...|(18545,[0,1,4,12,...|
|    0|[n0_0, n1_144, n4...|(18545,[0,2,3,4,5...|
|    0|[n1_0, n2_5, n4_3...|(18545,[0,2,3,6,1...|
|    0|[n1_323, n2_2, n3...|(18545,[1,2,14,16...|
|    0|[n0_0, n1_424, n3...|(18545,[0,1,2,4,6...|
|    0|[n0_0, n1_13, n2_...|(18545,[0,1,2,5,6...|
|    0|[n1_180, n2_6, n3...|(18545,[1,2,8,14,...|
|    0|[n1_21, n2_3, n3_...|(18545,[1,2,6,10,...|
|    0|[n1_3, n2_7, n3_1...|(18545,[0,1,2,5,8...|
|    0|[n1_1, n2_2, n4_3...|(18545,[0,5,9,14,...|
|    0|[n0_0, n1_2, n4_3...|(18545,[0,2,5,6,1...|
|    1|[n1_-1, n4_26295,...|(18545,[0,2,5,9,2...|
|    0|[n0_0, n1_0, n2_1...|(18545,[0,1,5,7,9...|
|    1|[n0_1, n1_196, n3...|(18545,[0,1,2,3,5...|
|    0|[n0_0, n1_-1, n4_...|(18545,[0,1,3,4,5...|


In [12]:
vectorizedRDD = vectorizedDF.select(['label', 'features']).rdd.cache()
vectorizedRDD.take(1)

[Row(label=0, features=SparseVector(18545, {0: 1.0, 1: 1.0, 2: 1.0, 4: 1.0, 5: 1.0, 7: 1.0, 10: 1.0, 21: 1.0, 33: 1.0, 122: 1.0, 163: 1.0, 179: 1.0, 209: 1.0, 318: 1.0, 432: 1.0, 474: 1.0, 521: 1.0, 565: 1.0, 607: 1.0, 634: 1.0, 1431: 1.0, 1443: 1.0, 1509: 1.0, 1541: 1.0, 3524: 1.0, 7034: 1.0, 7941: 1.0, 10434: 1.0, 11488: 1.0, 14157: 1.0, 15046: 1.0, 15368: 1.0, 15404: 1.0, 16184: 1.0}))]

In [13]:
# feature data struct tester
num_feats = vectorizedRDD.take(1)[0][1].size
num_feats

18545

In [14]:
vectorizedRDD.map(lambda x: x[0]).mean()

0.254003558718861

## test prediction making

In [21]:
#sc.addPyFile("./fm_function_tester.py")
#import fm_function_tester as fm
#from fm_function_tester import *

In [67]:
# initialize weights
np.random.seed(24)

b_br = sc.broadcast(0.0)
w_br = sc.broadcast(np.zeros(num_feats, dtype=np.float16))
k_br = sc.broadcast(2)
V_br = sc.broadcast(np.random.normal(0.0, 0.5, (k_br.value, num_feats)))
V_br.value.shape

(2, 30946)

In [24]:
label = vectorizedRDD.first()[0]
feats = vectorizedRDD.first()[1]

In [28]:
feats.indices

array([    0,     1,     2,     4,     5,     7,    10,    21,    33,
         122,   163,   179,   209,   318,   432,   474,   521,   565,
         607,   634,  1431,  1443,  1509,  1541,  3524,  7034,  7941,
       10434, 11488, 14157, 15046, 15368, 15404, 16184], dtype=int32)

In [42]:
w_br.value[feats.indices]

IndexError: index (16184) out of range

In [29]:
np.where(feats.indices)

(array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]),)

In [33]:
w_br.value

<1x18545 sparse matrix of type '<class 'numpy.float16'>'
	with 0 stored elements in Compressed Sparse Row format>

In [56]:
w_br.value.tocsr()[:,feats.indices].dot(feats.values)[0]

0.0

In [48]:
np.dot(w_br.value.tocsc()[:,feats.indices], feats.values)

array([<1x34 sparse matrix of type '<class 'numpy.float32'>'
	with 0 stored elements in Compressed Sparse Column format>,
       <1x34 sparse matrix of type '<class 'numpy.float32'>'
	with 0 stored elements in Compressed Sparse Column format>,
       <1x34 sparse matrix of type '<class 'numpy.float32'>'
	with 0 stored elements in Compressed Sparse Column format>,
       <1x34 sparse matrix of type '<class 'numpy.float32'>'
	with 0 stored elements in Compressed Sparse Column format>,
       <1x34 sparse matrix of type '<class 'numpy.float32'>'
	with 0 stored elements in Compressed Sparse Column format>,
       <1x34 sparse matrix of type '<class 'numpy.float32'>'
	with 0 stored elements in Compressed Sparse Column format>,
       <1x34 sparse matrix of type '<class 'numpy.float32'>'
	with 0 stored elements in Compressed Sparse Column format>,
       <1x34 sparse matrix of type '<class 'numpy.float32'>'
	with 0 stored elements in Compressed Sparse Column format>,
       <1x34 sparse matr

In [38]:
w_br.value.tocsr()[:,feats.indices]

<1x34 sparse matrix of type '<class 'numpy.float32'>'
	with 0 stored elements in Compressed Sparse Row format>

In [35]:
w_br.value.tocsr()[feats.indices,:]

IndexError: index (16184) out of range

In [41]:
V_br.value[:,feats.indices]

<2x34 sparse matrix of type '<class 'numpy.float64'>'
	with 68 stored elements in Compressed Sparse Row format>

In [67]:
V_br.value[0,feats.indices].power(2)

<1x34 sparse matrix of type '<class 'numpy.float64'>'
	with 34 stored elements in Compressed Sparse Row format>

In [69]:
V_br.value[0,feats.indices].power(2).dot(np.square(feats.values))

array([10.55355253])

In [68]:
np.square(feats.values)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [62]:
np.square(V_br.value[0,feats.indices]).dot(np.square(feats.values))

ValueError: dimension mismatch

In [74]:
import numpy as np
from pyspark.ml.feature import CountVectorizer
from scipy.sparse import csr_matrix

def predict_grad(pair):
    """
        Compute the predicted probability AND return the gradient (?)
        Args:
            pair - records are in (label, sparse feature set) format
        Broadcast:
            b - bias term (scalar)
            w - linear weight vector (array)
            k - number of factors (def=2)
            V - factor matrix of size (d dimensions, k=2 factors)
        Returns:
            predRDD - pair of ([label, predicted probability], feature set)
    """
    
    label = pair[0]
    feats = pair[1]
    print("blah")
    # start with linear weight dot product
    #linear_sum = np.dot(w_br.value[feats.indices], feats.values)
    #linear_sum = np.dot(w_br.value.tocsr()[:,feats.indices], feats.values)
    linear_sum = w_br.value.tocsr()[:,feats.indices].dot(feats.values)[0]
    print("a")
    #linear_sum = 0.0
    #for i in feats.indices:
    #    i = int(i)
    #    linear_sum += w_br.value[i]*feats[i]

    # factor matrix interaction sum
    factor_sum = 0.0
    lh_factor = [0.0]*k_br.value
    rh_factor = [0.0]*k_br.value
    
    for f in range(0, k_br.value):
        
        #lh_factor[f] = np.dot(V_br.value[f][feats.indices], feats.values)  #KEY--this is used in v_grad matrix below
        #rh_factor[f] = np.dot(V_br.value[f][feats.indices]**2, feats.values**2)
        #lh_factor[f] = np.dot(V_br.value[f,feats.indices], feats.values)  #KEY--this is used in v_grad matrix below
        #rh_factor[f] = np.dot(V_br.value[f,feats.indices]**2, feats.values**2)
        lh_factor[f] = V_br.value[f,feats.indices].dot(feats.values)
        rh_factor[f] = V_br.value[f,feats.indices].power(2).dot(np.square(feats.values))
        
        #for i in feats.indices: #old with extra for loop
        #    i = int(i)
        #    lh_factor[f] += V_br.value[i][f]*feats[i]  #KEY--this is used in v_grad matrix below
        #    rh_factor += (V_br.value[i][f]**2) * (feats[i]**2)
        
        factor_sum += (lh_factor[f]**2 - rh_factor[f])
    factor_sum = 0.5 * factor_sum
    
    preProb = b_br.value + linear_sum + factor_sum
    
    prob = 1.0 / (1 + np.exp(-preProb))  #logit transformation
    
    #compute Gradients
    b_grad = label - prob
    
    w_grad = csr_matrix((b_grad*feats.values, feats.indices, np.array([0, feats.indices.size])), (1, w_br.value.shape[1]))
    #print(w_grad[(0,5)])
    
    v_grad = csr_matrix((V_br.value.shape[0], V_br.value.shape[1]))
    for f in range(0, k_br.value):  # WORKING
        for i in feats.indices:
            i = int(i)
            v_grad[(f,i)] = b_grad * (feats[i]*lh_factor[f] - V_br.value[f,i]*(feats[i]**2))
    #print(type(v_grad))
    
    
    return ([label, prob[0]], [b_grad[0], w_grad, v_grad])

In [53]:
# toy example
tester_grads = predict_grad(vectorizedRDD.take(1)[0])
sparse_mtx = tester_grads[1][2]
sparse_mtx[(0,5)]



0.004292632661100083

In [75]:
predRDD = vectorizedRDD.map(predict_grad).cache()
test_pred = predRDD.take(1)
test_pred

[([0, 0.9958085442146005],
  [-0.9958085442146005,
   <1x18545 sparse matrix of type '<class 'numpy.float64'>'
   	with 34 stored elements in Compressed Sparse Row format>,
   <2x18545 sparse matrix of type '<class 'numpy.float64'>'
   	with 68 stored elements in Compressed Sparse Row format>])]

In [73]:
b_br.value

0.0011070609323269635

### Get log-loss on the sample set

In [18]:
from sklearn.metrics import log_loss

In [56]:
from sklearn.metrics import log_loss

sample_preds = np.array(predRDD.map(lambda x: x[0]).take(20))
sample_loss = log_loss(sample_preds[:,0], sample_preds[:,1], normalize=True)
sample_loss

2.6458644290629403

## Reduce sparse vectors, get mean of gradients, and update parameter vectors

In [None]:
############## initialize weights ##############
# initialize 
b = 0.0
w = csr_matrix(np.zeros(num_feats, dtype=np.float16))
k = 2
V = csr_matrix(np.random.normal(0.0, 0.5, (k_br.value, num_feats)))

# broadcast
b_br = sc.broadcast(b)
w_br = sc.broadcast(w)
k_br = sc.broadcast(k)
V_br = sc.broadcast(V)

############## gradient calculation ##############
# calculate average gradient for b
bGrad = predRDD.map(lambda x: x[1][0]).mean()

# calculate average gradient for w
N = predRDD.count()
wGrad = (1/N) * predRDD.map(lambda x: x[1][1]).reduce(lambda a,b: a+b)

# calculate average gradient for V
vGrad = (1/N) * predRDD.map(lambda x: x[1][2]).reduce(lambda a,b: a+b)

############## update weights ##############
# update
b -= 0.01 * bGrad
w -= 0.01 * wGrad
V -= 0.01 * vGrad

# re-broadcast
b_br = sc.broadcast(b)
w_br = sc.broadcast(w)
V_br = sc.broadcast(V)


In [17]:
############## initialize weights ##############
# initialize 
b = 0.0
w = csr_matrix(np.zeros(num_feats, dtype=np.float16))
k = 2
#V = csr_matrix(np.random.normal(0.0, 0.5, (k_br.value, num_feats)))
V = csr_matrix(np.random.normal(0.0, 0.5, (k, num_feats)))

# broadcast
b_br = sc.broadcast(b)
w_br = sc.broadcast(w)
k_br = sc.broadcast(k)
V_br = sc.broadcast(V)

In [69]:
############## gradient calculation ##############
# calculate average gradient for b
bGrad = predRDD.map(lambda x: x[1][0]).mean()

# calculate average gradient for w
N = predRDD.count()
wGrad = (1/N) * predRDD.map(lambda x: x[1][1]).reduce(lambda a,b: a+b)

# calculate average gradient for V
vGrad = (1/N) * predRDD.map(lambda x: x[1][2]).reduce(lambda a,b: a+b)

############## update weights ##############
# update
b -= 0.01 * bGrad
w -= 0.01 * wGrad
V -= 0.01 * vGrad

# re-broadcast
b_br = sc.broadcast(b)
w_br = sc.broadcast(w)
V_br = sc.broadcast(V)

In [70]:
b

0.0011070609323269635

In [71]:
b_br.value

0.0011070609323269635

In [78]:
w

matrix([[8.30160213e-04, 7.17106039e-04, 6.34518692e-04, ...,
         2.11623696e-08, 2.12899741e-06, 3.20285805e-10]])

## Iterate predictions and parameter updates

In [30]:
# initialize weights
np.random.seed(24)

b = 0.0
w = np.zeros(num_feats, dtype=np.float16)
k = 2
#V = np.random.normal(0.0, 0.5, (k_br.value, num_feats))
V = np.random.normal(0.0, 0.5, (k, num_feats))

b_br = sc.broadcast(b)
w_br = sc.broadcast(w)
k_br = sc.broadcast(k)
V_br = sc.broadcast(V)
V_br.value.shape

(2, 30946)

In [30]:
## initialize
#b = 0.0
#w = csr_matrix(np.zeros(num_feats, dtype=np.float16))
#k = 2
#V = csr_matrix(np.random.normal(0.0, 0.5, (k_br.value, num_feats)))

# broadcast
#b_br = sc.broadcast(b)
#w_br = sc.broadcast(w)
#k_br = sc.broadcast(k)
#V_br = sc.broadcast(V)


In [43]:
def train(dataRDD, numIterations=5, learningRate = 0.01):
    
    def predict_grad(pair):
        """
            Compute the predicted probability AND return the gradient (?)
            Args:
                pair - records are in (label, sparse feature set) format
            Broadcast:
                b - bias term (scalar)
                w - linear weight vector (array)
                k - number of factors (def=2)
                V - factor matrix of size (d dimensions, k=2 factors)
            Returns:
                predRDD - pair of ([label, predicted probability], feature set)
        """

        label = pair[0]
        feats = pair[1]

        # start with linear weight dot product
        linear_sum = np.dot(w_br.value[feats.indices], feats.values)

        #linear_sum = 0.0
        #for i in feats.indices:
        #    i = int(i)
        #    linear_sum += w_br.value[i]*feats[i]

        # factor matrix interaction sum
        factor_sum = 0.0
        lh_factor = [0.0]*k_br.value
        rh_factor = [0.0]*k_br.value

        for f in range(0, k_br.value):

            lh_factor[f] = np.dot(V_br.value[f][feats.indices], feats.values)  #KEY--this is used in v_grad matrix below
            rh_factor[f] = np.dot(V_br.value[f][feats.indices]**2, feats.values**2)

            #for i in feats.indices: #old with extra for loop
            #    i = int(i)
            #    lh_factor[f] += V_br.value[i][f]*feats[i]  #KEY--this is used in v_grad matrix below
            #    rh_factor += (V_br.value[i][f]**2) * (feats[i]**2)

            factor_sum += (lh_factor[f]**2 - rh_factor[f])
        factor_sum = 0.5 * factor_sum

        preProb = b_br.value + linear_sum + factor_sum

        prob = 1.0 / (1 + np.exp(-preProb))  #logit transformation

        #compute Gradients
        b_grad = label - prob

        w_grad = csr_matrix((b_grad*feats.values, feats.indices, np.array([0, feats.indices.size])), (1, w_br.value.shape[0]))
        #print(w_grad[(0,5)])

        v_grad = csr_matrix((V_br.value.shape[0], V_br.value.shape[1]))
        for f in range(0, k_br.value):  # WORKING
            for i in feats.indices:
                i = int(i)
                v_grad[(f,i)] = b_grad * (feats[i]*lh_factor[f] - V_br.value[f][i]*(feats[i]**2))
        #print(type(v_grad))


        return ([label, prob], [b_grad, w_grad, v_grad])
    # initialize weights
    np.random.seed(24)

    b = 0.0
    w = np.zeros(num_feats, dtype=np.float16)
    k = 2
    V = np.random.normal(0.0, 0.5, (k, num_feats))

    b_br = sc.broadcast(b)
    w_br = sc.broadcast(w)
    k_br = sc.broadcast(k)
    V_br = sc.broadcast(V)
    #b_br.value
    #w_br.value.shape
    #k_br.value
    #V_br.value.shape
    
    for n in range(numIterations):
        
        predictionRDD = dataRDD.map(predict_grad).cache()
        #([label, prob], [b_grad, w_grad, v_grad])
        

        ############## gradient calculation ##############
        # calculate average gradient for b
        bGrad = predictionRDD.map(lambda x: x[1][0]).mean()

        # calculate average gradient for w
        N = predictionRDD.count()
        #wGrad = (1/N) * predictionRDD.map(lambda x: x[1][1]).reduce(lambda a,b: a+b)
        wGrad = np.multiply((1/N),predictionRDD.map(lambda x: x[1][1]).reduce(lambda a,b: a+b))

        # calculate average gradient for V
        #vGrad = (1/N) * predictionRDD.map(lambda x: x[1][2]).reduce(lambda a,b: a+b)
        vGrad = np.multiply((1/N),predictionRDD.map(lambda x: x[1][2]).reduce(lambda a,b: a+b))

        
        b_br.unpersist()
        w_br.unpersist()
        V_br.unpersist()
        ############## update weights ##############
        ## update
        #b -= learningRate * bGrad
        #w -= learningRate * wGrad
        #V -= learningRate * vGrad
        
        b = np.subtract(b, np.multiply(learningRate, bGrad))
        w = np.subtract(w, np.multiply(learningRate, wGrad))
        V = np.subtract(V, np.multiply(learningRate, vGrad))
        
        print("b:", str(b))

        # re-broadcast
        b_br = sc.broadcast(b)
        w_br = sc.broadcast(w)
        V_br = sc.broadcast(V)
        
        #track loss
        sample_preds = np.array(predictionRDD.map(lambda x: x[0]).take(20))
        sample_loss = log_loss(sample_preds[:,0], sample_preds[:,1], normalize=True)
        #print("sample loss:", str(np.exp(sample_loss)))
        print("sample loss:", str(sample_loss))
        predictionRDD.unpersist()
        
    return predictionRDD

In [44]:
predRDD = train(vectorizedRDD, numIterations=2, learningRate=0.01)

SystemError: <built-in function isinstance> returned a result with an error set

In [45]:
from sklearn.metrics import log_loss

sample_preds = np.array(predRDD.map(lambda x: x[0]).take(20))
sample_loss = log_loss(sample_preds[:,0], sample_preds[:,1], normalize=True)
sample_loss

2.6458644290629403

In [46]:
b

0.0

In [45]:
b_br.value

0.0

In [88]:
w_br.value

array([0., 0., 0., ..., 0., 0., 0.], dtype=float16)