# W261 Final Project ETL for Development Sample

### Notebook Set-Up

In [26]:
# imports
import time
import numpy as np
import pandas as pd
from pyspark.sql import Row
from pyspark.ml.feature import CountVectorizer
from pyspark.sql import DataFrame

In [27]:
%reload_ext autoreload
%autoreload 2

In [28]:
# store path to notebook
#PWD = !pwd
#PWD = PWD[0]

In [29]:
# start Spark Session
from pyspark.sql import SparkSession

# application name and master URL used to configure the session
app_name = "w261FinalProject"
master = "local[*]"

# build the session (getOrCreate returns an already-running one if present)
spark = (
    SparkSession.builder
    .appName(app_name)
    .master(master)
    .getOrCreate()
)

# handle to the underlying SparkContext, used for the RDD API below
sc = spark.sparkContext

__`REMINDER:`__ If you are running this notebook on the course docker container, you can monitor the progress of your jobs using the Spark UI at: http://localhost:4040/jobs/

## Load Data

In [30]:
# raw train and test files, read as RDDs of text lines
# (each line is later split on tabs by the parsing functions)
original_trainRDD = sc.textFile('data/train.txt')
original_testRDD = sc.textFile('data/test.txt')

## Transformation

### Sample

In [31]:
# Change the seed for a different sample.
# Split the training RDD with weights 0.9999 / 0.0001; the small second part
# (sampleRDD2) is the development sample and is cached for reuse.
sampleRDD1, sampleRDD2 = original_trainRDD.randomSplit([0.9999,0.0001], seed = 1)
sampleRDD2.cache()

PythonRDD[28] at RDD at PythonRDD.scala:49

In [32]:
# row count needs a full pass over the sample; the column count is read
# from the number of tab-separated fields in the first record
nrow = sampleRDD2.count()
ncol = len(sampleRDD2.take(1)[0].split('\t'))
print("This sample contains", nrow, "rows.")

This sample contains 4478 rows.


In [33]:
# report the number of tab-separated fields found in the first record
print("This sample contains", ncol, "columns.")

This sample contains 40 columns.


In [34]:
def avgFeatures(line):
    """
    Count the populated feature fields in one tab-separated record.

    The first field is skipped and is not counted; every later field that
    is not the empty string contributes one to the count.

    Args:
        line - one raw record as a tab-delimited string
    Returns:
        count - int, number of non-empty fields after the first one
    """
    fields = line.split('\t')
    return sum(1 for field in fields[1:] if field != '')

# Run the per-record count and the mean once, then reuse the result for both
# the printed summary and the cell's displayed value (previously the same
# Spark job was executed twice).
avgPopulated = sampleRDD2.map(avgFeatures).mean()
print("There is an average of", str(round(avgPopulated,2)), "populated features per observation.")
avgPopulated

There is an average of 33.53 populated features per observation.


33.5288075033497

### Put in wide, sparse feature format

In [35]:
# function to parse raw data and tag feature values with type and feature indices
def parseCV(line):
    """
    Map one raw tab-separated record --> Row(label, raw)

    Field 0 is read as the integer label. Fields 1..13 are tagged as
    numeric tokens ('n<idx>_<value>') and fields 14 onward as categorical
    tokens ('c<idx>_<value>'), where <idx> restarts at 0 within each
    group. Empty fields are dropped.

    Args:
        line - one raw record as a tab-delimited string
    Returns:
        Row with label (int) and raw (list of token strings, numeric
        tokens first, then categorical tokens)
    """
    # position of the first categorical field within the record
    first_categorical = 14

    fields = line.split('\t')
    label = int(fields[0])

    # tag every populated numeric field with its position in the numeric group
    numeric_tokens = [
        'n{}_{}'.format(pos, val)
        for pos, val in enumerate(fields[1:first_categorical])
        if val != ''
    ]

    # tag every populated categorical field with its position in the categorical group
    category_tokens = [
        'c{}_{}'.format(pos, val)
        for pos, val in enumerate(fields[first_categorical:])
        if val != ''
    ]

    return Row(label=label, raw=numeric_tokens + category_tokens)

# function to one hot encode all features using a count vectorizer
def vectorizeCV(DF):
    """
    Encode the token lists in column "raw" as count vectors in column "features".

    A CountVectorizer is fitted on DF and then applied to the same DF.

    Args:
        DF - DataFrame with a "raw" column holding lists of string tokens
    Returns:
        result - DF with an added "features" column produced by the fitted model
    """
    # single estimator: reads "raw", writes "features"
    cv = CountVectorizer(inputCol="raw", outputCol="features")

    # fit on the data, then transform that same data
    model = cv.fit(DF)
    result = model.transform(DF)

    return result

# call functions: parse each sampled record into a Row, turn the result into
# a cached DataFrame, then fit and apply the count vectorizer to it
parsedDF = sampleRDD2.map(parseCV).toDF().cache()
vectorizedDF = vectorizeCV(parsedDF)

In [36]:
# examine transformed data: label, raw token list, and the vectorized
# features column (long cell values are truncated in the display)
vectorizedDF.show(truncate=True)

+-----+--------------------+--------------------+
|label|                 raw|            features|
+-----+--------------------+--------------------+
|    0|[n1_4, n2_50, n3_...|(30946,[0,1,2,4,5...|
|    0|[n1_12, n2_20, n3...|(30946,[0,1,2,5,1...|
|    1|[n1_1, n2_1, n4_9...|(30946,[0,1,6,7,1...|
|    0|[n0_8, n1_17, n3_...|(30946,[0,1,4,12,...|
|    1|[n0_6, n1_1, n2_7...|(30946,[0,1,2,4,1...|
|    1|[n1_99, n2_1, n3_...|(30946,[1,2,4,10,...|
|    0|[n0_3, n1_21, n2_...|(30946,[0,1,4,8,1...|
|    0|[n1_2, n2_20, n3_...|(30946,[0,1,3,5,8...|
|    0|[n0_0, n1_144, n4...|(30946,[0,2,3,4,5...|
|    0|[n1_0, n2_5, n4_3...|(30946,[0,2,3,6,1...|
|    0|[n0_0, n1_1, n2_4...|(30946,[0,1,2,3,5...|
|    0|[n0_9, n1_5, n2_1...|(30946,[0,2,3,6,9...|
|    0|[n1_323, n2_2, n3...|(30946,[1,2,14,16...|
|    0|[n0_0, n1_424, n3...|(30946,[0,1,2,4,6...|
|    0|[n0_0, n1_13, n2_...|(30946,[0,1,2,5,6...|
|    0|[n1_180, n2_6, n3...|(30946,[1,2,8,14,...|
|    0|[n1_126, n2_2, n3...|(30946,[0,2,4,6,8...|


In [89]:
# k: number of factors (columns of V)
k = 2
# n: feature dimension, read from the size of the first row's feature vector
n = vectorizedDF.select('features').head()[0].size

# initial parameters: scalar bias b, length-n weight vector w, n-by-k factor matrix V
b = 1
w = np.zeros(n)
V = np.zeros((n,k))


def updateSGD(dataRDD, b, w, V, learningRate = 0.1):
    """
    Apply one gradient-descent step to the model parameters.

    NOTE: the gradients are still placeholders and are NOT computed from
    dataRDD yet: the bias gradient is fixed at 1, the linear gradient is
    the current weight vector, and the factor gradient is 0.
    TODO: replace the placeholders with gradients of the loss over dataRDD.

    Args:
        dataRDD - training records (currently unused)
        b - bias term (scalar)
        w - linear weight vector (array)
        V - factor matrix (array)
        learningRate - step size (def=0.1)
    Returns:
        (b, w, V) - updated parameters; new arrays are returned and the
                    arrays passed in are left unmodified
    """
    # placeholder gradients (see NOTE above)
    bGrad = 1
    wGrad = w
    vGrad = 0.0

    # out-of-place updates so the caller's arrays are not mutated
    b = b - learningRate * bGrad
    w = w - learningRate * wGrad
    V = V - learningRate * vGrad

    return b, w, V

# (label, features) tuples built from the vectorized DataFrame, then one
# parameter update with the default learning rate
pairRDD = vectorizedDF.select(['label','features']).rdd.map(tuple)
b, w, V = updateSGD(pairRDD, b, w, V)

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       ...,
       [0., 0.],
       [0., 0.],
       [0., 0.]])

In [98]:
# rebuild the (label, features) pairs and look at the first feature vector
pairRDD = vectorizedDF.select(['label','features']).rdd.map(tuple)
pairRDD.map(lambda pair: pair[1]).take(1)

[SparseVector(30946, {0: 1.0, 1: 1.0, 2: 1.0, 4: 1.0, 5: 1.0, 7: 1.0, 10: 1.0, 20: 1.0, 32: 1.0, 122: 1.0, 155: 1.0, 173: 1.0, 214: 1.0, 364: 1.0, 369: 1.0, 495: 1.0, 504: 1.0, 632: 1.0, 635: 1.0, 832: 1.0, 1893: 1.0, 2105: 1.0, 2199: 1.0, 2257: 1.0, 2777: 1.0, 6258: 1.0, 11801: 1.0, 13281: 1.0, 14212: 1.0, 18956: 1.0, 23964: 1.0, 25490: 1.0, 25627: 1.0, 27452: 1.0})]

In [78]:
def calcLoss(pair):
    """
        Compute the squared-error loss of one record for the current model weights.

        The prediction is
            b + sum_i w[i]*x_i
              + 0.5 * sum_f [ (sum_i V[i][f]*x_i)**2 - sum_i V[i][f]**2 * x_i**2 ]
        where the sums over i run over the non-zero entries of the record only.
        To get the mean loss over a data set, average the per-record results,
        e.g. pairRDD.map(calcLoss).mean().

        Args:
            pair - record in (label, sparse feature set) format; the feature
                   set must expose parallel `indices` and `values` arrays
        Broadcast:
            b - bias term (scalar)
            w - linear weight vector (array)
            k - number of factors (def=2)
            V - factor matrix of size (d dimensions, k=2 factors)
        Returns:
            loss - float, (prediction - label)**2 for this record
    """
    label, features = pair[0], pair[1]

    # `values` is positional: values[j] belongs to feature index indices[j],
    # so the two arrays are walked together instead of indexing values by
    # the feature index
    nonzero = list(zip(features.indices, features.values))

    # start with linear weight dot product
    linear_sum = 0.0
    for i, x in nonzero:
        linear_sum += w[i] * x

    # factor matrix interaction sum
    factor_sum = 0.0
    for f in range(0, k):

        # both partial sums are per factor, so they restart for every f
        lh_factor = 0.0
        rh_factor = 0.0
        for i, x in nonzero:
            lh_factor += V[i][f] * x
            rh_factor += (V[i][f]**2) * (x**2)

        factor_sum += (lh_factor**2 - rh_factor)

    factor_sum = 0.5 * factor_sum
    pred = b + linear_sum + factor_sum

    loss = float((pred - label)**2)

    return loss

In [83]:
# rebuild the (label, features) tuples and display the first record
pairRDD = vectorizedDF.select(['label','features']).rdd.map(tuple)
pairRDD.take(1)

[(0,
  SparseVector(30946, {0: 1.0, 1: 1.0, 2: 1.0, 4: 1.0, 5: 1.0, 7: 1.0, 10: 1.0, 20: 1.0, 32: 1.0, 122: 1.0, 155: 1.0, 173: 1.0, 214: 1.0, 364: 1.0, 369: 1.0, 495: 1.0, 504: 1.0, 632: 1.0, 635: 1.0, 832: 1.0, 1893: 1.0, 2105: 1.0, 2199: 1.0, 2257: 1.0, 2777: 1.0, 6258: 1.0, 11801: 1.0, 13281: 1.0, 14212: 1.0, 18956: 1.0, 23964: 1.0, 25490: 1.0, 25627: 1.0, 27452: 1.0}))]