# W261 Final Project ETL for Development Sample

### Notebook Set-Up

In [47]:
# imports
#import time
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
#%reload_ext autoreload
#%autoreload 2

In [3]:
# store path to notebook
#PWD = !pwd
#PWD = PWD[0]

In [4]:
# start Spark Session
from pyspark.sql import SparkSession
app_name = "w261FinalProject"
master = "local[*]"
spark = SparkSession\
        .builder\
        .appName(app_name)\
        .master(master)\
        .getOrCreate()
sc = spark.sparkContext

__`REMINDER:`__ If you are running this notebook on the course docker container, you can monitor the progress of your jobs using the Spark UI at: http://localhost:4040/jobs/

## Load Data

In [5]:
original_trainRDD = sc.textFile('data/train.txt')
original_testRDD = sc.textFile('data/test.txt')

## Parse Training Data

In [6]:
def parse(line):
    """
    Map record_csv_string --> (tuple,of,fields)
    """
    raw_values = line.split('\t')
    label = [int(raw_values[0])]
    numerical_values = list(pd.Series(raw_values[1:14]).apply(pd.to_numeric))
    categorical_values = list([str(idx)+"_MISSINGVALUE" if str(value)=="" else str(idx)+"_"+str(value) for idx,value in enumerate(raw_values[14:])])
    return(numerical_values + categorical_values, label)

## Transformation

### Sample

In [10]:
#change the seed for a different sample
sampleRDD1, sampleRDD2 = original_trainRDD.randomSplit([0.9999,0.0001], seed = 1)
smallSampleRDDCached = sampleRDD2.map(parse).cache()
example = smallSampleRDDCached.take(1)
example

[([nan,
   4.0,
   50.0,
   18.0,
   3339.0,
   20.0,
   26.0,
   17.0,
   133.0,
   nan,
   2.0,
   nan,
   18.0,
   '0_09ca0b81',
   '1_09e68b86',
   '2_86c4b829',
   '3_e3d0459f',
   '4_25c83c98',
   '5_MISSINGVALUE',
   '6_7227c706',
   '7_0b153874',
   '8_a73ee510',
   '9_305a0646',
   '10_9625b211',
   '11_997a695a',
   '12_dccbd94b',
   '13_07d13a8f',
   '14_36721ddc',
   '15_c0b906bb',
   '16_e5ba7672',
   '17_5aed7436',
   '18_21ddcdc9',
   '19_a458ea53',
   '20_0cbbcc92',
   '21_MISSINGVALUE',
   '22_32c7478e',
   '23_0174dd24',
   '24_3d2bedd7',
   '25_d8ecbc17'],
  [0])]

In [None]:
ncol = len(smallSampleRDDCached.first())
nrow = smallSampleRDDCached.count()
print("This sample contains", str(nrow), "rows.")

In [None]:
distinctValsRDD = smallSampleRDDCached.flatMap(lambda line: line[0][13:]).distinct()
print("This sample contains", str(distinctValsRDD.count()), "distinct categorical features.")

In [37]:
def avgFeatures(x):
    """get average number of populated features per row"""
    count = 0
    for feat in x[0]:
        if type(feat) == float and np.isnan(feat) == 0:
            count += 1
        elif type(feat) == str and 'MISSING' not in feat:
            count += 1
    return count

print("There is an average of", str(round(smallSampleRDDCached.map(avgFeatures).mean(),2)), "populated features per observation.")

There is an average of 31.94 populated features per observation.


# Put in wide, sparse feature format

In [87]:
testlist = [0, 1, 2, "cat", "dog"]
list2 = [1,1]
nums = 3
testArray = np.array(testlist)
testArray[0:nums]
testArray2 = np.array(testlist[:nums]+list2)
testArray2
#mat = sparse.csr_matrix(testArray2, testlist)

array([0, 1, 2, 1, 1])

In [11]:
numerical_names = ["n00", "n01", "n02", "n03", "n04", "n05", "n06", "n07", "n08", "n09", "n10", "n11", "n12"]
num_numeric = len(numerical_names)

def makeSparse(x, num_names):
    """make feature list a compressed sparse vector"""
    num_cats = len(num_names)
    names = np.array(num_names+x[num_cats:])
    vals = np.array(x[:len(num_names)]+[1]*num_cats)
    

In [36]:
from pyspark.ml.feature import StringIndexer

df = spark.createDataFrame(
    [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
    ["id", "category"])

indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
indexed = indexer.fit(df).transform(df)
indexed.show()

+---+--------+-------------+
| id|category|categoryIndex|
+---+--------+-------------+
|  0|       a|          0.0|
|  1|       b|          2.0|
|  2|       c|          1.0|
|  3|       a|          0.0|
|  4|       a|          0.0|
|  5|       c|          1.0|
+---+--------+-------------+



### Get and Broadcast All Variable Names

In [None]:
#the variable names for numerical features, also defined below
numerical_names = ["n00", "n01", "n02", "n03", "n04", "n05", "n06", "n07", "n08", "n09", "n10", "n11", "n12"]

In [None]:
distinctNames = distinctValsRDD.collect()
distinctNames = numerical_names + distinctNames
distinctNamesb = sc.broadcast(distinctNames)
print("There are", str(len(distinctNames)), "variables in this sample.")

In [None]:
# part d - helper function to normalize the data (FILL IN THE MISSING CODE BELOW)
def normalize(dataRDD):
    """
    Scale and center data for each feature
    1. standardize to standard normal distribution
    2. add a constant, 'translation_after_standardization', 
       to ensure all values are positive (needed for subsequent log transform)
    
    Args:
        dataRDD - records are a list of features (last position is label)
    Returns:
        normedRDD - records have same format as input rdd
    """
    translation_after_standardization = 5

    numericalFeatures = dataRDD.map(lambda x: list(x[:13])).cache()

    nonNanCounts = numericalFeatures.map(lambda line: 1.0*~np.isnan(line)).reduce(lambda x,y: np.add(x,y))
    nonNanCountsb = sc.broadcast(nonNanCounts)
    
    featureMeans = numericalFeatures.reduce(lambda x,y: np.nansum(np.dstack((x, y)), 2))
    featureMeans = np.divide(featureMeans,nonNanCountsb.value)
    featureMeansb = sc.broadcast(featureMeans)
    
    featureStdev = numericalFeatures.map(lambda line: np.square(np.subtract(line, featureMeansb.value))) \
                                    .reduce(lambda x,y: np.nansum(np.dstack((x, y)), 2))
    featureStdev = np.sqrt(featureStdev/nonNanCountsb.value)
    featureStdevb = sc.broadcast(featureStdev)
    
    normedRDD = dataRDD.map(lambda x: np.add(np.divide(np.subtract(x[:13],featureMeansb.value),featureStdevb.value), translation_after_standardization).tolist()[0] + list(x[13:]))
    
    return normedRDD

In [None]:
#cache normalized data
normedRDD = normalize(smallSampleRDDCached).cache()

### Log Transform

In [None]:
logRDD = normedRDD.map(lambda x: np.log(x[:13]).tolist() + list(x[13:]))

### One-Hot Encoding

In [None]:
#these lists are needed to create the list representation of features
numerical_names = ["n00", "n01", "n02", "n03", "n04", "n05", "n06", "n07", "n08", "n09", "n10", "n11", "n12"]
category_vals = np.ones(24).tolist()

In [None]:
def convert_to_list_form(line):
    """
    Converts the list of variables to a key-value pair format
    
    Args:
        line - one record in the form of a list
        --positions 1-13 are the numerical features
        --the next 26 positions are the categorical features
        --the last position is label)
    Returns:
        pair RDD
        --key: label
        --value: list representation of the feature values (dictionary)
    """
    numbers = line[:13]
    categories = line[13:-1]
    label = line[-1]
    dict1 = dict(zip(numerical_names, numbers))
    dict2 = dict(zip(categories, category_vals))
    return (label, {**dict1, **dict2})

In [None]:
final_data_rdd = logRDD.map(convert_to_list_form)
final_data_rdd.first()

### At this point, the sample of training data is in 'final_data_rdd' and the list of variable names has been broadcasted as 'distinctNamesb'

# Update categorical features with sparse matrix representation

## TO REVISIT: use splits in numerical variables, convert to strings and then use countVectorizer

In [153]:
from pyspark.sql import Row

def parseCV(line):
    """
    Map record_csv_string --> (features, label)
    """
    #vectorizer = CountVectorizer()
    
    col_start = 14
    
    raw_values = line.split('\t')
    label = int(raw_values[0])
    
    # ignore numerics to start
    #numerical_values = list(pd.Series(raw_values[1:14]).apply(pd.to_numeric))
    categories = []
    for idx, value in enumerate(raw_values[col_start:]):
        if value != '':
            categories.append('c'+ str(idx) + '_' + str(value))
    #feats = vectorizer.fit_transform([' '.join(categories)])
    return Row(label=label, feats=' '.join(categories))


#test = '\t'.join(['1', 'cat', 'dog'])
#parsed = parseCV(test)
#parsed

parsedRDD = sampleRDD2.map(parseCV).cache()

In [162]:
from pyspark.sql import DataFrame

parsedDF = spark.createDataFrame(parsedRDD).cache()
parsedDF.select("feats").head(1)[0]

Row(feats='c0_09ca0b81 c1_09e68b86 c2_86c4b829 c3_e3d0459f c4_25c83c98 c6_7227c706 c7_0b153874 c8_a73ee510 c9_305a0646 c10_9625b211 c11_997a695a c12_dccbd94b c13_07d13a8f c14_36721ddc c15_c0b906bb c16_e5ba7672 c17_5aed7436 c18_21ddcdc9 c19_a458ea53 c20_0cbbcc92 c22_32c7478e c23_0174dd24 c24_3d2bedd7 c25_d8ecbc17')

In [164]:
vectorizer = CountVectorizer()

feats = vectorizer.fit_transform(parsedDF.select('feats').rdd.map(lambda x: x[0]).collect())
print(feats.toarray().shape)
#print(vectorizer.get_feature_names())

(4478, 25664)


In [167]:
sparseDF = spark.createDataFrame([parsedDF.select('label'), feats])

AssertionError: dataType should be DataType

In [169]:
print(vectorizer.get_feature_names()[:10])

['c0_05db9164', 'c0_07699494', 'c0_09ca0b81', 'c0_0a16e1d4', 'c0_0c365a37', 'c0_1464facd', 'c0_16a99cfb', 'c0_17f69355', 'c0_18988050', 'c0_1a5f926e']


In [168]:
weights = feats.toarray().shape[1]
weights

25664

In [126]:
from pyspark.mllib.linalg import Matrices

# convert to pyspark SparseMatrix
sparse_matrix = Matrices.sparse(feats.shape[0],feats.shape[1],feats.indptr,feats.indices,feats.data)
type(sparse_matrix)

#sparseRDD = sc.parallelize(feats)

ValueError: Expected colPtrs of size 25665, got 4479.