# W261 Final Project ETL for Development Sample

### Notebook Set-Up

In [26]:
# imports
#import time
import numpy as np
#import pandas as pd

In [27]:
#%reload_ext autoreload
#%autoreload 2

In [28]:
# store path to notebook
#PWD = !pwd
#PWD = PWD[0]

In [29]:
# start Spark Session
from pyspark.sql import SparkSession
app_name = "w261FinalProject"
master = "local[*]"
spark = SparkSession\
        .builder\
        .appName(app_name)\
        .master(master)\
        .getOrCreate()
sc = spark.sparkContext

__`REMINDER:`__ If you are running this notebook on the course docker container, you can monitor the progress of your jobs using the Spark UI at: http://localhost:4040/jobs/

## Load Data

In [30]:
original_trainRDD = sc.textFile('data/train.txt')
original_testRDD = sc.textFile('data/test.txt')

## Parse Training Data

In [31]:
def parse(line):
    """
    Map record_csv_string --> (tuple,of,fields)
    """
    raw_values = line.split('\t')
    label = [int(raw_values[0])]
    numerical_values = list(pd.Series(raw_values[1:14]).apply(pd.to_numeric))
    categorical_values = list([str(idx)+"_MISSINGVALUE" if str(value)=="" else str(idx)+"_"+str(value) for idx,value in enumerate(raw_values[14:])])
    return(numerical_values + categorical_values + label)

## Transformation

### Sample

In [32]:
#change the seed for a different sample
sampleRDD1, sampleRDD2 = original_trainRDD.randomSplit([0.9999,0.0001], seed = 1)
smallSampleRDDCached = sampleRDD2.map(parse).cache()
#smallSampleRDDCached.take(1)

[[nan,
  4.0,
  50.0,
  18.0,
  3339.0,
  20.0,
  26.0,
  17.0,
  133.0,
  nan,
  2.0,
  nan,
  18.0,
  '0_09ca0b81',
  '1_09e68b86',
  '2_86c4b829',
  '3_e3d0459f',
  '4_25c83c98',
  '5_MISSINGVALUE',
  '6_7227c706',
  '7_0b153874',
  '8_a73ee510',
  '9_305a0646',
  '10_9625b211',
  '11_997a695a',
  '12_dccbd94b',
  '13_07d13a8f',
  '14_36721ddc',
  '15_c0b906bb',
  '16_e5ba7672',
  '17_5aed7436',
  '18_21ddcdc9',
  '19_a458ea53',
  '20_0cbbcc92',
  '21_MISSINGVALUE',
  '22_32c7478e',
  '23_0174dd24',
  '24_3d2bedd7',
  '25_d8ecbc17',
  0]]

In [33]:
ncol = len(smallSampleRDDCached.first())
nrow = smallSampleRDDCached.count()
print("This sample contains", str(nrow), "rows.")

This sample contains 4478 rows.


In [34]:
distinctValsRDD = smallSampleRDDCached.flatMap(lambda line: line[13:-1]).distinct()
print("This sample contains", str(distinctValsRDD.count()), "distinct values.")

This sample contains 25676 distinct values.


### Get and Broadcast All Variable Names

In [35]:
#the variable names for numerical features, also defined below
numerical_names = ["n00", "n01", "n02", "n03", "n04", "n05", "n06", "n07", "n08", "n09", "n10", "n11", "n12"]

In [36]:
distinctNames = distinctValsRDD.collect()
distinctNames = numerical_names + distinctNames
distinctNamesb = sc.broadcast(distinctNames)
print("There are", str(len(distinctNames)), "variables in this sample.")

There are 25689 variables in this sample.


In [37]:
# part d - helper function to normalize the data (FILL IN THE MISSING CODE BELOW)
def normalize(dataRDD):
    """
    Scale and center data for each feature
    1. standardize to standard normal distribution
    2. add a constant, 'translation_after_standardization', 
       to ensure all values are positive (needed for subsequent log transform)
    
    Args:
        dataRDD - records are a list of features (last position is label)
    Returns:
        normedRDD - records have same format as input rdd
    """
    translation_after_standardization = 5

    numericalFeatures = dataRDD.map(lambda x: list(x[:13])).cache()

    nonNanCounts = numericalFeatures.map(lambda line: 1.0*~np.isnan(line)).reduce(lambda x,y: np.add(x,y))
    nonNanCountsb = sc.broadcast(nonNanCounts)
    
    featureMeans = numericalFeatures.reduce(lambda x,y: np.nansum(np.dstack((x, y)), 2))
    featureMeans = np.divide(featureMeans,nonNanCountsb.value)
    featureMeansb = sc.broadcast(featureMeans)
    
    featureStdev = numericalFeatures.map(lambda line: np.square(np.subtract(line, featureMeansb.value))) \
                                    .reduce(lambda x,y: np.nansum(np.dstack((x, y)), 2))
    featureStdev = np.sqrt(featureStdev/nonNanCountsb.value)
    featureStdevb = sc.broadcast(featureStdev)
    
    normedRDD = dataRDD.map(lambda x: np.add(np.divide(np.subtract(x[:13],featureMeansb.value),featureStdevb.value), translation_after_standardization).tolist()[0] + list(x[13:]))
    
    return normedRDD

In [38]:
#cache normalized data
normedRDD = normalize(smallSampleRDDCached).cache()

### Log Transform

In [39]:
logRDD = normedRDD.map(lambda x: np.log(x[:13]).tolist() + list(x[13:]))

### One-Hot Encoding

In [40]:
#these lists are needed to create the list representation of features
numerical_names = ["n00", "n01", "n02", "n03", "n04", "n05", "n06", "n07", "n08", "n09", "n10", "n11", "n12"]
category_vals = np.ones(24).tolist()

In [41]:
def convert_to_list_form(line):
    """
    Converts the list of variables to a key-value pair format
    
    Args:
        line - one record in the form of a list
        --positions 1-13 are the numerical features
        --the next 26 positions are the categorical features
        --the last position is label)
    Returns:
        pair RDD
        --key: label
        --value: list representation of the feature values (dictionary)
    """
    numbers = line[:13]
    categories = line[13:-1]
    label = line[-1]
    dict1 = dict(zip(numerical_names, numbers))
    dict2 = dict(zip(categories, category_vals))
    return (label, {**dict1, **dict2})

In [42]:
final_data_rdd = logRDD.map(convert_to_list_form)
final_data_rdd.first()

(0,
 {'n00': nan,
  'n01': 1.5546416330280177,
  'n02': 1.614858026325336,
  'n03': 1.8324388135638439,
  'n04': 1.5658974261919119,
  'n05': 1.5447893046000354,
  'n06': 1.6455071185060295,
  'n07': 1.6722403253403968,
  'n08': 1.6258544971242836,
  'n09': nan,
  'n10': 1.5791945292276104,
  'n11': nan,
  'n12': 1.7750904691093639,
  '0_09ca0b81': 1.0,
  '1_09e68b86': 1.0,
  '2_86c4b829': 1.0,
  '3_e3d0459f': 1.0,
  '4_25c83c98': 1.0,
  '5_MISSINGVALUE': 1.0,
  '6_7227c706': 1.0,
  '7_0b153874': 1.0,
  '8_a73ee510': 1.0,
  '9_305a0646': 1.0,
  '10_9625b211': 1.0,
  '11_997a695a': 1.0,
  '12_dccbd94b': 1.0,
  '13_07d13a8f': 1.0,
  '14_36721ddc': 1.0,
  '15_c0b906bb': 1.0,
  '16_e5ba7672': 1.0,
  '17_5aed7436': 1.0,
  '18_21ddcdc9': 1.0,
  '19_a458ea53': 1.0,
  '20_0cbbcc92': 1.0,
  '21_MISSINGVALUE': 1.0,
  '22_32c7478e': 1.0,
  '23_0174dd24': 1.0})

### At this point, the sample of training data is in 'final_data_rdd' and the list of variable names has been broadcasted as 'distinctNamesb'