In [1]:
import time
import numpy as np
from pyspark.sql.types import StructType, StructField, LongType, FloatType
import pyspark.sql

## Create Spark Session

In [2]:
# start Spark Session
from pyspark.sql import SparkSession
# Application name and master URL handed to the session builder.
app_name = "w261_final"
master = "local[*]"

# getOrCreate() hands back an already-running session when there is one.
spark = (
    SparkSession.builder
    .appName(app_name)
    .master(master)
    .getOrCreate()
)
sc = spark.sparkContext

## Read in Data, Convert Hex to Int and Create DF

In [3]:
# Load the sample file as an RDD of raw text lines; parsing happens in the next cell.
sampleRDD = sc.textFile('data/trainpt1percentsample.txt')

In [4]:
#split data and convert hex to int
def ConvertNumber(idx, num):
    """Parse one tab-separated token according to its column position.

    Args:
        idx: zero-based position of the token within the record.
        num: raw string token; an empty string marks a missing value.

    Returns:
        None for an empty token; otherwise int(num) for column 0,
        int(num, 16) for columns past 13, and float(num) for the rest.
    """
    # Missing values arrive as empty strings.
    if num == '':
        return None
    # Column 0 is parsed as a base-10 integer.
    if idx == 0:
        return int(num)
    # Columns past 13 are parsed as hexadecimal.
    if idx > 13:
        return int(num, 16)
    return float(num)

# Split each line on tabs and convert every token by column position; keep the parsed records cached.
trainRDD = sampleRDD.map(
    lambda line: [ConvertNumber(position, token)
                  for position, token in enumerate(line.split('\t'))]
).cache()

In [5]:
# Schema: field_0 as a long, field_1..field_13 as floats, field_14..field_39 as longs; all nullable.
structFieldList = [StructField('field_0', LongType(), True)]
structFieldList += [StructField(f'field_{num}', FloatType(), True) for num in range(1, 14)]
structFieldList += [StructField(f'field_{num}', LongType(), True) for num in range(14, 40)]
schema = StructType(structFieldList)

# 80/20 split with a fixed seed, then one DataFrame per split.
# NOTE: trainRDD is rebound here, so re-running this cell on its own splits the
# already-split training set again; re-run from the parsing cell instead.
trainRDD, testRDD = trainRDD.randomSplit([0.8, 0.2], seed=1)
trainDF = spark.createDataFrame(trainRDD, schema)
testDF = spark.createDataFrame(testRDD, schema)

In [6]:
# Start the clock to see how long the following cells take to run
startTime = time.time()

## Drop unnecessary Cols

In [7]:
# Columns removed from both splits before any further processing.
fields_to_drop = ['field_1','field_3','field_4','field_6','field_10',
    'field_12','field_13','field_16','field_17','field_19','field_20',
    'field_23','field_25','field_29','field_32','field_33','field_34',
    'field_35','field_37','field_38','field_39']

# DataFrame.drop accepts several column names, so one call per frame replaces
# the per-column loop (and no loop variable is left behind in the namespace).
trainDF = trainDF.drop(*fields_to_drop)
testDF = testDF.drop(*fields_to_drop)

In [8]:
# Set the checkpoint used by later per-stage timings, then report the time elapsed since startTime.
new_time = time.time()
print('The total time was: {} seconds'.format(time.time() - startTime))

The total time was: 1.217175006866455 seconds


## Impute Mean for Continuous Variables

In [9]:
def imputeWithMean(field, traindf,testdf):
    """Fill nulls in one column with that column's training-set mean.

    The mean is computed from the non-null values of `traindf` only and the
    same value is then used to fill both frames.

    Args:
        field: name of the column to impute.
        traindf: DataFrame the mean is computed from.
        testdf: DataFrame filled with the mean computed from `traindf`.

    Returns:
        (traindf, testdf) with nulls in `field` replaced by the mean.
    """
    # Identity comparison is the idiomatic None test.
    observed = traindf.rdd.map(lambda row: row[field]).filter(lambda value: value is not None)
    fieldMean = observed.mean()
    return traindf.fillna(fieldMean, [field]), testdf.fillna(fieldMean, [field])

In [10]:
# Columns whose nulls are replaced by the training-set mean.
fields_to_impute = ['field_5','field_7','field_8','field_9','field_11']

# Each pass rebinds both frames; imputeWithMean computes the mean from trainDF only.
for m in fields_to_impute:
    trainDF,testDF = imputeWithMean(m, trainDF,testDF)

In [11]:
# Time elapsed since the previous checkpoint (new_time), then move the checkpoint forward.
print('The total time was: {} seconds'.format(time.time() - new_time))
new_time = time.time()

The total time was: 18.358896493911743 seconds


## Drop Categorical Observations that are Null
- Currently, dropping all categorical columns that have any Nulls 

## Scale Continuous Data
- Should we convert one-hot and then scale all variables?

In [12]:
def scaleRow(row):
    """Standardize the fitted columns of a single row.

    For every column registered in the global `scaleDict` (column name ->
    (mean, stddev)), the value is replaced by (value - mean) / stddev.
    Columns that are not in `scaleDict` are passed through untouched.

    Args:
        row: object exposing `asDict()`.

    Returns:
        A new `pyspark.sql.Row` built from the updated column dictionary.
    """
    values = row.asDict()

    # Subtract the mean, then divide by the standard deviation.
    for name, stats in scaleDict.items():
        mean, stddev = stats[0], stats[1]
        values[name] = float(values[name] - mean) / stddev

    return pyspark.sql.Row(**values)

In [13]:
def scaleDataFrame_fit(fields, df):
    """Record the mean and standard deviation of each listed column.

    Results are written into the global `scaleDict` as
    column name -> (mean, stddev), both converted with float().

    Args:
        fields: list of column names to compute statistics for.
        df: DataFrame the statistics are computed from.
    """
    # Note: Need to rename the 'summary' column, because using it in the filter statement tries to invoke the function
    summaryDF = df.select(fields).summary(['mean', 'stddev']).withColumnRenamed('summary', 'summary_col').cache()

    try:
        meanRow = summaryDF.filter(summaryDF.summary_col == 'mean').first()
        stddevRow = summaryDF.filter(summaryDF.summary_col == 'stddev').first()
    finally:
        # The summary is only needed for the two lookups above; release the
        # cached copy instead of leaving it pinned for the whole session.
        summaryDF.unpersist()

    for field in fields:
        scaleDict[field] = (float(meanRow[field]), float(stddevRow[field]))

In [14]:
def scaleDataFrame_transform(df):
    """Apply `scaleRow` to every row of `df` and return the result as a new DataFrame."""
    scaledRows = df.rdd.map(scaleRow)
    return scaledRows.toDF()

In [15]:
# Global lookup filled by scaleDataFrame_fit and read by scaleRow: column name -> (mean, stdDev)
scaleDict = {}

# Save original column order
originalColumnOrderTrain = trainDF.columns

# Statistics come from trainDF only; the same scaling is then applied to both splits.
scaleDataFrame_fit(['field_2','field_5','field_7','field_8','field_9','field_11'], trainDF)
trainDF = scaleDataFrame_transform(trainDF)
testDF = scaleDataFrame_transform(testDF)

# Reset the original column order
# (presumably because rebuilding rows via Row(**dict) can reorder columns - TODO confirm)
trainDF = trainDF.select(originalColumnOrderTrain)
testDF = testDF.select(originalColumnOrderTrain)

In [16]:
# Time elapsed since the previous checkpoint (new_time), then move the checkpoint forward.
print('The total time was: {} seconds'.format(time.time() - new_time))
new_time = time.time()

The total time was: 6.388637542724609 seconds


## One-hot Categorical Vars

In [17]:
def convertRowToArray(row):
    """Turn one row into a (features, label) pair.

    Columns are visited in the order returned by `row.asDict()`:
      * a column registered in the global `valueDict` is one-hot encoded into
        a block of `cardinalityDict[column]` slots; a value that is not among
        the fitted categories (rare/unknown) leaves the block all zeros;
      * `field_0` is the label and contributes no feature;
      * any other column is appended unchanged.

    Args:
        row: object exposing `asDict()`.

    Returns:
        (X, Y): X is a 1-D numpy array of features, Y is the value of field_0.

    Raises:
        ValueError: if the row has no `field_0` column.
    """
    rowDict = row.asDict()

    # Fail with a clear message instead of an UnboundLocalError at return time.
    if 'field_0' not in rowDict:
        raise ValueError("row has no 'field_0' label column")
    Y = rowDict['field_0']

    # Collect one piece per column and concatenate once at the end; np.append
    # inside the loop re-copies the whole vector for every column. Seeding the
    # list with an empty float array keeps the result dtype the same as
    # growing from np.array([]).
    pieces = [np.array([])]

    for field, value in rowDict.items():

        # If the field is categorical
        if field in valueDict:
            ohe = np.zeros(cardinalityDict[field])

            # Look up the one-hot index for this value; None means the value
            # is not one of the fitted categories, so the block stays zero.
            index = valueDict[field].get(value)
            if index is not None:
                ohe[index] = 1
            pieces.append(ohe)

        # The label was already extracted above
        elif field == 'field_0':
            continue

        # If the field is not categorical, then just use the existing value
        else:
            pieces.append(np.ravel(value))

    return (np.concatenate(pieces), Y)

In [18]:
def ohe_fit(field, topN, df):
    """Register the `topN` most frequent values of a column for one-hot encoding.

    Stores, in the global `valueDict[field]`, a mapping from each retained
    value to its one-hot index; index 0 goes to the most frequent value.

    Args:
        field: name of the column to fit.
        topN: number of most frequent values to keep.
        df: DataFrame whose rows are counted.
    """
    # Count how often each distinct value occurs in the column.
    counts = df.rdd.map(lambda record: (record[field], 1)).reduceByKey(lambda a, b: a + b)

    # Highest count first; negating the count turns the ascending order around.
    mostCommon = counts.takeOrdered(topN, key=lambda pair: -pair[1])

    # Position in the ranking doubles as the one-hot index.
    valueDict[field] = {pair[0]: position for position, pair in enumerate(mostCommon)}

In [19]:
def ohe_transform(df):
    """Map every row of `df` through `convertRowToArray`, returning the mapped RDD."""
    return df.rdd.map(lambda record: convertRowToArray(record))

In [20]:
# Global lookups read by convertRowToArray:
#   valueDict:       column name -> {category value -> one-hot index} (filled by ohe_fit)
#   cardinalityDict: column name -> width of that column's one-hot block
valueDict = {}
cardinalityDict = {}

In [21]:
# (column name, number of most frequent categories to keep) for each column to one-hot encode
oheFieldsAndSizes = [('field_14', 10), ('field_15', 10), ('field_18', 10), ('field_21', 10), ('field_22', 3), ('field_24', 10), ('field_26', 10), ('field_27', 10), ('field_28', 10), ('field_30', 10), ('field_31', 10), ('field_36', 10)]

In [22]:
# Fit every categorical column on the training split only
for fieldAndSize in oheFieldsAndSizes:

    fieldToEncode = fieldAndSize[0]
    # The one-hot block width equals the number of categories kept
    cardinalityDict[fieldToEncode] = fieldAndSize[1]
    
    # Fills valueDict[fieldToEncode] with the most frequent values and their indices
    ohe_fit(fieldToEncode, fieldAndSize[1], trainDF)

In [23]:
# Convert both splits to RDDs of (feature array, label); trainRDD and testRDD are rebound here
trainRDD = ohe_transform(trainDF)
testRDD = ohe_transform(testDF)

In [24]:
# First line: time elapsed since the previous checkpoint (new_time).
print('The total time was: {} seconds'.format(time.time() - new_time))
new_time = time.time()
# Second line: time elapsed since startTime, i.e. all preprocessing cells so far.
# NOTE(review): ohe_transform only sets up an RDD map; if that map is evaluated
# lazily, the encoding cost is not part of these figures - confirm.
print('The total time was: {} seconds'.format(time.time() - startTime))

The total time was: 33.7352979183197 seconds
The total time was: 59.70392990112305 seconds


## Run the Model

In [25]:
def logloss(dataRDD, W):
    """
    Mean log loss of the model W over an RDD of (np.array features, label).

    A bias feature of 1.0 is prepended to every feature vector, so W must
    carry the bias weight at index 0. Predicted probabilities are clipped to
    [1e-15, 1 - 1e-15] before taking the logarithm.
    """
    def recordLoss(record):
        # Prepend the bias term, then score the record.
        features, label = np.append([1.0], record[0]), record[1]
        prob = np.clip(1 / (1 + np.exp(-1*W @ features)), 1e-15, 1.0 - 1e-15)
        # Positive records are penalised on p, all others on 1 - p.
        if label == 1.0:
            return -1*np.log(prob)
        return -1*np.log(1 - prob)

    return dataRDD.map(recordLoss).mean()

In [26]:
def GDUpdate(dataRDD, W, learningRate = 0.05, regType = None, regParam = 0):
    """
    Perform one gradient descent step/update.
    Args:
        dataRDD      - records are tuples of (features_array, y)
        W            - (array) model coefficients with bias at index 0
        learningRate - (float) step size applied to the gradient
        regType      - (str) 'l1' (lasso), 'l2' (ridge) or None for no
                       regularization; any other value is ignored
        regParam     - (float) regularization strength
    Returns:
        new_model - (array) updated coefficients, bias at index 0
    """
    # add a bias 'feature' of 1 at index 0; the RDD feeds a single action
    # below, so it is not cached
    augmentedData = dataRDD.map(lambda x: (np.append([1.0], x[0]), x[1]))

    # use negative log likelihood to keep it descent
    grad = augmentedData.map(lambda x: -x[0]*(x[1] - (1/(1+np.exp(-1*np.dot(W, x[0])))))).mean()

    # The bias (index 0) is never regularized.
    if regType == 'l2':
        # ridge: gradient of regParam * sum(w^2)
        grad[1:] += 2 * regParam * W[1:]
    elif regType == 'l1':
        # lasso: subgradient of regParam * sum(|w|)
        grad[1:] += regParam * np.sign(W[1:])

    new_model = W - learningRate * grad

    return new_model

In [27]:
def parse(line):
    """
    Convert one tab separated record into a (features, label) tuple.

    Every token is converted with ConvertNumber according to its position;
    the first value is the label and the remaining values are the features.
    """
    tokens = line.split('\t')
    values = np.array([ConvertNumber(position, token) for position, token in enumerate(tokens)])
    return (values[1:], values[0])

In [28]:
# Warm-start weights copied from the final step of a previous run.
# Index 0 is the bias weight: logloss and GDUpdate prepend a 1.0 feature to every record.
W = np.array([-0.714, 0.372, -0.016, 0.3, 0.367, 0.353, 0.568, -0.086, 0.286, 0.662, 0.57, 0.103, 0.01, 0.108, 0.201, 0.507, 0.556, -0.003, 0.128, 0.712, 0.279, 0.086, 0.653, 0.526, 0.772, 0.02, 0.859, -0.35, 0.344, 0.422, 0.004, 0.639, 0.452, 0.639, 0.286, 0.53, 0.824, -0.361, -0.087, 0.189, 0.361, 0.889, 0.51, 0.806, 0.497, 0.116, 0.423, -0.725, 0.693, 0.006, 0.693, 0.761, 0.582, 0.744, 0.311, 0.273, 0.202, 0.506, 0.373, 0.449, 0.775, 0.392, 0.497, 0.779, 0.522, 0.201, 0.133, 0.547, 0.286, 0.271, 0.376, 0.036, -0.006, 0.787, -0.001, 0.64, 0.222, 0.593, 0.627, 0.899, 0.168, 0.675, 0.675, 0.806, 0.056, 0.08, 0.309, 0.757, 0.742, 0.099, -0.295, 0.615, 0.268, 0.363, 0.439, 0.22, 0.792, 0.669, 0.504, 0.329, 0.888, 0.824, 0.442, 0.423, 0.307, 0.858, 0.08, 0.589, -0.008, 0.929, 0.407, 0.26, 0.69, 0.238, 0.58, 0.096, 0.77, 0.074, 0.807, 0.094])

In [29]:
# Train the logistic regression, starting from the weights W defined in the previous cell
###W = np.random.rand(120,) #only use if don't want to use model from prev run
# Baseline: report the starting weights and their loss on both splits
print(f"Initial Model: {[round(w,3) for w in W]}")
print(f"Initial Train Loss = {logloss(trainRDD,W)}")
print(f"Initial Held-Out Loss = {logloss(testRDD,W)}")
n_iterations = 2
for i in range(n_iterations):
    print("-----------------------------------------------------")
    print("iteration # {}".format(i))
    # One gradient step on the training split; W is rebound, so re-running this cell continues from the last result
    W = GDUpdate(trainRDD, W, learningRate = .1, regType = 'l2', regParam = 0.1)
    print(f"New Model: {[round(w,3) for w in W]}")
    # Each loss report is a separate pass over the corresponding RDD
    print(f"Train Loss = {logloss(trainRDD,W)}")
    print(f"Held-Out Loss = {logloss(testRDD,W)}")

Initial Model: [-0.714, 0.372, -0.016, 0.3, 0.367, 0.353, 0.568, -0.086, 0.286, 0.662, 0.57, 0.103, 0.01, 0.108, 0.201, 0.507, 0.556, -0.003, 0.128, 0.712, 0.279, 0.086, 0.653, 0.526, 0.772, 0.02, 0.859, -0.35, 0.344, 0.422, 0.004, 0.639, 0.452, 0.639, 0.286, 0.53, 0.824, -0.361, -0.087, 0.189, 0.361, 0.889, 0.51, 0.806, 0.497, 0.116, 0.423, -0.725, 0.693, 0.006, 0.693, 0.761, 0.582, 0.744, 0.311, 0.273, 0.202, 0.506, 0.373, 0.449, 0.775, 0.392, 0.497, 0.779, 0.522, 0.201, 0.133, 0.547, 0.286, 0.271, 0.376, 0.036, -0.006, 0.787, -0.001, 0.64, 0.222, 0.593, 0.627, 0.899, 0.168, 0.675, 0.675, 0.806, 0.056, 0.08, 0.309, 0.757, 0.742, 0.099, -0.295, 0.615, 0.268, 0.363, 0.439, 0.22, 0.792, 0.669, 0.504, 0.329, 0.888, 0.824, 0.442, 0.423, 0.307, 0.858, 0.08, 0.589, -0.008, 0.929, 0.407, 0.26, 0.69, 0.238, 0.58, 0.096, 0.77, 0.074, 0.807, 0.094]
Initial Train Loss = 0.8254698020736607
Initial Held-Out Loss = 0.8245669609649684
-----------------------------------------------------
iteration #