In [1]:
import re
import ast
import time
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Sampled training file, and the column names of one record: the label
# followed by thirteen 'I' columns (I1..I13) and twenty-six 'C' columns (C1..C26).
DATAFILE = 'trainpt1percentsample.txt'
FIELDS = ['Label', *(f'I{n}' for n in range(1, 14)), *(f'C{n}' for n in range(1, 27))]

In [3]:
# Start (or reuse) a Spark session and expose its SparkContext as `sc`.
from pyspark.sql import SparkSession
app_name = "hw3_notebook"
master = "local[*]"  # local mode, as many worker threads as logical cores
spark = (
    SparkSession.builder
    .appName(app_name)
    .master(master)
    .getOrCreate()
)
sc = spark.sparkContext

ModuleNotFoundError: No module named 'pyspark'

In [23]:
# Read the raw text records and split them roughly 80/20 into training and
# held-out sets; the fixed seed keeps the split repeatable.
data = sc.textFile(DATAFILE)
trainRDD, heldOutRDD = data.randomSplit(weights=[0.8, 0.2], seed=1)

In [81]:
def ConvertNumber(idx, num):
    """
    Turn one raw text field into a number.

    Args:
        idx - (int) position of the field within its record
        num - (str) raw text of the field
    Returns:
        np.nan when the field is empty; otherwise the field read as a
        hexadecimal integer when idx > 13 and as a decimal integer when not.
    """
    # empty fields stand for missing values
    if num == '':
        return np.nan
    # positions past 13 hold hex hashes, the rest plain integers
    return int(num, 16) if idx > 13 else int(num)

def parse(line):
    """
    Convert one tab separated record into a (features, label) tuple.

    Args:
        line - (str) one raw record, fields separated by tabs
    Returns:
        (features, label) - every field is converted with ConvertNumber and
        collected into one numpy array; the label is the first element and
        the features are all remaining elements.
    """
    fields = line.split('\t')
    values = np.array([ConvertNumber(position, text) for position, text in enumerate(fields)])
    return (values[1:], values[0])

def logloss(dataRDD, W):
    """
    Mean log loss of the logistic model W over dataRDD.

    Args:
        dataRDD - records are tuples of (features_array, y)
        W       - (array) model coefficients with bias at index 0
    Returns:
        mean over all records of -log(p) when y == 1.0 and -log(1 - p)
        otherwise, where p is the predicted probability clipped to
        [1e-15, 1 - 1e-15] so the logarithm stays finite.
    """
    def addBias(record):
        # prepend the constant 1.0 that pairs with the bias at W[0]
        return (np.append([1.0], record[0]), record[1])

    def predict(record):
        probability = 1 / (1 + np.exp(-1*W @ record[0]))
        return (np.clip(probability, 1e-15, 1.0 - 1e-15), record[1])

    def recordLoss(scored):
        if scored[1] == 1.0:
            return -1*np.log(scored[0])
        return -1*np.log(1-scored[0])

    return dataRDD.map(addBias).map(predict).map(recordLoss).mean()

In [43]:
# Parse every training line into a (features, label) tuple and mark the
# parsed records for caching, since later cells read them repeatedly.
trainRDDCached = (
    trainRDD
    .map(parse)
    .cache()
)

In [79]:
# test a regression on a constant: replace the features with an empty array
# and keep only the label, so the bias is the only coefficient left to fit
onefeature_trainrdd = trainRDDCached.map(lambda record: (np.array([]), record[1]))

In [82]:
# baseline with best possible constant value: the bias whose sigmoid equals
# the mean training label
meanLabel = onefeature_trainrdd.map(lambda record: record[1]).mean()
BASELINE = np.array([-1*np.log(1 / meanLabel - 1)])
logloss(onefeature_trainrdd, BASELINE)

0.5663501091618366

In [None]:
# Can gradient descent recover this baseline bias from a different starting point?

In [94]:
def GDUpdate(dataRDD, W, learningRate = 0.0001):
    """
    Perform one gradient descent step/update.
    Args:
        dataRDD      - records are tuples of (features_array, y)
        W            - (array) model coefficients with bias at index 0
        learningRate - (float) step size applied to the summed gradient
    Returns:
        new_model - (array) updated coefficients, bias at index 0
    """
    # add a bias 'feature' of 1 at index 0.  Deliberately NOT cached: this RDD
    # feeds exactly one action below, so caching it would only pin another
    # copy of the data in executor memory on every call without any reuse.
    augmentedData = dataRDD.map(lambda x: (np.append([1.0], x[0]), x[1]))

    # per-record x * (y - sigmoid(W.x)), summed (not averaged) over all records
    grad = augmentedData.map(lambda x: x[0]*(x[1] - (1/(1+np.exp(-1*np.dot(W, x[0])))))).sum()
    # grad is the negative gradient of the log loss, so adding it descends the loss
    new_model = W + learningRate * grad

    return new_model

In [96]:
# Fit the bias-only model by gradient descent, starting from a bias of 1, and
# report each step next to the baseline loss and model printed first.
nSteps = 5
model = np.array([1])
print(f"Target Loss = {logloss(onefeature_trainrdd,BASELINE)}",
      f"Target Model = {BASELINE}", sep="\n")
for idx in range(nSteps):
    print("----------", f"STEP: {idx+1}", sep="\n")
    model = GDUpdate(onefeature_trainrdd, model)
    loss = logloss(onefeature_trainrdd, model)
    print(f"Loss: {loss}", f"Model: {[round(w,3) for w in model]}", sep="\n")

Target Loss = 0.5663501091618366
Target Model = [-1.07904134]
----------
STEP: 1
Loss: 0.5768600254954287
Model: [-0.754]
----------
STEP: 2
Loss: 0.5669876111570928
Model: [-0.998]
----------
STEP: 3
Loss: 0.5664033829314044
Model: [-1.055]
----------
STEP: 4
Loss: 0.5663548984651946
Model: [-1.072]
----------
STEP: 5
Loss: 0.5663505487092614
Model: [-1.077]


In [103]:
# Smallest and largest value of the feature stored at index 1 of each
# record's feature array.
secondFeatureRDD = trainRDDCached.map(lambda record: record[0][1])
print(secondFeatureRDD.min())
print(secondFeatureRDD.max())

-2.0
13910.0


In [104]:
# test a regression on a constant and one variable: the feature at index 1,
# shifted by its mean and divided by its standard deviation
# stats() gathers mean and variance in a single pass over the data instead of
# one full pass for each
featureStats = trainRDDCached.map(lambda x: x[0][1]).stats()
meanfeature = featureStats.mean()
varfeature = featureStats.variance()
# take the square root once here rather than once per record
stdfeature = varfeature**.5
# build the one-element feature array explicitly rather than relying on
# list-minus-scalar coercion, which only works when the mean is a numpy scalar
twofeature_trainrdd = trainRDDCached.map(
    lambda x: (np.array([(x[0][1] - meanfeature) / stdfeature]), x[1]))

In [105]:
# expand baseline: the same constant-only bias plus a zero weight for the
# added feature, so its predictions ignore that feature
meanLabel = onefeature_trainrdd.map(lambda record: record[1]).mean()
BASELINE = np.array([-1*np.log(1 / meanLabel - 1), 0])
logloss(twofeature_trainrdd, BASELINE)

0.5663501091618366

In [106]:
# Run gradient descent on the bias + one-feature model, starting from the
# baseline coefficients, and report the loss and model after each step.
nSteps = 5
model = BASELINE
print(f"BASELINE Loss = {logloss(twofeature_trainrdd,BASELINE)}")
for idx in range(nSteps):
    print("----------", f"STEP: {idx+1}", sep="\n")
    model = GDUpdate(twofeature_trainrdd, model)
    loss = logloss(twofeature_trainrdd, model)
    print(f"Loss: {loss}", f"Model: {[round(w,3) for w in model]}", sep="\n")

BASELINE Loss = 0.5663501091618366
----------
STEP: 1
Loss: 0.5655555218839737
Model: [-1.079, 0.068]
----------
STEP: 2
Loss: 0.5655148482350966
Model: [-1.08, 0.084]
----------
STEP: 3
Loss: 0.5655134289907428
Model: [-1.08, 0.087]
----------
STEP: 4
Loss: 0.5655133728075366
Model: [-1.081, 0.088]
----------
STEP: 5
Loss: 0.5655133698654112
Model: [-1.081, 0.088]
