In [1]:
#Load Data

In [12]:
# Read the dataset as an RDD of text lines, one comma-separated record per line.
# NOTE(review): `sc` is not defined in any visible cell -- presumably the
# SparkContext injected by the PySpark kernel; confirm.
raw_data = sc.textFile('millionsong.txt')

In [13]:
# Peek at one raw line to see the record layout before parsing it.
raw_data.first()

u'2001.0,0.884123733793,0.610454259079,0.600498416968,0.474669212493,0.247232680947,0.357306088914,0.344136412234,0.339641227335,0.600858840135,0.425704689024,0.60491501652,0.419193351817'

In [16]:
from pyspark.mllib.regression import LabeledPoint
import numpy as np

# Parse every CSV line into a LabeledPoint: the first field becomes the label
# (it looks like a year, e.g. 2001.0) and the remaining fields the features.
# NOTE(review): toDF is given the names ('features', 'label') even though
# LabeledPoint is constructed as (label, features). The MAX/MIN(label) output
# in the next cell shows year values under `label`, so the inferred column
# order appears to be features-then-label -- confirm before reordering names.
df = (
    raw_data
    .map(lambda line: line.split(","))
    .map(lambda fields: LabeledPoint(fields[0], fields[1:]))
    .toDF(['features', 'label'])
)

In [20]:
# Inspect the label range; the minimum shown here (1922.0) is the offset the
# next cell subtracts so that labels start at zero.
df.selectExpr('MAX(label)','MIN(label)').show()

+-----------+-----------+
|'MAX(label)|'MIN(label)|
+-----------+-----------+
|     2011.0|     1922.0|
+-----------+-----------+



In [23]:
from pyspark.sql.functions import col

# Shift the labels so the earliest year (1922, the MIN(label) found above)
# maps to 0.
# The expression is aliased directly rather than renamed afterwards: the
# previous withColumnRenamed("(label - 1922)", ...) depended on Spark's
# auto-generated column name and silently does nothing when that name does
# not match, which would leave the DataFrame without a `label` column.
parsed_df = df.select((col('label') - 1922).alias('label'), 'features')
parsed_df.head()

Row(label=79.0, features=DenseVector([0.8841, 0.6105, 0.6005, 0.4747, 0.2472, 0.3573, 0.3441, 0.3396, 0.6009, 0.4257, 0.6049, 0.4192]))

In [25]:
# 80/10/10 train/validation/test split. The seed is fixed so that the split,
# and every metric computed from it, is reproducible after a kernel restart;
# without it each run produces different partitions.
train_df, val_df, test_df = parsed_df.randomSplit([0.8, 0.1, 0.1], seed=42)

# Cache the splits; train_df in particular is scanned repeatedly by the
# cells below.
train_df.cache()
val_df.cache()
test_df.cache()

DataFrame[label: double, features: vector]

In [59]:
avg = test_df.agg({"label":"mean"}).map(lambda x: x[0]).collect()[0]

In [70]:
#baseline model - use average to predict

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import lit

evaluator = RegressionEvaluator(predictionCol="prediction")

baseline_pred_label_df = train_df.select('label').withColumn('prediction',lit(avg))

print "Baseline Test RMSE is equal to %s" %(evaluator.evaluate(baseline_pred_label_df)) 

Baseline Test RMSE is equal to 21.4324898207


In [71]:
#Gradient Descent by hand

In [73]:
from pyspark.mllib.linalg import DenseVector

def gradient_summand(weights, lp):
    """Return one observation's contribution to the least-squares gradient.

    The value is ``(weights . features - label) * features``.

    Args:
        weights: Weight vector with one entry per feature.
        lp: Observation exposing ``features`` and ``label`` attributes
            (for example a `LabeledPoint`).

    Returns:
        DenseVector: The gradient summand, same length as ``lp.features``.
    """
    residual = DenseVector.dot(lp.features, weights) - lp.label
    return DenseVector(residual * lp.features)

def get_labeled_prediction(weights, observation):
    """Predict the label of one observation with a linear model.

    Args:
        weights: Weight vector with one entry per feature.
        observation: Object exposing ``features`` and ``label`` attributes
            (for example a `LabeledPoint`).

    Returns:
        tuple: ``(prediction, label)``, both converted to ``float``, where
            the prediction is the dot product of weights and features.
    """
    weight_vector = DenseVector(weights)
    predicted = DenseVector.dot(weight_vector, observation.features)
    return float(predicted), float(observation.label)

In [77]:
# Sanity check: with all-zero weights the prediction must be 0.0 while the
# label is passed through unchanged.
# `d` is the number of features; `w` is also read by the next cell.
d = len(train_df.first().features)
w = np.zeros(d)
train_df.map(lambda x: get_labeled_prediction(w,x)).first()

(0.0, 79.0)

In [78]:
# Sanity check: with all-zero weights the summand reduces to -label * features.
train_df.map(lambda x: gradient_summand(w,x)).first()

DenseVector([-69.8458, -48.2259, -47.4394, -37.4989, -19.5314, -28.2272, -27.1868, -26.8317, -47.4678, -33.6307, -47.7883, -33.1163])

In [None]:
def linreg_gradient_descent(train_data, num_iters):
    """Train a linear regression model with batch gradient descent.

    Each iteration records the RMSE of the current weights on the training
    data, sums the per-observation gradient summands over the whole data set
    and then steps the weights with a decaying step size
    ``alpha / (n * sqrt(i + 1))``.

    Note:
        `DenseVector` behaves similarly to a `numpy.ndarray` and they can be used
        interchangeably within this function.  For example, they both implement the
        `dot` method.

    Args:
        train_data (RDD of LabeledPoint): The labeled data for use in training the
            model.  A DataFrame with `label` and `features` columns is accepted as
            well; its underlying RDD of rows is used.
        num_iters (int): The number of iterations of gradient descent to perform.

    Returns:
        (np.ndarray, np.ndarray): A tuple of (weights, training errors).  Weights will
            be the final weights (one weight per feature) for the model, and training
            errors will contain an error (RMSE) for each iteration of the algorithm,
            measured before that iteration's weight update.

    Raises:
        ValueError: If `train_data` contains no observations.
    """
    # Work on the row RDD when handed a DataFrame; a plain RDD has no `rdd`
    # attribute and is used as-is.
    rdd = getattr(train_data, 'rdd', train_data)

    # The length of the training data
    n = rdd.count()
    if n == 0:
        raise ValueError("train_data is empty")
    # The number of features in the training data
    d = len(rdd.first().features)
    w = np.zeros(d)
    alpha = 1.0
    # We will compute and store the training error after each iteration
    error_train = np.zeros(num_iters)
    for i in range(num_iters):
        # (prediction, label) pairs for the current weights.  The weights are all
        # zero on the first iteration, so the first recorded error is large.
        preds_and_labels_train = rdd.map(lambda x: get_labeled_prediction(w, x))
        squared_errors = preds_and_labels_train.map(lambda pl: (pl[0] - pl[1]) ** 2)
        error_train[i] = np.sqrt(squared_errors.mean())

        # Sum the per-observation summands on the cluster instead of collecting
        # them to the driver.  Converting to ndarray keeps the addition (and the
        # update below) in plain numpy arithmetic; the result has length `d`.
        gradient = rdd.map(lambda x: gradient_summand(w, x).toArray()).sum()

        # Update the weights with a step size that shrinks as 1 / sqrt(iteration)
        alpha_i = alpha / (n * np.sqrt(i + 1))
        w = w - alpha_i * gradient

    return w, error_train

# Run gradient descent on the training split for 1000 iterations.
linreg_gradient_descent(train_df, 1000)