In [1]:
#Load Data

In [5]:
raw_data = sc.textFile('/tmp/millionsong.txt')

In [6]:
raw_data.first()

u'2001.0,0.884123733793,0.610454259079,0.600498416968,0.474669212493,0.247232680947,0.357306088914,0.344136412234,0.339641227335,0.600858840135,0.425704689024,0.60491501652,0.419193351817'

In [7]:
from pyspark.mllib.regression import LabeledPoint
import numpy as np

df = raw_data.map(lambda x: x.split(",")).map(lambda x: LabeledPoint(x[0],x[1:])).toDF(['features','label'])

In [8]:
df.selectExpr('MAX(label)','MIN(label)').show()

+----------+----------+
|MAX(label)|MIN(label)|
+----------+----------+
|    2011.0|    1922.0|
+----------+----------+



In [9]:
from pyspark.sql.functions import col

parsed_df = df.select(col('label')-1922, 'features')\
              .withColumnRenamed("(label - 1922)",'label')
parsed_df.head()

Row(label=79.0, features=DenseVector([0.8841, 0.6105, 0.6005, 0.4747, 0.2472, 0.3573, 0.3441, 0.3396, 0.6009, 0.4257, 0.6049, 0.4192]))

In [10]:
train_df, val_df, test_df = parsed_df.randomSplit([0.8,0.1,0.1])

train_df.cache() 
val_df.cache() 
test_df.cache()

DataFrame[label: double, features: vector]

In [11]:
avg = test_df.agg({"label":"mean"}).map(lambda x: x[0]).collect()[0]

In [12]:
#baseline model - use average to predict

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import lit

evaluator = RegressionEvaluator(predictionCol="prediction")

baseline_pred_label_df = train_df.select('label').withColumn('prediction',lit(avg))

print "Baseline Test RMSE is equal to %s" %(evaluator.evaluate(baseline_pred_label_df)) 

Baseline Test RMSE is equal to 21.4410420021


In [13]:
#Gradient Descent by hand

In [24]:
from pyspark.mllib.linalg import DenseVector

evaluator = RegressionEvaluator(predictionCol="prediction")
def calc_RMSE(dataset):
    """Calculates the root mean squared error for an dataset of (prediction, label) tuples.

    Args:
        dataset (DataFrame of (float, float)): A `DataFrame` consisting of (prediction, label) tuples.

    Returns:
        float: The square root of the mean of the squared errors.
    """
    evaluator = RegressionEvaluator(predictionCol="prediction")
    return evaluator.evaluate(dataset)

def gradient_summand(weights, lp):
    """Calculates the gradient summand for a given weight and `LabeledPoint`."""
    summand = DenseVector((DenseVector.dot(lp.features,weights) - lp.label)*lp.features)
    return summand

def get_labeled_prediction(weights, observation):
    """Calculates predictions given a tuple of (labeledpoint,features) 
       and returns a (prediction, label) tuple."""
    
    prediction = float(DenseVector.dot(DenseVector(weights),observation.features))
    label = float(observation.label)
    
    return prediction,label

In [15]:
d = len(train_df.first().features)
w = np.zeros(d)
train_df.map(lambda x: get_labeled_prediction(w,x)).first()

(0.0, 79.0)

In [16]:
train_df.map(lambda x: gradient_summand(w,x)).first()

DenseVector([-69.8458, -48.2259, -47.4394, -37.4989, -19.5314, -28.2272, -27.1868, -26.8317, -47.4678, -33.6307, -47.7883, -33.1163])

In [27]:
def linreg_gradient_descent(train_data, num_iters):
    """Calculates the weights and error for a linear regression model trained with gradient descent.

    Returns a tuple of (weights, training errors).  Weights will be the
            final weights (one weight per feature) for the model, and training errors will contain
            an error (RMSE) for each iteration of the algorithm.
    """
    # The length of the training data
    n = train_data.count()
    # The number of features in the training data
    d = len(train_data.first().features)
    w = np.zeros(d)
    alpha = 1.0
    # We will compute and store the training error after each iteration
    error_train = np.zeros(num_iters)
    for i in range(num_iters):
        preds_and_labels_train = train_data.map(lambda x: get_labeled_prediction(w,x))
        preds_and_labels_train_df = preds_and_labels_train.toDF(["prediction", "label"])
        error_train[i] = calc_RMSE(preds_and_labels_train_df)

        # Calculate the `gradient`.  Make use of the `gradient_summand` function you wrote in (3a).
        # Note that `gradient` should be a `DenseVector` of length `d`.
        gradient = train_data.map(lambda x: gradient_summand(w,x)).sum()

        # Update the weights
        alpha_i = alpha / (n * np.sqrt(i+1))
        w = w - alpha_i * gradient
        
    return w, error_train
    

linreg_gradient_descent(train_df, 100)

(array([ 22.48848811,  20.35340451,  -0.45991637,   8.18300814,
          5.9616902 ,  -4.19283846,  15.4245702 ,   3.77217616,
         10.33569038,   5.8586856 ,  10.95314623,   3.88831376]),
 array([  58.04966416,  105.51545512,  111.5299523 ,   77.48182423,
          39.76244849,   22.84191665,   20.39213111,   20.22514401,
          20.15589784,   20.09270968,   20.0338182 ,   19.97860503,
          19.92658539,   19.87737057,   19.83064333,   19.78614082,
          19.74364243,   19.70296088,   19.66393562,   19.62642781,
          19.59031642,   19.55549523,   19.52187041,   19.48935863,
          19.45788545,   19.42738409,   19.39779439,   19.36906188,
          19.34113712,   19.31397504,   19.28753445,   19.26177756,
          19.23666964,   19.21217865,   19.188275  ,   19.16493127,
          19.14212199,   19.11982349,   19.09801369,   19.07667199,
          19.05577909,   19.03531695,   19.0152686 ,   18.99561812,
          18.97635051,   18.95745163,   18.93890816,   18.

####  Train the model
#### Now let's train a linear regression model on all of our training data and evaluate its accuracy on the validation set.
#### Note that the test set will not be used here. If we evaluated the model on the test set, we would bias our final results.

In [29]:
num_iters = 50
weights_LR0, error_train_LR0 = linreg_gradient_descent(train_df,num_iters)

preds_and_labels = (train_df
                      .map(lambda x: get_labeled_prediction(weights_LR0,x)))
preds_and_labels_df = sqlContext.createDataFrame(preds_and_labels, ["prediction", "label"])
rmse_val_LR0 = calc_RMSE(preds_and_labels_df)

print 'Validation RMSE:\n\tBaseline = {0:.3f}\n\tLR0 = {1:.3f}'.format(avg,
                                                                       rmse_val_LR0)

Validation RMSE:
	Baseline = 53.898
	LR0 = 18.868


# MLlib implemenatation

In [35]:
from pyspark.ml.regression import LinearRegression
# Values to use when training the linear regression model

num_iters = 500  # iterations
reg = 1e-1  # regParam
alpha = .2  # elasticNetParam
use_intercept = True  # intercept

# TODO: Replace <FILL IN> with appropriate code
lin_reg = LinearRegression(maxIter=num_iters, regParam=reg, 
                           elasticNetParam=0.1, fitIntercept= False)
first_model = lin_reg.fit(train_df)

# coeffsLR1 stores the model coefficients; interceptLR1 stores the model intercept
coeffs_LR1 = first_model.coefficients
intercept_LR1 = first_model.intercept
print coeffs_LR1, intercept_LR1

[23.1415172278,33.5315246311,-59.5911663705,23.772992963,13.1904337511,-21.8018197814,72.1026876964,-9.76595898769,26.9943498558,-0.869818673973,21.4228842016,-18.2569617806] 0.0
