In [1]:
#Load Data

In [2]:
raw_data = sc.textFile('millionsong.txt')

In [3]:
raw_data.first()

u'2001.0,0.884123733793,0.610454259079,0.600498416968,0.474669212493,0.247232680947,0.357306088914,0.344136412234,0.339641227335,0.600858840135,0.425704689024,0.60491501652,0.419193351817'

In [4]:
from pyspark.mllib.regression import LabeledPoint
import numpy as np

df = raw_data.map(lambda x: x.split(",")).map(lambda x: LabeledPoint(x[0],x[1:])).toDF(['features','label'])

In [5]:
df.selectExpr('MAX(label)','MIN(label)').show()

+----------+----------+
|MAX(label)|MIN(label)|
+----------+----------+
|    2011.0|    1922.0|
+----------+----------+



In [6]:
from pyspark.sql.functions import col

parsed_df = df.select(col('label')-1922, 'features')\
              .withColumnRenamed("(label - 1922)",'label')
parsed_df.head()

Row(label=79.0, features=DenseVector([0.8841, 0.6105, 0.6005, 0.4747, 0.2472, 0.3573, 0.3441, 0.3396, 0.6009, 0.4257, 0.6049, 0.4192]))

In [7]:
train_df, val_df, test_df = parsed_df.randomSplit([0.8,0.1,0.1])

train_df.cache() 
val_df.cache() 
test_df.cache()

DataFrame[label: double, features: vector]

In [8]:
avg = test_df.agg({"label":"mean"}).map(lambda x: x[0]).collect()[0]

In [9]:
#baseline model - use average to predict

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import lit

evaluator = RegressionEvaluator(predictionCol="prediction")

baseline_pred_label_df = train_df.select('label').withColumn('prediction',lit(avg))

print "Baseline Test RMSE is equal to %s" %(evaluator.evaluate(baseline_pred_label_df)) 

Baseline Test RMSE is equal to 21.500234008


In [10]:
#Gradient Descent by hand

In [26]:
from pyspark.mllib.linalg import DenseVector

evaluator = RegressionEvaluator(predictionCol="prediction")
def calc_RMSE(dataset):
    """Calculates the root mean squared error for an dataset of (prediction, label) tuples.

    Args:
        dataset (DataFrame of (float, float)): A `DataFrame` consisting of (prediction, label) tuples.

    Returns:
        float: The square root of the mean of the squared errors.
    """
#    evaluator = RegressionEvaluator(predictionCol="prediction")
    return evaluator.evaluate(dataset)

def gradient_summand(weights, lp):
    """Calculates the gradient summand for a given weight and `LabeledPoint`."""
    summand = DenseVector((DenseVector.dot(lp.features,weights) - lp.label)*lp.features)
    return summand

def get_labeled_prediction(weights, observation):
    """Calculates predictions given a tuple of (labeledpoint,features) 
       and returns a (prediction, label) tuple."""
    
    prediction = float(DenseVector.dot(DenseVector(weights),observation.features))
    label = float(observation.label)
    
    return prediction,label

In [12]:
d = len(train_df.first().features)
w = np.zeros(d)
train_df.map(lambda x: get_labeled_prediction(w,x)).first()

(0.0, 8.0)

In [13]:
train_df.map(lambda x: gradient_summand(w,x)).first()

DenseVector([-1.3589, -2.7773, -2.4829, -1.4645, -2.5551, -4.9228, -1.8748, -5.2439, -2.4979, -4.9767, -2.9213, -3.2111])

In [14]:
def linreg_gradient_descent(train_data, num_iters):
    """Calculates the weights and error for a linear regression model trained with gradient descent.

    Returns a tuple of (weights, training errors).  Weights will be the
            final weights (one weight per feature) for the model, and training errors will contain
            an error (RMSE) for each iteration of the algorithm.
    """
    # The length of the training data
    n = train_data.count()
    # The number of features in the training data
    d = len(train_data.first().features)
    w = np.zeros(d)
    alpha = 1.0
    # We will compute and store the training error after each iteration
    error_train = np.zeros(num_iters)
    for i in range(num_iters):
        preds_and_labels_train = train_data.map(lambda x: get_labeled_prediction(w,x))
        preds_and_labels_train_df = preds_and_labels_train.toDF(["prediction", "label"])
        error_train[i] = calc_RMSE(preds_and_labels_train_df)

        # Calculate the `gradient`.  Make use of the `gradient_summand` function you wrote in (3a).
        # Note that `gradient` should be a `DenseVector` of length `d`.
        gradient = train_data.map(lambda x: gradient_summand(w,x)).sum()

        # Update the weights
        alpha_i = alpha / (n * np.sqrt(i+1))
        w = w - alpha_i * gradient
        
    return w, error_train
    

linreg_gradient_descent(train_df, 100)

(array([ 25.77262172,  23.83357464,  -3.92820735,   8.6583658 ,
          6.12917634,  -9.18378568,  18.50990266,   2.15926399,
          9.78004183,   5.13518291,  11.92855951,   1.96051564]),
 array([  57.96525775,  105.26802754,  111.15816127,   77.13591876,
          39.55728551,   22.76245899,   20.34147179,   20.1727166 ,
          20.10011465,   20.03379812,   19.97199537,   19.91405699,
          19.85947377,   19.80783741,   19.75881468,   19.71212947,
          19.66755003,   19.62487964,   19.58394962,   19.54461411,
          19.50674599,   19.47023365,   19.43497852,   19.40089304,
          19.36789901,   19.33592627,   19.3049116 ,   19.27479777,
          19.24553285,   19.21706946,   19.18936433,   19.16237775,
          19.13607322,   19.11041708,   19.08537824,   19.06092786,
          19.03703921,   19.01368739,   18.99084921,   18.968503  ,
          18.94662851,   18.92520674,   18.90421989,   18.88365121,
          18.86348493,   18.84370622,   18.82430104,   18.

####  Train the model
#### Now let's train a linear regression model on all of our training data and evaluate its accuracy on the validation set.
#### Note that the test set will not be used here. If we evaluated the model on the test set, we would bias our final results.

In [41]:
num_iters = 50
weights_LR0, error_train_LR0 = linreg_gradient_descent(train_df,num_iters)

preds_and_labels = (val_df
                      .map(lambda x: get_labeled_prediction(weights_LR0,x)))
preds_and_labels_df = sqlContext.createDataFrame(preds_and_labels, ["prediction", "label"])
rmse_val_LR0 = calc_RMSE(preds_and_labels_df)

print 'Validation RMSE:\n\tBaseline = {0:.3f}\n\tLR0 = {1:.3f}'.format(avg,
                                                                       rmse_val_LR0)

Validation RMSE:
	Baseline = 54.016
	LR0 = 18.695


# MLlib implemenatation

In [16]:
from pyspark.ml.regression import LinearRegression
# Values to use when training the linear regression model

num_iters = 500  # iterations
reg = 1e-1  # regParam
alpha = .2  # elasticNetParam
use_intercept = True  # intercept

# TODO: Replace <FILL IN> with appropriate code
lin_reg = LinearRegression(maxIter=num_iters, regParam=reg, 
                           elasticNetParam=0.1, fitIntercept=True)
first_model = lin_reg.fit(train_df)

# coeffsLR1 stores the model coefficients; interceptLR1 stores the model intercept
coeffs_LR1 = first_model.coefficients
intercept_LR1 = first_model.intercept
print coeffs_LR1, intercept_LR1

[24.363862651,24.912341236,-67.5971290696,56.9051814471,-10.6287251479,-52.0090410333,33.0080849511,-22.2838782225,2.46575768152,-2.65959623067,-11.9921603324,-11.2864888329] 64.7416745647


In [17]:
pred = first_model.transform(train_df)
pred.select('label','prediction').toPandas().head(10)



Unnamed: 0,label,prediction
0,8,14.903245
1,8,15.116023
2,8,17.76734
3,11,19.693845
4,13,33.519425
5,19,9.911329
6,19,24.198499
7,19,28.875068
8,19,30.121777
9,20,22.203674


In [38]:
from pyspark.sql import Row
validation = first_model.transform(val_df).select('prediction','label')
rmse_val_LR1 = evaluator.evaluate(validation)

In [40]:
print avg, rmse_val_LR0, rmse_val_LR1

54.0157232704 18.7501612875 15.873525187


u'1.6.2'