In [3]:
# Load the CSV file into an RDD of text lines (one element per line).
# cache() marks the RDD for reuse, since later cells derive from it.
autoData = sc.textFile("data/auto-miles-per-gallon.csv")
autoData.cache()

data/auto-miles-per-gallon.csv MapPartitionsRDD[4] at textFile at NativeMethodAccessorImpl.java:-2

In [4]:
# Remove the header line. The filter is content-based, not positional:
# it drops every line containing the text "CYLINDERS", which is assumed
# to occur only in the header row.
dataLines = autoData.filter(lambda x: "CYLINDERS" not in x)
# Number of remaining data lines.
dataLines.count()

398

#Convert the RDD into dense vectors. As part of this exercise:
1. Remove unwanted columns
2. Replace non-numeric values (`?`) with numeric ones

In [5]:
import math
from pyspark.mllib.linalg import Vectors

In [6]:
# Fallback horsepower used for records whose HP field is "?".
# 80.0 is a hard-coded default, not computed from the data; it is
# broadcast so the function mapped over the RDD can read it.
avgHP =sc.broadcast(80.0)

In [7]:
def transformToNumeric(inputStr):
    """Convert one comma-separated data line into a 5-element dense vector.

    Fields 0, 1, 3, 5 and 6 of the line are kept (MPG, CYLINDERS, HP,
    ACCELERATION and MODELYEAR according to the notebook's column comment);
    all other fields are dropped at this stage.

    A horsepower field (index 3) equal to "?" is replaced by the broadcast
    default ``avgHP.value``. Every kept field is converted with ``float()``,
    so a non-numeric value other than a "?" horsepower raises ``ValueError``.

    Args:
        inputStr: one line of the CSV file, without the header.

    Returns:
        A DenseVector ``[field0, field1, hp, field5, field6]``.
    """
    attList = inputStr.split(",")

    # Replace a missing ("?") horsepower with the default; otherwise convert
    # explicitly instead of relying on implicit string coercion later on.
    rawHp = attList[3].strip()
    if rawHp == "?":
        hpValue = float(avgHP.value)
    else:
        hpValue = float(rawHp)

    # Filter out columns not wanted at this stage
    return Vectors.dense([
        float(attList[0]),
        float(attList[1]),
        hpValue,
        float(attList[5]),
        float(attList[6]),
    ])

In [8]:
# Keep only MPG, CYLINDERS, HP, ACCELERATION and MODELYEAR (in that order).
autoVectors = dataLines.map(transformToNumeric)
# collect() returns every vector to the driver for display; acceptable for
# this small data set, but prefer take(n) on anything large.
autoVectors.collect()

[DenseVector([18.0, 8.0, 130.0, 12.0, 70.0]),
 DenseVector([15.0, 8.0, 165.0, 11.5, 70.0]),
 DenseVector([18.0, 8.0, 150.0, 11.0, 70.0]),
 DenseVector([16.0, 8.0, 150.0, 12.0, 70.0]),
 DenseVector([17.0, 8.0, 140.0, 10.5, 70.0]),
 DenseVector([15.0, 8.0, 198.0, 10.0, 70.0]),
 DenseVector([14.0, 8.0, 220.0, 9.0, 70.0]),
 DenseVector([14.0, 8.0, 215.0, 8.5, 70.0]),
 DenseVector([14.0, 8.0, 225.0, 10.0, 70.0]),
 DenseVector([15.0, 8.0, 190.0, 8.5, 70.0]),
 DenseVector([15.0, 8.0, 170.0, 10.0, 70.0]),
 DenseVector([14.0, 8.0, 160.0, 8.0, 70.0]),
 DenseVector([15.0, 8.0, 150.0, 9.5, 70.0]),
 DenseVector([14.0, 8.0, 225.0, 10.0, 70.0]),
 DenseVector([24.0, 4.0, 95.0, 15.0, 70.0]),
 DenseVector([22.0, 6.0, 95.0, 15.5, 70.0]),
 DenseVector([18.0, 6.0, 97.0, 15.5, 70.0]),
 DenseVector([21.0, 6.0, 85.0, 16.0, 70.0]),
 DenseVector([27.0, 4.0, 88.0, 14.5, 70.0]),
 DenseVector([26.0, 4.0, 46.0, 20.5, 70.0]),
 DenseVector([25.0, 4.0, 87.0, 17.5, 70.0]),
 DenseVector([24.0, 4.0, 90.0, 14.5, 70.0]),
 

In [9]:
# Compute column-wise summary statistics over the vectors. The result is
# queried in the following cells via mean(), variance(), min() and max().
from pyspark.mllib.stat import Statistics
autoStats=Statistics.colStats(autoVectors)

In [10]:
# Per-column mean, in vector order: MPG, CYLINDERS, HP, ACCELERATION, MODELYEAR.
autoStats.mean()

array([  23.51457286,    5.45477387,  104.10050251,   15.56809045,
         76.01005025])

In [11]:
# Per-column variance, in vector order: MPG, CYLINDERS, HP, ACCELERATION, MODELYEAR.
autoStats.variance()

array([   61.08961077,     2.89341544,  1468.09062947,     7.60484823,
          13.67244282])

In [12]:
# Per-column minimum, in vector order: MPG, CYLINDERS, HP, ACCELERATION, MODELYEAR.
autoStats.min()

array([  9.,   3.,  46.,   8.,  70.])

In [13]:
# Per-column maximum, in vector order: MPG, CYLINDERS, HP, ACCELERATION, MODELYEAR.
autoStats.max()

array([  46.6,    8. ,  230. ,   24.8,   82. ])

In [14]:
# Pairwise correlation matrix of the vector columns; rows and columns follow
# the vector order MPG, CYLINDERS, HP, ACCELERATION, MODELYEAR. No method
# argument is given, so the library default is used.
Statistics.corr(autoVectors)

array([[ 1.        , -0.77539629, -0.77463084,  0.42028891,  0.57926713],
       [-0.77539629,  1.        ,  0.84275215, -0.50541949, -0.3487458 ],
       [-0.77463084,  0.84275215,  1.        , -0.68829885, -0.41559383],
       [ 0.42028891, -0.50541949, -0.68829885,  1.        ,  0.28813695],
       [ 0.57926713, -0.3487458 , -0.41559383,  0.28813695,  1.        ]])

Transform to a DataFrame for input to Machine Learning.
Drop columns that are not required (low correlation with the label, MPG); here that is ACCELERATION.


In [15]:
# Wrap the existing SparkContext in a SQLContext; it is used below to build
# a DataFrame from the RDD of (label, features) pairs.
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

In [16]:
def transformToLabeledPoint(inStr) :
    """Split one indexable record into a (label, features) tuple.

    Despite the parameter name, the caller in this notebook passes the
    5-element vectors produced earlier, not strings. Element 0 becomes the
    float label; elements 1, 2 and 4 become the feature vector, so element 3
    is dropped.

    Returns:
        A plain tuple ``(label, DenseVector)``, not an MLlib LabeledPoint.
    """
    label = float(inStr[0])
    features = Vectors.dense([inStr[1], inStr[2], inStr[4]])
    return (label, features)

In [17]:
# Turn each vector into a (label, features) pair, then build a two-column
# DataFrame from those pairs.
autoLp = autoVectors.map(transformToLabeledPoint)
autoDF = sqlContext.createDataFrame(autoLp,["label", "features"])
# Preview the first 10 rows.
autoDF.select("label","features").show(10)

+-----+----------------+
|label|        features|
+-----+----------------+
| 18.0|[8.0,130.0,70.0]|
| 15.0|[8.0,165.0,70.0]|
| 18.0|[8.0,150.0,70.0]|
| 16.0|[8.0,150.0,70.0]|
| 17.0|[8.0,140.0,70.0]|
| 15.0|[8.0,198.0,70.0]|
| 14.0|[8.0,220.0,70.0]|
| 14.0|[8.0,215.0,70.0]|
| 14.0|[8.0,225.0,70.0]|
| 15.0|[8.0,190.0,70.0]|
+-----+----------------+
only showing top 10 rows



In [18]:
#Find correlations
# Pearson correlation between the label and each individual feature.
# Rows are accessed through autoDF.rdd: DataFrame.map() only exists in
# Spark 1.x, whereas DataFrame.rdd.map() works on both 1.x and 2.x+.
numFeatures = autoDF.take(1)[0].features.size
labelRDD = autoDF.rdd.map(lambda lp: float(lp.label))
for i in range(numFeatures):
    # Bind i as a default argument so the lambda keeps this iteration's index
    # even if the RDD were evaluated after the loop variable has moved on.
    featureRDD = autoDF.rdd.map(lambda lp, i=i: lp.features[i])
    corr = Statistics.corr(labelRDD, featureRDD, 'pearson')
    print('%d\t%g' % (i, corr))

0	-0.775396
1	-0.774631
2	0.579267


In [19]:
# Split into training and testing data with weights 0.9 / 0.1.
# NOTE(review): no seed is passed to randomSplit, so the rows in each split
# (and the counts and metrics below) can differ between runs; consider
# passing a fixed seed for reproducibility.
(trainingData, testData) = autoDF.randomSplit([0.9, 0.1])
trainingData.count()

366

In [20]:
# Number of rows that landed in the test split.
testData.count()

32

In [21]:
# Build the model on training data: a linear regression capped at 10
# iterations. No column names are passed, so the estimator's default column
# names are used (presumably "features" and "label", matching the DataFrame).
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(maxIter=10)
lrModel = lr.fit(trainingData)

In [22]:
# One coefficient per feature, in feature-vector order
# (CYLINDERS, HP, MODELYEAR per the column selection above).
print("Coefficients: " + str(lrModel.coefficients))

Coefficients: [-2.04520328376,-0.0542249995269,0.651700612139]


In [23]:
# Constant term of the fitted linear model.
print("Intercept: " + str(lrModel.intercept))

Intercept: -9.15557896304


In [24]:
# Predict on the test data: transform() yields a DataFrame that includes a
# "prediction" column, shown here next to the true label and the features.
predictions = lrModel.transform(testData)
predictions.select("prediction","label","features").show()



+------------------+-----+----------------+
|        prediction|label|        features|
+------------------+-----+----------------+
|12.729238916586793| 12.0|[8.0,160.0,72.0]|
|14.628114517844624| 14.0|[8.0,137.0,73.0]|
|  12.4571133011365| 14.0|[8.0,153.0,71.0]|
|23.546547278884972| 15.0| [6.0,72.0,75.0]|
|13.271488911855974| 15.0|[8.0,150.0,72.0]|
|27.402705914392854| 18.0| [3.0,90.0,73.0]|
|26.371430305565717| 19.0| [3.0,97.0,72.0]|
| 25.46595262969008| 20.0| [4.0,88.0,73.0]|
| 19.53189607826163| 20.0|[6.0,122.0,73.0]|
|22.461046670530905| 21.0| [6.0,80.0,74.0]|
|25.030151397843326| 22.0| [4.0,72.0,71.0]|
|26.227103858698342| 22.0| [4.0,98.0,75.0]|
| 28.34287945806384| 29.5| [4.0,71.0,76.0]|
| 25.13860139689716| 30.0| [4.0,70.0,71.0]|
| 18.26719319304236| 15.5|[8.0,142.0,79.0]|
|17.833393196827018| 18.5|[8.0,150.0,79.0]|
|23.331648516408706| 19.0|[6.0,100.0,77.0]|
| 25.82900034809404| 19.1| [6.0,90.0,80.0]|
|19.350692565765016| 19.9|[8.0,110.0,78.0]|
|  25.9394515827793| 22.0|[6.0,1

In [25]:
# Evaluator that scores the "prediction" column against the "label" column
# using the "r2" metric (coefficient of determination).
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="label",metricName="r2")

In [26]:
# r2 of the model on the held-out test predictions.
evaluator.evaluate(predictions)

0.6268797850619028