# REGRESSION EXAMPLE

## 1. Import spark modules

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

In [2]:
# Start a Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("PythonPi")
    .getOrCreate()
)

**Set up the Spark configuration and get the Spark context from the session**

In [3]:
sc = spark.sparkContext

## 2. Load and inspect data

**Load the csv file**

In [4]:
autoData = sc.textFile("./input/auto-miles-per-gallon.csv")

**Cache the RDD**

In [5]:
autoData.cache()

./input/auto-miles-per-gallon.csv MapPartitionsRDD[1] at textFile at <unknown>:0

**Display the head**

In [6]:
autoData.take(10)

['MPG,CYLINDERS,DISPLACEMENT,HORSEPOWER,WEIGHT,ACCELERATION,MODELYEAR,NAME',
 '18,8,307,130,3504,12,70,chevrolet chevelle malibu',
 '15,8,350,165,3693,11.5,70,buick skylark 320',
 '18,8,318,150,3436,11,70,plymouth satellite',
 '16,8,304,150,3433,12,70,amc rebel sst',
 '17,8,302,140,3449,10.5,70,ford torino',
 '15,8,429,198,4341,10,70,ford galaxie 500',
 '14,8,454,220,4354,9,70,chevrolet impala',
 '14,8,440,215,4312,8.5,70,plymouth fury iii',
 '14,8,455,225,4425,10,70,pontiac catalina']

## 3. Clean data

**Remove the header line**

In [7]:
dataLines = autoData.filter(lambda x: "CYLINDERS" not in x)

In [8]:
dataLines.take(5)

['18,8,307,130,3504,12,70,chevrolet chevelle malibu',
 '15,8,350,165,3693,11.5,70,buick skylark 320',
 '18,8,318,150,3436,11,70,plymouth satellite',
 '16,8,304,150,3433,12,70,amc rebel sst',
 '17,8,302,140,3449,10.5,70,ford torino']

In [9]:
dataLines.count()

398

### Clean up data using a function

**Default value for missing HP**

In [10]:
avgHP = sc.broadcast(80.0)

**Function to cleanup data**

In [11]:
def CleanupData(inputStr):
    """Parse one comma-separated data line into a typed ``Row``.

    The line is expected to hold eight fields in this order:
    MPG, CYLINDERS, DISPLACEMENT, HORSEPOWER, WEIGHT, ACCELERATION,
    MODELYEAR, NAME.

    Parameters
    ----------
    inputStr : str
        A single data line (the header line must already be removed).

    Returns
    -------
    Row
        A row whose first seven fields are converted to ``float`` and whose
        NAME field is kept as a string. A HORSEPOWER value of "?" is
        replaced by the value held in the ``avgHP`` broadcast variable.

    Raises
    ------
    IndexError
        If the line has fewer than eight fields.
    ValueError
        If a numeric field cannot be converted to ``float``.
    """
    # Split on the first seven commas only, so a NAME that itself contains
    # a comma stays in one piece instead of being truncated.
    attList = inputStr.split(",", 7)

    # Replace a missing HP marker with the default HP value. The field is
    # stripped first so a padded " ?" is still recognised as missing.
    hpValue = attList[3].strip()
    if hpValue == "?":
        hpValue = avgHP.value

    # Create a row with cleaned up and converted data.
    return Row(
        MPG=float(attList[0]),
        CYLINDERS=float(attList[1]),
        DISPLACEMENT=float(attList[2]),
        HORSEPOWER=float(hpValue),
        WEIGHT=float(attList[4]),
        ACCELERATION=float(attList[5]),
        MODELYEAR=float(attList[6]),
        NAME=attList[7],
    )

**Clean up using map**

In [12]:
autoMap = dataLines.map(CleanupData)

In [13]:
autoMap.cache()

PythonRDD[5] at RDD at PythonRDD.scala:48

In [14]:
autoMap.take(10)

[Row(ACCELERATION=12.0, CYLINDERS=8.0, DISPLACEMENT=307.0, HORSEPOWER=130.0, MODELYEAR=70.0, MPG=18.0, NAME='chevrolet chevelle malibu', WEIGHT=3504.0),
 Row(ACCELERATION=11.5, CYLINDERS=8.0, DISPLACEMENT=350.0, HORSEPOWER=165.0, MODELYEAR=70.0, MPG=15.0, NAME='buick skylark 320', WEIGHT=3693.0),
 Row(ACCELERATION=11.0, CYLINDERS=8.0, DISPLACEMENT=318.0, HORSEPOWER=150.0, MODELYEAR=70.0, MPG=18.0, NAME='plymouth satellite', WEIGHT=3436.0),
 Row(ACCELERATION=12.0, CYLINDERS=8.0, DISPLACEMENT=304.0, HORSEPOWER=150.0, MODELYEAR=70.0, MPG=16.0, NAME='amc rebel sst', WEIGHT=3433.0),
 Row(ACCELERATION=10.5, CYLINDERS=8.0, DISPLACEMENT=302.0, HORSEPOWER=140.0, MODELYEAR=70.0, MPG=17.0, NAME='ford torino', WEIGHT=3449.0),
 Row(ACCELERATION=10.0, CYLINDERS=8.0, DISPLACEMENT=429.0, HORSEPOWER=198.0, MODELYEAR=70.0, MPG=15.0, NAME='ford galaxie 500', WEIGHT=4341.0),
 Row(ACCELERATION=9.0, CYLINDERS=8.0, DISPLACEMENT=454.0, HORSEPOWER=220.0, MODELYEAR=70.0, MPG=14.0, NAME='chevrolet impala', WEIGH

**Create sql dataframe**

In [15]:
autoDF = spark.createDataFrame(autoMap)

In [16]:
autoDF.show(10)

+------------+---------+------------+----------+---------+----+--------------------+------+
|ACCELERATION|CYLINDERS|DISPLACEMENT|HORSEPOWER|MODELYEAR| MPG|                NAME|WEIGHT|
+------------+---------+------------+----------+---------+----+--------------------+------+
|        12.0|      8.0|       307.0|     130.0|     70.0|18.0|chevrolet chevell...|3504.0|
|        11.5|      8.0|       350.0|     165.0|     70.0|15.0|   buick skylark 320|3693.0|
|        11.0|      8.0|       318.0|     150.0|     70.0|18.0|  plymouth satellite|3436.0|
|        12.0|      8.0|       304.0|     150.0|     70.0|16.0|       amc rebel sst|3433.0|
|        10.5|      8.0|       302.0|     140.0|     70.0|17.0|         ford torino|3449.0|
|        10.0|      8.0|       429.0|     198.0|     70.0|15.0|    ford galaxie 500|4341.0|
|         9.0|      8.0|       454.0|     220.0|     70.0|14.0|    chevrolet impala|4354.0|
|         8.5|      8.0|       440.0|     215.0|     70.0|14.0|   plymouth fury 

## 4. Data Analytics

**Describe the columns of MPG & CYLINDERS**

In [17]:
autoDF.select("MPG", "CYLINDERS").describe().show()

+-------+-----------------+------------------+
|summary|              MPG|         CYLINDERS|
+-------+-----------------+------------------+
|  count|              398|               398|
|   mean|23.51457286432161| 5.454773869346734|
| stddev|7.815984312565782|1.7010042445332125|
|    min|              9.0|               3.0|
|    max|             46.6|               8.0|
+-------+-----------------+------------------+



**Correlation between the target variable (MPG) and the feature variables**

In [18]:
# Walk the schema rather than sampling a row per column: this avoids one
# Spark job per column and does not fail when the dataframe is empty.
for colName, colType in autoDF.dtypes:
    # Correlation is only defined for the non-string (numeric) columns.
    if colType != "string":
        print("Correlation to MPG for", colName, autoDF.stat.corr("MPG", colName))

Correlation to MPG for ACCELERATION 0.42028891210165004
Correlation to MPG for CYLINDERS -0.7753962854205548
Correlation to MPG for DISPLACEMENT -0.8042028248058979
Correlation to MPG for HORSEPOWER -0.7746308409203807
Correlation to MPG for MODELYEAR 0.5792671330833091
Correlation to MPG for MPG 1.0
Correlation to MPG for WEIGHT -0.8317409332443347


## 5. Prepare data for machine learning

In [19]:
from pyspark.ml.linalg import Vectors

**A function to transform the RDD into labelled points**

In [20]:
def transformToLabeledPoint(row):
    """Turn one cleaned record into a ``(label, features)`` pair.

    The label is the record's MPG value; the features are a dense vector of
    its ACCELERATION, DISPLACEMENT and WEIGHT values, in that order.
    """
    label = row["MPG"]
    features = Vectors.dense(
        [row[field] for field in ("ACCELERATION", "DISPLACEMENT", "WEIGHT")]
    )
    return label, features

**Create the RDD of labelled points using the mapping function**

In [21]:
autoLp = autoMap.map(transformToLabeledPoint)

**Create the spark sql dataframe using the above RDD**

In [22]:
autoDF = spark.createDataFrame(autoLp, ["label", "features"])

In [23]:
autoDF.select("label", "features").show(10)

+-----+-------------------+
|label|           features|
+-----+-------------------+
| 18.0|[12.0,307.0,3504.0]|
| 15.0|[11.5,350.0,3693.0]|
| 18.0|[11.0,318.0,3436.0]|
| 16.0|[12.0,304.0,3433.0]|
| 17.0|[10.5,302.0,3449.0]|
| 15.0|[10.0,429.0,4341.0]|
| 14.0| [9.0,454.0,4354.0]|
| 14.0| [8.5,440.0,4312.0]|
| 14.0|[10.0,455.0,4425.0]|
| 15.0| [8.5,390.0,3850.0]|
+-----+-------------------+
only showing top 10 rows



## 6. Train & Assess the Model Performance

### Do a train test split

In [24]:
# Fix the seed so the 90/10 split (and every result derived from it) is
# the same on each Restart & Run All.
(trainingData, testData) = autoDF.randomSplit([0.9, 0.1], seed=42)

In [25]:
trainingData.count()

347

In [26]:
testData.count()

51

### Train the model

In [27]:
from pyspark.ml.regression import LinearRegression

**Create and fit the model**

In [28]:
lr = LinearRegression(maxIter = 100)

In [29]:
lrModel = lr.fit(trainingData)

**Print the model parameters (coefficients and intercept)**

In [30]:
print("Coefficients: "+str(lrModel.coefficients))

Coefficients: [0.157076268935,-0.00772822949454,-0.0065594059488]


In [31]:
print("Intercept: "+str(lrModel.intercept))

Intercept: 42.12744833564257


### Assess the model

**Compute prediction on test data**

In [32]:
predictions = lrModel.transform(testData)

In [33]:
predictions.show()

+-----+-------------------+------------------+
|label|           features|        prediction|
+-----+-------------------+------------------+
|  9.0|[18.5,304.0,4732.0]|11.644868594900043|
| 13.0|[12.0,302.0,3169.0]| 20.89168080377907|
| 13.0|[12.0,350.0,4274.0]|13.272582214622108|
| 13.0|[13.0,360.0,4654.0]|10.859801928069409|
| 14.0| [8.0,340.0,3609.0]| 17.08356438977651|
| 14.0|[13.5,351.0,4657.0]| 10.98821591014137|
| 14.0|[16.0,302.0,4638.0]| 11.88421854073837|
| 15.0|[10.0,383.0,3563.0]|17.367135733026018|
| 16.0| [9.5,400.0,4278.0]|12.467242443762505|
| 16.0|[12.0,304.0,3433.0]|19.144541174307953|
| 17.0|[15.5,250.0,3329.0]|20.793810726960285|
| 18.0|[11.0,318.0,3436.0]| 18.85959147460302|
| 18.0| [13.5,70.0,2124.0]|29.774823666405748|
| 18.0|[13.5,258.0,2962.0]|22.825134336341943|
| 18.5|[16.2,250.0,3645.0]|18.830991835395377|
| 19.0|[13.0,232.0,2634.0]|25.099015319937376|
| 19.0|[15.5,250.0,3302.0]|20.970914687577764|
| 21.0|[17.0,200.0,2875.0]|  24.3938069058429|
| 22.0|[18.0,

**Compute R2 coefficient**

In [34]:
from pyspark.ml.evaluation import RegressionEvaluator

In [35]:
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="r2")

In [36]:
evaluator.evaluate(predictions)

0.6846266229167892