In [24]:
import numpy as np
import csv

import pyspark.mllib.regression 
from pyspark import SparkConf, SparkContext
from pyspark.ml.classification import RandomForestClassifier as RF
from pyspark.sql import SparkSession
from pyspark.mllib.regression import LabeledPoint
from sklearn.model_selection import train_test_split
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils

In [2]:

NUM_FEATURES = 13
NUM_DATAPOINTS = 10000


In [3]:
def create_spark_context():
    """
    Creates a spark creates a spark context
    Package dependencies: pyspark.SparkContext
    Input: None
    Returns: sc - SparkContext object
    """
    conf = (SparkConf()
            .setMaster('local')
            .setAppName('RfClassifier')
            .set("spark.executor.memory", "2g"))
    sc = SparkContext(conf=conf)

    return sc

In [71]:
sc.stop()

In [4]:
sc = create_spark_context()



In [5]:
X = []
y = []

with open('kc_house_data.csv') as csvfile:
    reader = csv.reader(csvfile)
    for i, row in enumerate(reader):         
        if(i != 0):
            xi = [None]*13
            xi[0] = row[3]
            xi[1] = row[4]
            xi[2] = row[5]
            xi[3] = row[6]
            xi[4] = row[7]
            xi[5] = row[8]
            xi[6] = row[9]
            xi[7] = row[10]
            xi[8] = row[11]
            xi[9] = row[12]
            xi[10] = row[13]
            xi[11] = row[14]
            xi[12] = row[15]
            y.append(str(int(float(row[2])>530000)))
            #y.append(row[2])
            X.append(xi)
                
            
        if(i == NUM_DATAPOINTS):
            break
            


In [6]:
data = []
for i, yi in enumerate(y):
    xi = X[i]
    xi_string = ""
    xi_string+=str(yi)
    for x in xi:
        xi_string+=','
        xi_string+=str(x)
        
        
    
    data.append(xi_string)

rdd = sc.parallelize(data)

In [7]:
rdd = rdd.map(lambda line: line.split(","))

In [8]:
spark = SparkSession(sc)

In [9]:
df = rdd.map(lambda line: Row(price=line[0],
                              bedrooms=line[1], 
                              bathrooms=line[2], 
                              sqft_living=line[3],
                              sqft_lot=line[4], 
                              floors=line[5], 
                              waterfront=line[6],
                              view=line[7], 
                              condition=line[8], 
                              grade=line[9],
                              sqft_above=line[10], 
                              sqft_basement=line[11], 
                              yr_built=line[12],
                              yr_renovated=line[13])).toDF()
#df.printSchema()

In [10]:
def convertColumn(df, names, newType):
    for name in names: 
        df = df.withColumn(name, df[name].cast(newType))
    return df 

# Assign all column names to `columns`
columns = ['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated']

# Conver the `df` columns to `FloatType()`
df = convertColumn(df, columns, FloatType())


In [11]:
from pyspark.ml.linalg import DenseVector

# Define the `input_data` 
input_data = df.rdd.map(lambda x: (x[0], DenseVector(x[1:])))

# Replace `df` with the new DataFrame
df = spark.createDataFrame(input_data, ["label", "features"])

In [12]:
from pyspark.ml.feature import StandardScaler

# Initialize the `standardScaler`
standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled")

# Fit the DataFrame to the scaler
scaler = standardScaler.fit(df)

# Transform the data in `df` with the scaler
scaled_df = scaler.transform(df)

# Inspect the result
scaled_df.take(2)

[Row(label=1.0, features=DenseVector([3.0, 3.0, 1.0, 7.0, 221900.0, 1180.0, 0.0, 1180.0, 5650.0, 0.0, 0.0, 1955.0, 0.0]), features_scaled=DenseVector([3.275, 4.5025, 1.9536, 6.0042, 0.5889, 1.4552, 0.0, 1.295, 0.1254, 0.0, 0.0, 69.8507, 0.0])),
 Row(label=2.25, features=DenseVector([3.0, 3.0, 2.0, 7.0, 538000.0, 2170.0, 400.0, 2570.0, 7242.0, 0.0, 0.0, 1951.0, 1991.0]), features_scaled=DenseVector([3.275, 4.5025, 3.9072, 6.0042, 1.4278, 2.6761, 0.8872, 2.8205, 0.1608, 0.0, 0.0, 69.7077, 4.7653]))]

In [13]:
train_data, test_data = df.randomSplit([.7,.3],seed=1234)

In [14]:
from pyspark.ml.regression import LinearRegression

# Initialize `lr`
lr = LinearRegression(labelCol="label", maxIter=10)

# Fit the data to the model
linearModel = lr.fit(train_data)


In [28]:
predicted = linearModel.transform(test_data)

# Extract the predictions and the "known" correct labels
predictions = predicted.select("prediction").rdd.map(lambda x: x[0])
labels = predicted.select("label").rdd.map(lambda x: x[0])

# Zip `predictions` and `labels` into a list
predictionAndLabel = predictions.zip(labels).collect()

# Print out first 5 instances of `predictionAndLabel` 
#predictionAndLabel[:5]
predictionAndLabel[:10]

[(1.6478412610750475, 0.0),
 (1.9515305444098665, 0.0),
 (2.941477850227784, 0.0),
 (0.8366467027631117, 0.75),
 (0.9956107865808672, 0.75),
 (0.9453443341760526, 0.75),
 (0.8645778693635808, 0.75),
 (1.1815014679021267, 0.75),
 (1.1629211662280081, 0.75),
 (1.18107235880575, 0.75)]

In [16]:
print(linearModel.summary.rootMeanSquaredError)

# Get the R2
linearModel.summary.r2

0.4198623388850159


0.7074292319832243

In [25]:
rf = RF(labelCol='label', featuresCol='features',numTrees=3)
fit = rf.fit(train_data)
transformed = fit.transform(test_data)

Py4JJavaError: An error occurred while calling o216.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 16.0 failed 1 times, most recent failure: Lost task 0.0 in stage 16.0 (TID 16, localhost, executor driver): java.lang.IllegalArgumentException: requirement failed: Classifier was given dataset with invalid label 0.5.  Labels must be integers in range [0, 9).
	at scala.Predef$.require(Predef.scala:224)
	at org.apache.spark.ml.classification.Classifier$$anonfun$extractLabeledPoints$2.apply(Classifier.scala:84)
	at org.apache.spark.ml.classification.Classifier$$anonfun$extractLabeledPoints$2.apply(Classifier.scala:82)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1801)
	at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1158)
	at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1158)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2062)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2062)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1714)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2043)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2062)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2087)
	at org.apache.spark.rdd.RDD.count(RDD.scala:1158)
	at org.apache.spark.ml.tree.impl.DecisionTreeMetadata$.buildMetadata(DecisionTreeMetadata.scala:118)
	at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:105)
	at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:139)
	at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:45)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:82)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.IllegalArgumentException: requirement failed: Classifier was given dataset with invalid label 0.5.  Labels must be integers in range [0, 9).
	at scala.Predef$.require(Predef.scala:224)
	at org.apache.spark.ml.classification.Classifier$$anonfun$extractLabeledPoints$2.apply(Classifier.scala:84)
	at org.apache.spark.ml.classification.Classifier$$anonfun$extractLabeledPoints$2.apply(Classifier.scala:82)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1801)
	at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1158)
	at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1158)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2062)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2062)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
