In [1]:
import numpy as np
import csv

import pyspark.mllib.regression 
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.mllib.regression import LabeledPoint
from sklearn.model_selection import train_test_split
from pyspark.sql import Row
from pyspark.sql.types import *


In [2]:

NUM_FEATURES = 13
NUM_DATAPOINTS = 10000


In [3]:
def create_spark_context():
    """
    Creates a spark creates a spark context
    Package dependencies: pyspark.SparkContext
    Input: None
    Returns: sc - SparkContext object
    """
    conf = (SparkConf()
            .setMaster('local')
            .setAppName('RfClassifier')
            .set("spark.executor.memory", "2g"))
    sc = SparkContext(conf=conf)

    return sc

In [71]:
sc.stop()

In [72]:
sc = create_spark_context()



In [78]:
X = []
y = []

with open('kc_house_data.csv') as csvfile:
    reader = csv.reader(csvfile)
    for i, row in enumerate(reader):         
        if(i != 0):
            xi = [None]*13
            xi[0] = row[3]
            xi[1] = row[4]
            xi[2] = row[5]
            xi[3] = row[6]
            xi[4] = row[7]
            xi[5] = row[8]
            xi[6] = row[9]
            xi[7] = row[10]
            xi[8] = row[11]
            xi[9] = row[12]
            xi[10] = row[13]
            xi[11] = row[14]
            xi[12] = row[15]
            #y.append(str(int(float(row[2])>530000)))
            y.append(row[2])
            X.append(xi)
                
            
        if(i == NUM_DATAPOINTS):
            break
            


In [79]:
data = []
for i, yi in enumerate(y):
    xi = X[i]
    xi_string = ""
    xi_string+=str(yi)
    for x in xi:
        xi_string+=','
        xi_string+=str(x)
        
        
    
    data.append(xi_string)

rdd = sc.parallelize(data)

In [80]:
rdd = rdd.map(lambda line: line.split(","))

In [81]:
spark = SparkSession(sc)

In [82]:
df = rdd.map(lambda line: Row(price=line[0],
                              bedrooms=line[1], 
                              bathrooms=line[2], 
                              sqft_living=line[3],
                              sqft_lot=line[4], 
                              floors=line[5], 
                              waterfront=line[6],
                              view=line[7], 
                              condition=line[8], 
                              grade=line[9],
                              sqft_above=line[10], 
                              sqft_basement=line[11], 
                              yr_built=line[12],
                              yr_renovated=line[13])).toDF()
#df.printSchema()

In [83]:
def convertColumn(df, names, newType):
    for name in names: 
        df = df.withColumn(name, df[name].cast(newType))
    return df 

# Assign all column names to `columns`
columns = ['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated']

# Conver the `df` columns to `FloatType()`
df = convertColumn(df, columns, FloatType())


In [84]:
from pyspark.ml.linalg import DenseVector

# Define the `input_data` 
input_data = df.rdd.map(lambda x: (x[0], DenseVector(x[1:])))

# Replace `df` with the new DataFrame
df = spark.createDataFrame(input_data, ["label", "features"])

In [85]:
from pyspark.ml.feature import StandardScaler

# Initialize the `standardScaler`
standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled")

# Fit the DataFrame to the scaler
scaler = standardScaler.fit(df)

# Transform the data in `df` with the scaler
scaled_df = scaler.transform(df)

# Inspect the result
scaled_df.take(2)

[Row(label=1.0, features=DenseVector([3.0, 3.0, 1.0, 7.0, 221900.0, 1180.0, 0.0, 1180.0, 5650.0, 0.0, 0.0, 1955.0, 0.0]), features_scaled=DenseVector([3.275, 4.5025, 1.9536, 6.0042, 0.5889, 1.4552, 0.0, 1.295, 0.1254, 0.0, 0.0, 69.8507, 0.0])),
 Row(label=2.25, features=DenseVector([3.0, 3.0, 2.0, 7.0, 538000.0, 2170.0, 400.0, 2570.0, 7242.0, 0.0, 0.0, 1951.0, 1991.0]), features_scaled=DenseVector([3.275, 4.5025, 3.9072, 6.0042, 1.4278, 2.6761, 0.8872, 2.8205, 0.1608, 0.0, 0.0, 69.7077, 4.7653]))]

In [86]:
train_data, test_data = df.randomSplit([.7,.3],seed=1234)

In [68]:
from pyspark.ml.regression import LinearRegression

# Initialize `lr`
lr = LinearRegression(labelCol="label", maxIter=10)

# Fit the data to the model
linearModel = lr.fit(train_data)


In [69]:
predicted = linearModel.transform(test_data)

# Extract the predictions and the "known" correct labels
predictions = predicted.select("prediction").rdd.map(lambda x: x[0])
labels = predicted.select("label").rdd.map(lambda x: x[0])

# Zip `predictions` and `labels` into a list
predictionAndLabel = predictions.zip(labels).collect()

# Print out first 5 instances of `predictionAndLabel` 
predictionAndLabel[:5]

[(1.6478412610750475, 0.0),
 (1.9515305444098665, 0.0),
 (2.941477850227784, 0.0),
 (0.8366467027631117, 0.75),
 (0.9956107865808672, 0.75)]

In [70]:
print(linearModel.summary.rootMeanSquaredError)

# Get the R2
linearModel.summary.r2

0.4198623388850159


0.7074292319832243