
Import modules and create spark session

In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession

In [2]:
#import modules
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
import pandas as pd

#create Spark session
appName = "Regression in Spark"
spark = SparkSession.builder.appName(appName).config("spark.some.config.option", "some-value").getOrCreate()

# Read data file into Spark dataFrame

In [3]:

flightSchema = StructType([
  StructField("DayofMonth", IntegerType(), False),
  StructField("DayOfWeek", IntegerType(), False),
  StructField("Carrier", StringType(), False),
  StructField("OriginAirportID", IntegerType(), False),
  StructField("DestAirportID", IntegerType(), False),
  StructField("DepDelay", IntegerType(), False),
  StructField("ArrDelay", IntegerType(), False),
])
#read csv data with our defined schema
flightDataFrame = spark.read.csv('C:/Users/hossein/Downloads/dataset/flights.csv',schema=flightSchema,header=True)
flightDataFrame.show(3)

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          11433|        13303|      -3|       1|
|        19|        5|     DL|          14869|        12478|       0|      -8|
|        19|        5|     DL|          14057|        14869|      -4|     -15|
+----------+---------+-------+---------------+-------------+--------+--------+
only showing top 3 rows



# Select important data for regression feature

In [4]:
#select related column data for our regression input features
data = flightDataFrame.select("DayofMonth", "DayOfWeek", 
                              "OriginAirportID", "DestAirportID", 
                              "DepDelay", "ArrDelay")
data.show(3)

+----------+---------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+---------------+-------------+--------+--------+
|        19|        5|          11433|        13303|      -3|       1|
|        19|        5|          14869|        12478|       0|      -8|
|        19|        5|          14057|        14869|      -4|     -15|
+----------+---------+---------------+-------------+--------+--------+
only showing top 3 rows



In [5]:
data.count()

2702218

# Divide data into training and testing data

In [6]:
#divide data, 70% for training, 30% for testing
dividedData = data.randomSplit([0.8, 0.2] ) 
trainingData = dividedData[0] #index 0 = data training
testingData = dividedData[1] #index 1 = data testing
train_rows = trainingData.count()
test_rows = testingData.count()
print ("Training data rows:", train_rows, "; Testing data rows:", test_rows)

Training data rows: 2161910 ; Testing data rows: 540308


In [7]:
trainingData.show(5)

+----------+---------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+---------------+-------------+--------+--------+
|         1|        1|          10140|        10397|      -4|     -11|
|         1|        1|          10140|        10397|      -2|     -17|
|         1|        1|          10140|        10821|       8|      -9|
|         1|        1|          10140|        11259|      -2|     -14|
|         1|        1|          10140|        11259|       0|     -12|
+----------+---------+---------------+-------------+--------+--------+
only showing top 5 rows



# Prepare training data

In [8]:
#define an assembler
assembler = VectorAssembler(inputCols = [
    "DayofMonth", "DayOfWeek", "OriginAirportID", "DestAirportID", 
    "DepDelay"], outputCol="features")
#change our features into one column using our defined assembler
trainingDataFinal = assembler.transform(trainingData).select(
    col("features"), (col("ArrDelay").cast("Int").alias("label")))
trainingDataFinal.show(truncate=False , n=3)

+------------------------------+-----+
|features                      |label|
+------------------------------+-----+
|[1.0,1.0,10140.0,10397.0,-4.0]|-11  |
|[1.0,1.0,10140.0,10397.0,-2.0]|-17  |
|[1.0,1.0,10140.0,10821.0,8.0] |-9   |
+------------------------------+-----+
only showing top 3 rows



# Train our regression model using training data

In [9]:
#call Spark linear regression we import before
algoritma = LinearRegression(
    labelCol="label",featuresCol="features", 
    maxIter=10, regParam=0.3)
#train the model
model = algoritma.fit(trainingDataFinal)
print ("Regression model is trained!")

Regression model is trained!


# Prepare testing data

In [10]:
#change our feature data into one column using our defined assembler
#just like what we did before in the training data
testingDataFinal = assembler.transform(
    testingData).select(
    col("features"), (col("ArrDelay")).cast("Int").alias("trueLabel"))
testingDataFinal.show(truncate=False, n=2)

+------------------------------+---------+
|features                      |trueLabel|
+------------------------------+---------+
|[1.0,1.0,10140.0,11259.0,-3.0]|-11      |
|[1.0,1.0,10140.0,11259.0,-1.0]|-11      |
+------------------------------+---------+
only showing top 2 rows



# Predict the testing data using our trained model

In [11]:
#predict testing data using our model
prediction = model.transform(testingDataFinal)
#show some prediction results
prediction.show(10)


+--------------------+---------+-------------------+
|            features|trueLabel|         prediction|
+--------------------+---------+-------------------+
|[1.0,1.0,10140.0,...|      -11| -6.741786539729593|
|[1.0,1.0,10140.0,...|      -11| -4.747268178110664|
|[1.0,1.0,10140.0,...|       41|  31.15406233103006|
|[1.0,1.0,10140.0,...|       -8| -7.746704913558036|
|[1.0,1.0,10140.0,...|      -13|-13.731652578963729|
|[1.0,1.0,10140.0,...|       -2| -5.753579132488012|
|[1.0,1.0,10140.0,...|       18| 27.945970610005077|
|[1.0,1.0,10140.0,...|      -16| -8.970026336806427|
|[1.0,1.0,10140.0,...|      812|  831.7194630855722|
|[1.0,1.0,10140.0,...|       -9| -6.122845074706012|
+--------------------+---------+-------------------+
only showing top 10 rows



# Calculate our model performance

In [12]:
#import evaluator module for regression
from pyspark.ml.evaluation import RegressionEvaluator

#define our evaluator
evaluator = RegressionEvaluator(
    labelCol="trueLabel", predictionCol="prediction", metricName="rmse")
#calculate RMSE of our trained model
rmse = evaluator.evaluate(prediction)
print ("Root Mean Square Error (RMSE):", rmse)

Root Mean Square Error (RMSE): 13.196052557836095


# Classification-LogisticRegression

# Select important data for classification features and change arrival delay into binary class "late" vs "not late"

In [13]:
df = spark.read.csv('C:/Users/hossein/Downloads/dataset/flights.csv',schema=flightSchema,header=True)

In [14]:
data =df.select("DayofMonth", "DayOfWeek", "OriginAirportID", "DestAirportID", 
"DepDelay", ((col("ArrDelay") > 15).cast("Int").alias("Late")))

data.show(10)

+----------+---------+---------------+-------------+--------+----+
|DayofMonth|DayOfWeek|OriginAirportID|DestAirportID|DepDelay|Late|
+----------+---------+---------------+-------------+--------+----+
|        19|        5|          11433|        13303|      -3|   0|
|        19|        5|          14869|        12478|       0|   0|
|        19|        5|          14057|        14869|      -4|   0|
|        19|        5|          15016|        11433|      28|   1|
|        19|        5|          11193|        12892|      -6|   0|
|        19|        5|          10397|        15016|      -1|   0|
|        19|        5|          15016|        10397|       0|   0|
|        19|        5|          10397|        14869|      15|   1|
|        19|        5|          10397|        10423|      33|   1|
|        19|        5|          11278|        10397|     323|   1|
+----------+---------+---------------+-------------+--------+----+
only showing top 10 rows



# Divide data into training and testing data

In [15]:
#divide data, 70% for training, 30% for testing
dividedData = data.randomSplit([0.8, 0.2],seed=10) 
trainingData = dividedData[0] #index 0 = data training
testingData = dividedData[1] #index 1 = data testing
train_rows = trainingData.count()
test_rows = testingData.count()
print ("Training data rows:", train_rows, "; Testing data rows:", test_rows)

Training data rows: 2161292 ; Testing data rows: 540926


# Prepare training data

In [16]:
assembler = VectorAssembler(inputCols = ["DayofMonth", "DayOfWeek", "OriginAirportID", "DestAirportID", 
    "DepDelay"], outputCol="features")
trainingDataFinal = assembler.transform(
    trainingData).select(col("features"), col("Late").alias("label"))
trainingDataFinal.show(truncate=False, n=5)

+------------------------------+-----+
|features                      |label|
+------------------------------+-----+
|[1.0,1.0,10140.0,10397.0,-4.0]|0    |
|[1.0,1.0,10140.0,10821.0,8.0] |0    |
|[1.0,1.0,10140.0,11259.0,-2.0]|0    |
|[1.0,1.0,10140.0,11259.0,-1.0]|0    |
|[1.0,1.0,10140.0,11259.0,0.0] |0    |
+------------------------------+-----+
only showing top 5 rows



# Train our classifier model using training data

In [17]:
classifier = LogisticRegression(
    labelCol="label",featuresCol="features",maxIter=10,regParam=0.3)
#train our classifier
model = classifier.fit(trainingDataFinal)
print ("Classifier model is trained!")

Classifier model is trained!


In [18]:
# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(model.coefficients))
print("Intercept: " + str(model.intercept))

Coefficients: [0.0005653235784771402,-0.0039562389945402955,-1.1156984891306539e-05,-2.863345324179474e-06,0.013976316968702994]
Intercept: -1.392177156557798


In [19]:
# We can also use the multinomial family for binary classification
mlr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, family="multinomial")

In [20]:
# Fit the model
mlrModel = mlr.fit(trainingDataFinal)

In [21]:
# Print the coefficients and intercepts for logistic regression with multinomial family
print("Multinomial coefficients: " + str(mlrModel.coefficientMatrix))
print("Multinomial intercepts: " + str(mlrModel.interceptVector))

Multinomial coefficients: 2 X 5 CSRMatrix
(0,4) -0.0001
(1,4) 0.0001
Multinomial intercepts: [0.6964283915935823,-0.6964283915935823]


In [22]:
# Extract the summary from the returned LogisticRegressionModel instance trained
# in the earlier example
trainingSummary = mlrModel.summary
trainingSummary.roc.show()

+---+--------------------+
|FPR|                 TPR|
+---+--------------------+
|0.0|                 0.0|
|0.0|1.857829590580804E-5|
|0.0|3.715659181161608E-5|
|0.0|5.805717470565012...|
|0.0|7.895775759968417E-5|
|0.0|1.021806274819442...|
|0.0|1.254034973642042...|
|0.0|1.463040802582383E-4|
|0.0|1.648823761640463...|
|0.0|1.834606720698544E-4|
|0.0|2.066835419521144...|
|0.0|2.275841248461485E-4|
|0.0|2.554515687048605...|
|0.0|2.786744385871206E-4|
|0.0|3.018973084693806...|
|0.0|3.135087434105106...|
|0.0|3.413761872692227E-4|
|0.0|3.622767701632567...|
|0.0|3.878219270337428...|
|0.0|4.156893708924549E-4|
+---+--------------------+
only showing top 20 rows



In [23]:
 #Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

+---+--------------------+
|FPR|                 TPR|
+---+--------------------+
|0.0|                 0.0|
|0.0|1.857829590580804E-5|
|0.0|3.715659181161608E-5|
|0.0|5.805717470565012...|
|0.0|7.895775759968417E-5|
|0.0|1.021806274819442...|
|0.0|1.254034973642042...|
|0.0|1.463040802582383E-4|
|0.0|1.648823761640463...|
|0.0|1.834606720698544E-4|
|0.0|2.066835419521144...|
|0.0|2.275841248461485E-4|
|0.0|2.554515687048605...|
|0.0|2.786744385871206E-4|
|0.0|3.018973084693806...|
|0.0|3.135087434105106...|
|0.0|3.413761872692227E-4|
|0.0|3.622767701632567...|
|0.0|3.878219270337428...|
|0.0|4.156893708924549E-4|
+---+--------------------+
only showing top 20 rows

areaUnderROC: 0.9182793015801936


# Prepare testing data

In [24]:
testingDataFinal = assembler.transform(
    testingData).select(col("features"), col("Late").alias("trueLabel"))
testingDataFinal.show(3)


+--------------------+---------+
|            features|trueLabel|
+--------------------+---------+
|[1.0,1.0,10140.0,...|        0|
|[1.0,1.0,10140.0,...|        0|
|[1.0,1.0,10140.0,...|        0|
+--------------------+---------+
only showing top 3 rows



# Predict the testing data using our trained model

In [25]:
prediction = model.transform(testingDataFinal)
predictionFinal = prediction.select(
    "features", "prediction", "probability", "trueLabel")
predictionFinal.show(truncate=False, n=10)
prediction.show(truncate=False, n=10)

+-------------------------------+----------+----------------------------------------+---------+
|features                       |prediction|probability                             |trueLabel|
+-------------------------------+----------+----------------------------------------+---------+
|[1.0,1.0,10140.0,10397.0,-2.0] |0.0       |[0.8272730425926873,0.1727269574073127] |0        |
|[1.0,1.0,10140.0,11259.0,-3.0] |0.0       |[0.8296102079456423,0.17038979205435775]|0        |
|[1.0,1.0,10140.0,11259.0,0.0]  |0.0       |[0.8236010855806822,0.17639891441931776]|0        |
|[1.0,1.0,10140.0,11259.0,35.0] |0.0       |[0.7411141075624572,0.25888589243754273]|1        |
|[1.0,1.0,10140.0,11292.0,-1.0] |0.0       |[0.8256360260370816,0.17436397396291847]|0        |
|[1.0,1.0,10140.0,11292.0,4.0]  |0.0       |[0.8153458143542055,0.18465418564579456]|0        |
|[1.0,1.0,10140.0,12266.0,838.0]|1.0       |[3.83647958813945E-5,0.9999616352041186]|1        |
|[1.0,1.0,10140.0,12889.0,-2.0] |0.0    

# Calculate our model performance

In [26]:
correctPrediction = predictionFinal.filter(
    predictionFinal['prediction'] == predictionFinal['trueLabel']).count()
totalData = predictionFinal.count()
print("correct prediction:", correctPrediction, ", total data:", totalData, 
      ", accuracy:", correctPrediction/totalData)

correct prediction: 446197 , total data: 540926 , accuracy: 0.8248762307598452


# Classification-Multinomial logistic regression

In [27]:
classifier = LogisticRegression(
    labelCol="label",featuresCol="features",maxIter=10,regParam=0.3)
#train our classifier
model = classifier.fit(trainingDataFinal)
print ("Classifier model is trained!")

Classifier model is trained!


# RandomForestClassifier

In [28]:
from pyspark.ml.classification import RandomForestClassifier

model2 = RandomForestClassifier(
    numTrees=3, maxDepth=5, seed=42, labelCol="label",featuresCol="features")
model2 = model2.fit(trainingDataFinal)
print ("Model is trained!")

Model is trained!


In [29]:
prediction = model2.transform(testingDataFinal)
predictionFinal = prediction.select(
    "features", "prediction", "probability", "trueLabel")
predictionFinal.show(truncate=False, n=3)
correctPrediction = predictionFinal.filter(
    predictionFinal['prediction'] == predictionFinal['trueLabel']).count()
totalData = predictionFinal.count()
print("correct prediction:", correctPrediction, ", total data:", 
      totalData, ", accuracy", correctPrediction/totalData)


+------------------------------+----------+----------------------------------------+---------+
|features                      |prediction|probability                             |trueLabel|
+------------------------------+----------+----------------------------------------+---------+
|[1.0,1.0,10140.0,10397.0,-2.0]|0.0       |[0.9273749301915157,0.07262506980848434]|0        |
|[1.0,1.0,10140.0,11259.0,-3.0]|0.0       |[0.9273749301915157,0.07262506980848434]|0        |
|[1.0,1.0,10140.0,11259.0,0.0] |0.0       |[0.9273749301915157,0.07262506980848434]|0        |
+------------------------------+----------+----------------------------------------+---------+
only showing top 3 rows

correct prediction: 501376 , total data: 540926 , accuracy 0.9268846385642399


In [30]:
# Select example rows to display.
prediction.select("prediction", "trueLabel", "features").show(10)


+----------+---------+--------------------+
|prediction|trueLabel|            features|
+----------+---------+--------------------+
|       0.0|        0|[1.0,1.0,10140.0,...|
|       0.0|        0|[1.0,1.0,10140.0,...|
|       0.0|        0|[1.0,1.0,10140.0,...|
|       1.0|        1|[1.0,1.0,10140.0,...|
|       0.0|        0|[1.0,1.0,10140.0,...|
|       0.0|        0|[1.0,1.0,10140.0,...|
|       1.0|        1|[1.0,1.0,10140.0,...|
|       0.0|        0|[1.0,1.0,10140.0,...|
|       0.0|        0|[1.0,1.0,10140.0,...|
|       0.0|        0|[1.0,1.0,10140.0,...|
+----------+---------+--------------------+
only showing top 10 rows



In [34]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
labelCol="trueLabel", predictionCol="prediction", metricName="accuracy")

In [35]:
accuracy = evaluator.evaluate(prediction)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.0731154


# DecisionTreeClassifier

In [36]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [37]:
#read csv data with our defined schema
flightDataFrame = spark.read.csv('C:/Users/hossein/Downloads/dataset/flights.csv',schema=flightSchema,header=True)
assembler = VectorAssembler(inputCols = ["DayofMonth", "DayOfWeek", "OriginAirportID", "DestAirportID", 
    "DepDelay"], outputCol="features")
trainingDataFinal = assembler.transform(
    trainingData).select(col("features"), col("Late").alias("label"))
trainingDataFinal.show(truncate=False, n=5)



+------------------------------+-----+
|features                      |label|
+------------------------------+-----+
|[1.0,1.0,10140.0,10397.0,-4.0]|0    |
|[1.0,1.0,10140.0,10821.0,8.0] |0    |
|[1.0,1.0,10140.0,11259.0,-2.0]|0    |
|[1.0,1.0,10140.0,11259.0,-1.0]|0    |
|[1.0,1.0,10140.0,11259.0,0.0] |0    |
+------------------------------+-----+
only showing top 5 rows



In [38]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = trainingDataFinal.randomSplit([0.7, 0.3])

In [39]:
model2 = DecisionTreeClassifier(seed=42, labelCol="label",featuresCol="features")
model2 = model2.fit(trainingData)
print ("Model is trained!")

Model is trained!


In [40]:
prediction = model2.transform(testingDataFinal)
predictionFinal = prediction.select(
    "features", "prediction", "probability", "trueLabel")


In [41]:
# Select example rows to display.
prediction.select("prediction", "trueLabel", "features").show(10)

+----------+---------+--------------------+
|prediction|trueLabel|            features|
+----------+---------+--------------------+
|       0.0|        0|[1.0,1.0,10140.0,...|
|       0.0|        0|[1.0,1.0,10140.0,...|
|       0.0|        0|[1.0,1.0,10140.0,...|
|       1.0|        1|[1.0,1.0,10140.0,...|
|       0.0|        0|[1.0,1.0,10140.0,...|
|       0.0|        0|[1.0,1.0,10140.0,...|
|       1.0|        1|[1.0,1.0,10140.0,...|
|       0.0|        0|[1.0,1.0,10140.0,...|
|       0.0|        0|[1.0,1.0,10140.0,...|
|       0.0|        0|[1.0,1.0,10140.0,...|
+----------+---------+--------------------+
only showing top 10 rows



In [46]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="trueLabel", predictionCol="prediction", metricName="accuracy")

In [47]:
accuracy = evaluator.evaluate(prediction)
print("Test Error = %g " % (1.0 - accuracy))

Test Error = 0.0731283 


# Gradient-boosted tree classifier

In [None]:
from pyspark.ml.classification import GBTClassifier
#read csv data with our defined schema
flightDataFrame = spark.read.csv('C:/Users/hossein/Downloads/dataset/flights.csv',schema=flightSchema,header=True)
assembler = VectorAssembler(inputCols = ["DayofMonth", "DayOfWeek", "OriginAirportID", "DestAirportID", 
    "DepDelay"], outputCol="features")
trainingDataFinal = assembler.transform(
    trainingData).select(col("features"), col("Late").alias("label"))
trainingDataFinal.show(truncate=False, n=5)

In [49]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = trainingDataFinal.randomSplit([0.7, 0.3])

In [50]:
# Train a GBT model.
gbt = GBTClassifier(labelCol="label",featuresCol="features", maxIter=10)
model2 = gbt.fit(trainingData)
print ("Model is trained!")

Model is trained!


In [51]:
# Make predictions.
predictions = model.transform(testData)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [52]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="trueLabel", predictionCol="prediction", metricName="accuracy")

In [57]:
#accuracy = evaluator.evaluate(predictions)

# Clustering

In [None]:
#import modules
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import ClusteringEvaluator

In [None]:
#read csv file using automatically inferred schema
customers = spark.read.csv('C:/Users/hossein/Downloads/dataset/customers.csv', inferSchema=True, header=True)
customers.show(3)

# Prepare the data

In [None]:
assembler = VectorAssembler(inputCols = [
    "Age", "MaritalStatus", "IncomeRange", "Gender", "TotalChildren", 
    
    "ChildrenAtHome", "Education", "Occupation", "HomeOwner", "Cars"], 
                            outputCol="features")
data = assembler.transform(customers).select('CustomerName', 'features')
data.show(truncate = False, n=10)

# Create k-Means clustering model

In [None]:
kmeans = KMeans(featuresCol=assembler.getOutputCol(), predictionCol="cluster", k=5)
model = kmeans.fit(data)
print ("Model is successfully trained!")


# Print centroid for each cluster

In [None]:
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)
    
    

# Cluster the data

In [None]:
prediction = model.transform(data)#cluster given data
prediction.groupBy("cluster").count().orderBy("cluster").show()#count members in each cluster

prediction.select('CustomerName', 'cluster').show(10)#show several clustered data


In [None]:
# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()

In [55]:
silhouette = evaluator.evaluate(prediction)
print("Silhouette with squared euclidean distance = " + str(silhouette))


Silhouette with squared euclidean distance = 0.9268716977923043
