In [1]:
!pip install pyspark
!pip install findspark

# Import SparkSession
from pyspark.sql import SparkSession

# Create a Spark Session
spark = SparkSession.builder.master("local[*]").getOrCreate()

# Check Spark Session Information
spark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824025 sha256=eee74044851669f662f0363db457a03a5fba7baacc1edbcc400d5c389d9a4da0
  Stored in directory: /root/.cache/pip/wheels/6c/e3/9b/0525ce8a69478916513509d43693511463c6468db0de237c86
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [3]:
# (1) Import the required Python dependencies
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [4]:
# (2) Instantiate a Spark Context
sqlContext = SQLContext(spark)



In [7]:
# (3) Load the Congressional Voting dataset (data/congressional-voting-data/house-votes-84.csv) into a Spark DataFrame
schema = StructType([
    StructField("party", StringType()),
    StructField("handicapped_infants", StringType()),
    StructField("water_project_cost_sharing", StringType()),
    StructField("adoption_of_the_budget_resolution", StringType()),
    StructField("physician_fee_freeze", StringType()),
    StructField("el_salvador_aid", StringType()),
    StructField("religious_groups_in_schools", StringType()),
    StructField("anti_satellite_test_ban", StringType()),
    StructField("aid_to_nicaraguan_contras", StringType()),
    StructField("mx_missile", StringType()),
    StructField("immigration", StringType()),
    StructField("synfuels_corporation_cutback", StringType()),
    StructField("education_spending", StringType()),
    StructField("superfund_right_to_sue", StringType()),
    StructField("crime", StringType()),
    StructField("duty_free_exports", StringType()),
    StructField("export_administration_act_south_africa", StringType())
])

congressional_voting_df = sqlContext.read.format('com.databricks.spark.csv').schema(schema).options(header = 'false', inferschema = 'false').load('/content/house-votes-84.data')
congressional_voting_df.show(2)

+----------+-------------------+--------------------------+---------------------------------+--------------------+---------------+---------------------------+-----------------------+-------------------------+----------+-----------+----------------------------+------------------+----------------------+-----+-----------------+--------------------------------------+
|     party|handicapped_infants|water_project_cost_sharing|adoption_of_the_budget_resolution|physician_fee_freeze|el_salvador_aid|religious_groups_in_schools|anti_satellite_test_ban|aid_to_nicaraguan_contras|mx_missile|immigration|synfuels_corporation_cutback|education_spending|superfund_right_to_sue|crime|duty_free_exports|export_administration_act_south_africa|
+----------+-------------------+--------------------------+---------------------------------+--------------------+---------------+---------------------------+-----------------------+-------------------------+----------+-----------+----------------------------+--------

In [8]:
# (4) Index the relevant categorical and label variables using a Pipeline of stages
categorical_columns = ['handicapped_infants', 'water_project_cost_sharing', 'adoption_of_the_budget_resolution', 'physician_fee_freeze', 'el_salvador_aid', 'religious_groups_in_schools', 'anti_satellite_test_ban', 'aid_to_nicaraguan_contras', 'mx_missile', 'immigration', 'synfuels_corporation_cutback', 'education_spending', 'superfund_right_to_sue', 'crime', 'duty_free_exports', 'export_administration_act_south_africa']
pipeline_stages = []
for categorial_column in categorical_columns:
    string_indexer = StringIndexer(inputCol = categorial_column, outputCol = categorial_column + 'Index')
    encoder = OneHotEncoder(inputCols = [string_indexer.getOutputCol()], outputCols=[categorial_column + "classVec"])
    pipeline_stages += [string_indexer, encoder]
    
label_string_idx = StringIndexer(inputCol = 'party', outputCol = 'label')
pipeline_stages += [label_string_idx]
vector_assembler_inputs = [c + "classVec" for c in categorical_columns]
vector_assembler = VectorAssembler(inputCols = vector_assembler_inputs, outputCol = "features")
pipeline_stages += [vector_assembler]

In [9]:
# (5) Generate Input Feature Vectors from the Raw Spark DataFrame by executing the previously constructed Pipeline
pipeline = Pipeline(stages = pipeline_stages)
pipeline_model = pipeline.fit(congressional_voting_df)
label_column = 'label'
congressional_voting_features_df = pipeline_model.transform(congressional_voting_df).select(['features', label_column, 'party'])
pd.DataFrame(congressional_voting_features_df.take(5), columns=congressional_voting_features_df.columns).transpose()

Unnamed: 0,0,1,2,3,4
features,"(1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, ...","(1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, ...","(0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, ...","(1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, ...","(0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, ..."
label,1.0,1.0,0.0,0.0,0.0
party,republican,republican,democrat,democrat,democrat


In [10]:
# (6) Split the Raw Features and Labelled DataFrame into a Training DataFrame and a Test DataFrame
train_df, test_df = congressional_voting_features_df.randomSplit([0.75, 0.25], seed=12345)
train_df.count(), test_df.count()

(331, 104)

In [11]:
# (7) Train a Classification Tree Model on the Training DataFrame
decision_tree = DecisionTreeClassifier(featuresCol = 'features', labelCol = label_column)
decision_tree_model = decision_tree.fit(train_df)

In [12]:
# (8) Apply the Trained Classification Tree Model to the Test DataFrame to make predictions
test_decision_tree_predictions_df = decision_tree_model.transform(test_df)
print("TEST DATASET PREDICTIONS AGAINST ACTUAL LABEL: ")
test_decision_tree_predictions_df.select("probability", "rawPrediction", "prediction", label_column, "features").show()

TEST DATASET PREDICTIONS AGAINST ACTUAL LABEL: 
+--------------------+-------------+----------+-----+--------------------+
|         probability|rawPrediction|prediction|label|            features|
+--------------------+-------------+----------+-----+--------------------+
|[0.99459459459459...|  [184.0,1.0]|       0.0|  0.0|(32,[0,2,4,6,8,10...|
|[0.99459459459459...|  [184.0,1.0]|       0.0|  0.0|(32,[0,2,4,6,9,11...|
|[0.99459459459459...|  [184.0,1.0]|       0.0|  0.0|(32,[0,2,4,6,9,11...|
|[0.99459459459459...|  [184.0,1.0]|       0.0|  0.0|(32,[0,2,4,6,9,12...|
|           [0.0,1.0]|   [0.0,10.0]|       1.0|  1.0|(32,[0,2,4,7,8,10...|
|[0.08333333333333...|   [1.0,11.0]|       1.0|  1.0|(32,[0,2,5,7,8,10...|
|           [0.0,1.0]|   [0.0,94.0]|       1.0|  1.0|(32,[0,2,5,7,8,10...|
|[0.08333333333333...|   [1.0,11.0]|       1.0|  1.0|(32,[0,2,5,7,8,10...|
|           [0.0,1.0]|   [0.0,94.0]|       1.0|  1.0|(32,[0,2,5,7,8,10...|
|           [0.0,1.0]|   [0.0,94.0]|       1.0|  1.0

In [13]:
# (9) Evaluate the performance of our Classification Tree Model on the Test DataFrame using Area under a ROC curve
evaluator_roc_area = BinaryClassificationEvaluator(rawPredictionCol = "rawPrediction", labelCol = label_column, metricName = "areaUnderROC")
print("Area Under ROC Curve on Test Data = %g" % evaluator_roc_area.evaluate(test_decision_tree_predictions_df))

Area Under ROC Curve on Test Data = 0.90927


In [14]:
# (10) Visualise the Classification Tree
print(str(decision_tree_model.toDebugString))

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3b4bcc59332a, depth=5, numNodes=31, numClasses=2, numFeatures=32
  If (feature 7 in {0.0})
   If (feature 6 in {1.0})
    Predict: 0.0
   Else (feature 6 not in {1.0})
    If (feature 20 in {0.0})
     If (feature 14 in {1.0})
      Predict: 0.0
     Else (feature 14 not in {1.0})
      If (feature 0 in {0.0})
       Predict: 0.0
      Else (feature 0 not in {0.0})
       Predict: 1.0
    Else (feature 20 not in {0.0})
     Predict: 1.0
  Else (feature 7 not in {0.0})
   If (feature 21 in {1.0})
    If (feature 5 in {0.0})
     If (feature 12 in {0.0})
      Predict: 0.0
     Else (feature 12 not in {0.0})
      Predict: 1.0
    Else (feature 5 not in {0.0})
     If (feature 8 in {0.0})
      Predict: 0.0
     Else (feature 8 not in {0.0})
      If (feature 24 in {0.0})
       Predict: 0.0
      Else (feature 24 not in {0.0})
       Predict: 1.0
   Else (feature 21 not in {1.0})
    If (feature 29 in {1.0})
     If (feature 18

In [15]:
# (11) Train a Random Forest Classifier Model on the Training DataFrame
random_forest = RandomForestClassifier(featuresCol = 'features', labelCol = label_column)
random_forest_model = random_forest.fit(train_df)

In [16]:
# (12) Apply the Trained Random Forest Classifier Model to the Test DataFrame to make predictions
test_random_forest_predictions_df = random_forest_model.transform(test_df)
print("TEST DATASET PREDICTIONS AGAINST ACTUAL LABEL: ")
test_random_forest_predictions_df.select("probability", "rawPrediction", "prediction", label_column, "features").show()

TEST DATASET PREDICTIONS AGAINST ACTUAL LABEL: 
+--------------------+--------------------+----------+-----+--------------------+
|         probability|       rawPrediction|prediction|label|            features|
+--------------------+--------------------+----------+-----+--------------------+
|[0.94136183356285...|[18.8272366712571...|       0.0|  0.0|(32,[0,2,4,6,8,10...|
|[0.99664449498030...|[19.9328898996061...|       0.0|  0.0|(32,[0,2,4,6,9,11...|
|[0.99664449498030...|[19.9328898996061...|       0.0|  0.0|(32,[0,2,4,6,9,11...|
|[0.99664449498030...|[19.9328898996061...|       0.0|  0.0|(32,[0,2,4,6,9,12...|
|[0.17506228048673...|[3.50124560973471...|       1.0|  1.0|(32,[0,2,4,7,8,10...|
|[0.35666020140460...|[7.13320402809206...|       1.0|  1.0|(32,[0,2,5,7,8,10...|
|[0.05936769253220...|[1.18735385064406...|       1.0|  1.0|(32,[0,2,5,7,8,10...|
|[0.06526364701835...|[1.30527294036703...|       1.0|  1.0|(32,[0,2,5,7,8,10...|
|[0.11421157090776...|[2.28423141815523...|       

In [17]:
# (13) Evaluate the performance of our Random Forest Classifier Model on the Test DataFrame using Area under a ROC curve
evaluator_rf_roc_area = BinaryClassificationEvaluator(rawPredictionCol = "rawPrediction", labelCol = label_column, metricName = "areaUnderROC")
print("Area Under ROC Curve on Test Data = %g" % evaluator_rf_roc_area.evaluate(test_random_forest_predictions_df))

Area Under ROC Curve on Test Data = 0.988955


In [None]:
# (14) Stop the Spark Context
spark.stop()