In [1]:
!pip install pyspark
!pip install findspark

# Import SparkSession
from pyspark.sql import SparkSession

# Create a Spark Session
spark = SparkSession.builder.master("local[*]").getOrCreate()

# Check Spark Session Information
spark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824025 sha256=2b72813c4c151704b589e4986a9b14c655e3e3ee4df935cac76726e09756f1aa
  Stored in directory: /root/.cache/pip/wheels/6c/e3/9b/0525ce8a69478916513509d43693511463c6468db0de237c86
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [2]:
# (1) Import the required Python dependencies
import findspark
# findspark.init() runs before the pyspark imports that follow.
# NOTE(review): presumably this locates the Spark installation and makes
# pyspark importable; pyspark was already imported in the first cell, so
# confirm whether this call is still needed.
findspark.init()
# NOTE(review): SparkContext and SparkConf are not referenced in the visible cells.
from pyspark import SparkContext, SparkConf
# SQLContext wraps the existing session and is used to read the CSV dataset.
from pyspark.sql import SQLContext
# VectorAssembler combines the individual feature columns into one vector column.
from pyspark.ml.feature import VectorAssembler
# Estimator for the feed-forward neural network trained below.
from pyspark.ml.classification import MultilayerPerceptronClassifier
# Evaluator used for accuracy / weighted precision / weighted recall.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [3]:
# (2) Instantiate a SQLContext bound to the existing SparkSession
# SQLContext's first positional parameter is a SparkContext, not a
# SparkSession, so pass `spark.sparkContext` there and hand the session over
# explicitly via `sparkSession`. Supplying the session also avoids the
# deprecation warning SQLContext emits when it has to look one up itself.
# NOTE: SQLContext is retained in Spark 3.x for backward compatibility only;
# `spark.read` / `spark.sql` expose the same functionality directly.
sqlContext = SQLContext(spark.sparkContext, sparkSession=spark)



In [5]:
# Mount Google Drive at /content/drive (Colab-only API).
# The dataset loaded later is read from a path underneath this mount point,
# so this cell must run before the data-loading cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# (3) Load the Letter Recognition Dataset (CSV with a header row; column types are inferred)
# (3.1) Assemble the 16 feature columns into a single 'features' vector column
# (3.2) Rename the 'lettr' column to 'label' which is a number representing one of the 26 characters in the English alphabet

# The CSV data source is built into Spark, so the reader is called directly on
# the session instead of going through the legacy 'com.databricks.spark.csv'
# package alias.
letter_recognition_path = '/content/drive/MyDrive/letter-recognition.csv'
letter_recognition_df = spark.read.csv(letter_recognition_path, header=True, inferSchema=True)

feature_columns = ['x-box','y-box','width','high','onpix','x-bar','y-bar','x2bar','y2bar','xybar','x2ybr','xy2br','x-ege','xegvy','y-ege','yegvx']
vector_assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Keep only the two columns the classifier needs: the label and the assembled vector.
vectorised_df = (
    vector_assembler.transform(letter_recognition_df)
    .withColumnRenamed('lettr', 'label')
    .select('label', 'features')
)

# Preview the first 10 rows without truncating the feature vectors.
vectorised_df.show(10, truncate=False)

+-----+----------------------------------------------------------------------+
|label|features                                                              |
+-----+----------------------------------------------------------------------+
|19   |[2.0,8.0,3.0,5.0,1.0,8.0,13.0,0.0,6.0,6.0,10.0,8.0,0.0,8.0,0.0,8.0]   |
|8    |[5.0,12.0,3.0,7.0,2.0,10.0,5.0,5.0,4.0,13.0,3.0,9.0,2.0,8.0,4.0,10.0] |
|3    |[4.0,11.0,6.0,8.0,6.0,10.0,6.0,2.0,6.0,10.0,3.0,7.0,3.0,7.0,3.0,9.0]  |
|13   |[7.0,11.0,6.0,6.0,3.0,5.0,9.0,4.0,6.0,4.0,4.0,10.0,6.0,10.0,2.0,8.0]  |
|6    |[2.0,1.0,3.0,1.0,1.0,8.0,6.0,6.0,6.0,6.0,5.0,9.0,1.0,7.0,5.0,10.0]    |
|18   |[4.0,11.0,5.0,8.0,3.0,8.0,8.0,6.0,9.0,5.0,6.0,6.0,0.0,8.0,9.0,7.0]    |
|1    |[4.0,2.0,5.0,4.0,4.0,8.0,7.0,6.0,6.0,7.0,6.0,6.0,2.0,8.0,7.0,10.0]    |
|0    |[1.0,1.0,3.0,2.0,1.0,8.0,2.0,2.0,2.0,8.0,2.0,8.0,1.0,6.0,2.0,7.0]     |
|9    |[2.0,2.0,4.0,4.0,2.0,10.0,6.0,2.0,6.0,12.0,4.0,8.0,1.0,6.0,1.0,7.0]   |
|12   |[11.0,15.0,13.0,9.0,7.0,13.0,2.0,6.0,2.0,12.0

In [7]:
# (4) Randomly partition the vectorised rows into a Training DataFrame (weight 0.75)
#     and a Test DataFrame (weight 0.25); the seed is fixed at 12345
train_df, test_df = vectorised_df.randomSplit(weights=[0.75, 0.25], seed=12345)

# Row counts of the two partitions, displayed as the cell output
(train_df.count(), test_df.count())

(14928, 5072)

In [8]:
# (5) Specify the layers for our Neural Network
# (5.1) The 1st element is the size of the Input Layer. It is derived from feature_columns
#       (16 features) so the network input width cannot drift from the assembled vector
# (5.2) The next elements are the sizes of the intermediate Hidden Layers, in our case 8 and 4
# (5.3) The final element is the size of the Output Layer. In our case, we have 26 classes
layers = [len(feature_columns), 8, 4, 26]

In [9]:
# (6) Configure a Multilayer Perceptron Classifier with the layer sizes defined above
#     (input -> hidden -> output), then fit it on the Training DataFrame
multilayer_perceptron_classifier = MultilayerPerceptronClassifier(
    layers=layers,
    maxIter=100,
    blockSize=128,
    seed=1234,
)
multilayer_perceptron_classifier_model = multilayer_perceptron_classifier.fit(train_df)

In [10]:
# (7) Score the Test DataFrame with the trained model, then display the actual
#     label next to the model's class probabilities and predicted class
test_predictions_df = multilayer_perceptron_classifier_model.transform(test_df)

print("TEST DATASET PREDICTIONS AGAINST ACTUAL LABEL: ")
(
    test_predictions_df
    .select("label", "features", "probability", "prediction")
    .show()
)

TEST DATASET PREDICTIONS AGAINST ACTUAL LABEL: 
+-----+--------------------+--------------------+----------+
|label|            features|         probability|prediction|
+-----+--------------------+--------------------+----------+
|    0|[1.0,0.0,2.0,0.0,...|[0.15619111535018...|       0.0|
|    0|[1.0,1.0,2.0,1.0,...|[0.15893128477663...|       0.0|
|    0|[1.0,1.0,2.0,1.0,...|[0.15959651436080...|       0.0|
|    0|[1.0,1.0,2.0,1.0,...|[0.15981651802448...|       0.0|
|    0|[1.0,1.0,3.0,2.0,...|[0.16029609377926...|       0.0|
|    0|[1.0,3.0,2.0,2.0,...|[0.16067052540085...|       0.0|
|    0|[1.0,3.0,2.0,2.0,...|[0.16033461316887...|       0.0|
|    0|[2.0,1.0,3.0,2.0,...|[0.16029077174310...|       0.0|
|    0|[2.0,1.0,4.0,2.0,...|[0.16022812484398...|       0.0|
|    0|[2.0,2.0,4.0,4.0,...|[0.16029862050764...|       0.0|
|    0|[2.0,3.0,3.0,1.0,...|[0.16028610094273...|       0.0|
|    0|[2.0,3.0,3.0,1.0,...|[0.16030575431848...|       0.0|
|    0|[2.0,3.0,4.0,2.0,...|[0.160290

In [11]:
# (8) Score the test predictions: accuracy, weighted precision and weighted recall
prediction_and_labels = test_predictions_df.select("prediction", "label")

# One evaluator per metric; accuracy_evaluator is reused later for the larger model
accuracy_evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
precision_evaluator = MulticlassClassificationEvaluator(metricName="weightedPrecision")
recall_evaluator = MulticlassClassificationEvaluator(metricName="weightedRecall")

# Evaluate and report each metric in turn (accuracy, then precision, then recall)
for report_line, evaluator in (
    ("Accuracy on Test Dataset = %g", accuracy_evaluator),
    ("Precision on Test Dataset = %g", precision_evaluator),
    ("Recall on Test Dataset = %g", recall_evaluator),
):
    print(report_line % evaluator.evaluate(prediction_and_labels))

Accuracy on Test Dataset = 0.250197
Precision on Test Dataset = 0.213911
Recall on Test Dataset = 0.250197


In [12]:
# (9) To improve the accuracy of our model, let us increase the size of the Hidden Layers
#     (8 and 4 -> 16 and 12).
# NOTE: maxIter is also raised from 100 to 400 in this run, so the accuracy printed
# below reflects both changes together, not the wider hidden layers alone.
# The input width is derived from feature_columns so it stays in step with the
# assembled feature vector.
new_layers = [len(feature_columns), 16, 12, 26]
new_multilayer_perceptron_classifier = MultilayerPerceptronClassifier(
    layers=new_layers,
    maxIter=400,
    blockSize=128,
    seed=1234,
)
new_multilayer_perceptron_classifier_model = new_multilayer_perceptron_classifier.fit(train_df)

# Score the same Test DataFrame and reuse the accuracy evaluator from step (8)
new_test_predictions_df = new_multilayer_perceptron_classifier_model.transform(test_df)
print("New Accuracy on Test Dataset = %g" % accuracy_evaluator.evaluate(new_test_predictions_df.select("prediction", "label")))

New Accuracy on Test Dataset = 0.640773


In [None]:
# (10) Stop the Spark Context
# Calls stop() on the `spark` session created in the first cell. Run this last:
# the earlier cells all depend on that session.
spark.stop()