In [1]:
from pyspark.sql import SparkSession

from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import *
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer


spark = SparkSession.builder.appName("PySparkFashionMNIST").getOrCreate()

# Load test data
trainDF =  spark.read.csv("hdfs://namenode:9000/data/TicTacToe.csv", header = True)
trainDF = trainDF.select([col(column).cast(IntegerType()).alias(column) for column in trainDF.columns])
trainDF.show(10)
label_index = StringIndexer(inputCol="Condition", outputCol="Condition_index")
trainDF = label_index.fit(trainDF).transform(trainDF)
trainDF.show(10)




+---+---+-----+----+----+---+-----+-----+----+---------+
|one|two|three|four|five|si1|seven|eight|nine|Condition|
+---+---+-----+----+----+---+-----+-----+----+---------+
|  1|  1|    1|   1|   0|  0|    1|    0|   0|        1|
|  1|  1|    1|   1|   0|  0|    0|    1|   0|        1|
|  1|  1|    1|   1|   0|  0|    0|    0|   1|        1|
|  1|  1|    1|   1|   0|  0|    0|    3|   3|        1|
|  1|  1|    1|   1|   0|  0|    3|    0|   3|        1|
|  1|  1|    1|   1|   0|  0|    3|    3|   0|        1|
|  1|  1|    1|   1|   0|  3|    0|    0|   3|        1|
|  1|  1|    1|   1|   0|  3|    0|    3|   0|        1|
|  1|  1|    1|   1|   0|  3|    3|    0|   0|        1|
|  1|  1|    1|   1|   3|  0|    0|    0|   3|        1|
+---+---+-----+----+----+---+-----+-----+----+---------+
only showing top 10 rows

+---+---+-----+----+----+---+-----+-----+----+---------+---------------+
|one|two|three|four|five|si1|seven|eight|nine|Condition|Condition_index|
+---+---+-----+----+----+---+-

In [2]:
selected_cols = [col_name for col_name in trainDF.columns if col_name != "Condition"]

# # Создание вектора признаков
assembler = VectorAssembler(inputCols=selected_cols, outputCol="features")
trainDF_vector = assembler.transform(trainDF)

trainDF_vector = trainDF_vector.select("Condition_index", "features")
trainDF_vector.show(10)

+---------------+--------------------+
|Condition_index|            features|
+---------------+--------------------+
|            0.0|(10,[0,1,2,3,6],[...|
|            0.0|(10,[0,1,2,3,7],[...|
|            0.0|(10,[0,1,2,3,8],[...|
|            0.0|[1.0,1.0,1.0,1.0,...|
|            0.0|[1.0,1.0,1.0,1.0,...|
|            0.0|[1.0,1.0,1.0,1.0,...|
|            0.0|[1.0,1.0,1.0,1.0,...|
|            0.0|[1.0,1.0,1.0,1.0,...|
|            0.0|[1.0,1.0,1.0,1.0,...|
|            0.0|[1.0,1.0,1.0,1.0,...|
+---------------+--------------------+
only showing top 10 rows



In [3]:
(training_data, test_data) = trainDF_vector.randomSplit([0.8, 0.2],seed = 42)

evaluator = MulticlassClassificationEvaluator(labelCol="Condition_index", predictionCol="prediction", metricName="accuracy")

from pyspark.ml.classification import GBTClassifier
gbt = GBTClassifier(labelCol="Condition_index", featuresCol="features")
gbt_model = gbt.fit(training_data)
gbt_prediction = gbt_model.transform(test_data)

gbt_prediction.select("prediction", "Condition_index", "features").show(5)
gbt_accuracy = evaluator.evaluate(gbt_prediction)
print("Gradient-boosted tree classifier [Accuracy] = %g"% (gbt_accuracy))
print("Gradient-boosted tree classifier [Error] = %g " % (1.0 - gbt_accuracy))

+----------+---------------+--------------------+
|prediction|Condition_index|            features|
+----------+---------------+--------------------+
|       0.0|            0.0|(10,[0,1,2,3,8],[...|
|       0.0|            0.0|(10,[0,1,2,5,6],[...|
|       0.0|            0.0|(10,[0,1,2,5,8],[...|
|       0.0|            0.0|(10,[0,1,4,5,8],[...|
|       0.0|            0.0|(10,[0,2,4,5,6],[...|
+----------+---------------+--------------------+
only showing top 5 rows

Gradient-boosted tree classifier [Accuracy] = 1
Gradient-boosted tree classifier [Error] = 0 


In [4]:
turnsDF =  spark.read.csv("hdfs://namenode:9000/data/turns.csv", header = True)
turnsDF = trainDF.select([col(column).cast(IntegerType()).alias(column) for column in turnsDF.columns])
label_index = StringIndexer(inputCol="Condition", outputCol="Condition_index")
turnsDF = label_index.fit(turnsDF).transform(turnsDF)

selected_cols = [col_name for col_name in turnsDF.columns if col_name != "Condition"]

# # Создание вектора признаков
assembler = VectorAssembler(inputCols=selected_cols, outputCol="features")
turnsDF_vector = assembler.transform(turnsDF)

turnsDF_vector = turnsDF_vector.select("Condition_index", "features")
turnsDF_vector.show(10)

prediction = gbt_model.transform(turnsDF_vector)
prediction.select("Condition_index", "prediction").show(10)

prediction_accuracy = evaluator.evaluate(prediction)
print("Gradient-boosted tree classifier [Accuracy] = %g"% (prediction_accuracy))
print("Gradient-boosted tree classifier [Error] = %g " % (1.0 - prediction_accuracy))

+---------------+--------------------+
|Condition_index|            features|
+---------------+--------------------+
|            0.0|(10,[0,1,2,3,6],[...|
|            0.0|(10,[0,1,2,3,7],[...|
|            0.0|(10,[0,1,2,3,8],[...|
|            0.0|[1.0,1.0,1.0,1.0,...|
|            0.0|[1.0,1.0,1.0,1.0,...|
|            0.0|[1.0,1.0,1.0,1.0,...|
|            0.0|[1.0,1.0,1.0,1.0,...|
|            0.0|[1.0,1.0,1.0,1.0,...|
|            0.0|[1.0,1.0,1.0,1.0,...|
|            0.0|[1.0,1.0,1.0,1.0,...|
+---------------+--------------------+
only showing top 10 rows

+---------------+----------+
|Condition_index|prediction|
+---------------+----------+
|            0.0|       0.0|
|            0.0|       0.0|
|            0.0|       0.0|
|            0.0|       0.0|
|            0.0|       0.0|
|            0.0|       0.0|
|            0.0|       0.0|
|            0.0|       0.0|
|            0.0|       0.0|
|            0.0|       0.0|
+---------------+----------+
only showing top 10 ro