### Case study: using a Random Forest in Spark to predict how a chess game ended (its victory status).

#### First, we import the necessary packages and load the CSV containing the dataset, courtesy of Mitchell J (https://mitchelljolly.com/)

In [0]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import (VectorAssembler, VectorIndexer, 
                                StringIndexer, OneHotEncoder)
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
# Obtain the Spark session for this notebook (getOrCreate), then read the
# games CSV, treating the first row as a header and inferring column types.
spark = (
    SparkSession.builder
    .appName('chessApp')
    .getOrCreate()
)
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/FileStore/tables/games.csv')
)

#### Now, we can get information about the dataset

In [0]:
# Preview the raw data: the first 20 rows, with long cell values truncated
# (these are the defaults of `show`, spelled out explicitly).
data.show(n=20, truncate=True)

In [0]:
# Print the column names and data types of the loaded DataFrame.
data.printSchema()

#### We keep only the columns that interest us

In [0]:
# Columns kept for modelling; the remaining columns of `data` are left out.
selected_columns = [
    'rated',
    'turns',
    'victory_status',
    'winner',
    'increment_code',
    'white_rating',
    'black_rating',
    'opening_name',
]
data_selected = data.select(*selected_columns)

In [0]:
# Preview the reduced DataFrame (first 20 rows, truncated cells: the defaults).
data_selected.show(n=20, truncate=True)

#### Next, we encode the categorical variables (string indexing, followed by one-hot encoding for the feature columns).

In [0]:
# Map each distinct `victory_status` string to a numeric index, stored in a
# new `victory_status_Index` column.
victory_Indexer = StringIndexer(
    inputCol='victory_status',
    outputCol='victory_status_Index',
)
victory_Indexer_model = victory_Indexer.fit(data_selected)
data_selected = victory_Indexer_model.transform(data_selected)

In [0]:
# `winner`: string -> numeric index -> one-hot vector.
winner_Indexer = StringIndexer(
    inputCol='winner',
    outputCol='winner_Index',
)
winner_Vec = OneHotEncoder(
    inputCol='winner_Index',
    outputCol='winner_Vec',
)

# Index first; the encoder is then fitted on the already-indexed frame.
winner_indexed = winner_Indexer.fit(data_selected).transform(data_selected)
data_selected = winner_Vec.fit(winner_indexed).transform(winner_indexed)

In [0]:
# `opening_name`: string -> numeric index -> one-hot vector.
opening_Indexer = StringIndexer(
    inputCol='opening_name',
    outputCol='opening_name_Index',
)
opening_Vec = OneHotEncoder(
    inputCol='opening_name_Index',
    outputCol='opening_name_Vec',
)

# Index first; the encoder is then fitted on the already-indexed frame.
opening_indexed = opening_Indexer.fit(data_selected).transform(data_selected)
data_selected = opening_Vec.fit(opening_indexed).transform(opening_indexed)

In [0]:
# `increment_code`: string -> numeric index -> one-hot vector.
increment_Indexer = StringIndexer(
    inputCol='increment_code',
    outputCol='increment_code_Index',
)
increment_Vec = OneHotEncoder(
    inputCol='increment_code_Index',
    outputCol='increment_code_Vec',
)

# Index first; the encoder is then fitted on the already-indexed frame.
increment_indexed = increment_Indexer.fit(data_selected).transform(data_selected)
data_selected = increment_Vec.fit(increment_indexed).transform(increment_indexed)

#### Applying the VectorAssembler

In [0]:
# Display the column names now present in `data_selected` (the original
# columns plus the index / one-hot columns added by the encoders).
data_selected.columns

In [0]:
# Input columns combined into the single `features` vector column.
# The categorical inputs are used in their one-hot encoded (`*_Vec`) form.
feature_columns = [
    'rated',
    'turns',
    'winner_Vec',
    'increment_code_Vec',
    'white_rating',
    'black_rating',
    'opening_name_Vec',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [0]:
# Append the assembled `features` vector column to the encoded data.
final_data = assembler.transform(dataset=data_selected)

In [0]:
# Preview only the two columns the classifier consumes: label and features.
label_and_features = final_data.select('victory_status_Index', 'features')
label_and_features.show()

#### Specifying the parameters of the Random Forest

In [0]:
# Random forest that predicts the indexed victory status from the assembled
# feature vector.  The seed is fixed so that the trained forest is
# reproducible; the default seed is not guaranteed to be the same from one
# kernel session to the next.
random_forest_classifier = RandomForestClassifier(
    labelCol='victory_status_Index',
    featuresCol='features',
    seed=42,
)

#### Splitting into train and test data

In [0]:
# 70 % / 30 % train/test split.  The seed pins the random row assignment so
# that re-running the notebook yields the same split and a comparable
# accuracy figure; without it every run samples a different split.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Preview the training split (first 20 rows, truncated cells: the defaults).
train_data.show(n=20, truncate=True)

#### Training and testing the model

In [0]:
# Train the random forest on the training split only.
fit_model = random_forest_classifier.fit(dataset=train_data)

In [0]:
# Apply the trained model to the held-out test split.
results = fit_model.transform(dataset=test_data)

#### Finally, we can get the accuracy of our model and the importance of each variable

In [0]:
# Evaluator that compares the model's predictions against the indexed
# victory status and reports the `accuracy` metric.
my_eval = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='victory_status_Index',
)

In [0]:
# Compute the metric configured on `my_eval` over the test-set results and
# print it.
results_evaluated = my_eval.evaluate(dataset=results)
print(results_evaluated)