In [None]:
# Display the active SparkSession. `spark` is not created anywhere in this notebook, so it is
# presumably injected by the kernel (e.g. a PySpark kernel/shell) — TODO confirm.
spark

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
import pyspark.sql.functions as F
from pyspark.sql.types import ArrayType, StringType, IntegerType
from pyspark.ml.clustering import KMeans
import chess
from pyspark.ml.linalg import Vectors, VectorUDT
import numpy as np

In [None]:
# Load the full chess-games dataset from the Parquet part files.
# Parquet files carry their own schema; the CSV-only "header" option had no effect on this
# read and has been dropped.
raw = spark.read.parquet("data/chess_dataframe_parquet/*.parquet")

In [None]:
# Bounded subset used for prototyping. Built from `raw` so the read (and its path) is
# declared in exactly one place.
# NOTE: limit() without an ordering does not guarantee which 100000 rows are returned.
raw_small = raw.limit(100000)

In [None]:
# List the column names of the loaded dataset.
raw.columns

In [None]:
def splitter(moves_string):
    """Split a space-separated move string into its individual move tokens.

    Tokens containing a "." (move-number markers such as "1." or "1...") are dropped.

    Args:
        moves_string: Move text such as "1. e4 e5 2. Nf3", or None.

    Returns:
        list of str: The remaining tokens in their original order. An empty list is
        returned when the input is None or contains no tokens.
    """
    if not moves_string:
        return []
    # split() with no separator collapses runs of whitespace, so double spaces or line
    # breaks no longer produce empty-string "moves".
    return [token for token in moves_string.split() if "." not in token]
# Expose splitter as a Spark UDF whose declared return type is array<string>.
sp_udf = F.udf(splitter, ArrayType(StringType()))

In [None]:
# Tokenise each game's move text into `arr` and expose its first ten entries as m0..m9.
# - One select adds all ten columns in a single projection instead of chaining ten
#   withColumn calls (each of which adds another projection to the plan).
# - The frame is marked for caching *before* the first action, so work triggered by
#   show() and by later cells can be reused.
# - The column is referred to as "Moves" consistently.
first_moves = [F.col("arr").getItem(i).alias("m{}".format(i)) for i in range(10)]
temp = (
    raw_small
    .where(F.col("Moves").isNotNull())
    .withColumn("arr", sp_udf(F.col("Moves")))
    .select("*", *first_moves)
    .cache()
)
temp.show()

In [None]:
# Encode the categorical game attributes as numeric label indices, one StringIndexer per
# column, each fitted on and applied to the frame produced by the previous step.
indexed = temp
for source_col in ("Termination", "Result", "Opening"):
    indexer = StringIndexer(inputCol=source_col, outputCol=source_col + "Index")
    indexed = indexer.fit(indexed).transform(indexed)

# Same treatment for the first ten moves; the prints trace progress through the ten fits.
# NOTE(review): m0..m9 may be null for games with fewer than ten entries, and no
# handleInvalid setting is passed here — confirm such rows are always filtered out before
# these columns are evaluated.
for move_no in range(10):
    print("start", move_no)
    move_col = "m{}".format(move_no)
    indexer = StringIndexer(inputCol=move_col, outputCol=move_col + "Index")
    indexed = indexer.fit(indexed).transform(indexed)
    print("end", move_no)

# Integer copies of the two Elo columns under shorter names.
for short_name, source_col in (("BElo", "BlackElo"), ("WElo", "WhiteElo")):
    indexed = indexed.withColumn(short_name, F.col(source_col).cast(IntegerType()))
indexed.take(1)

In [None]:
# Combine the ten move indices (m0Index..m9Index) into a single `features` vector per game.
# The input frame is `indexed`, the output of the StringIndexer cell: the previously used
# name `indexed2` is never assigned in this notebook and raised NameError on a fresh kernel.
move_index_cols = ["m{}Index".format(i) for i in range(10)]
vecAssembler = VectorAssembler(inputCols=move_index_cols, outputCol="features")
df = vecAssembler.transform(indexed)

In [None]:
# Keep only games whose move array has more than ten entries, then mark the result for caching.
has_enough_moves = F.size(F.col("arr")) > 10
df2 = df.filter(has_enough_moves)
df2.cache()

In [None]:
# Fit KMeans on the move-index feature vectors with a fixed seed.
kmeans = KMeans(k=5, seed=1)  # 5 clusters
model = kmeans.fit(df2.select('features'))

In [None]:
# Apply the fitted model to the filtered games and preview the `prediction` column next to
# each game's opening and result.
transformed = model.transform(df2)
preview_cols = ["Opening", "Result", "prediction"]
transformed.select(*preview_cols).show(truncate=False)

In [None]:
# Number of distinct openings among the games whose prediction is 0.
cluster_zero = transformed.filter(F.col("prediction") == 0)
cluster_zero.select(F.countDistinct("Opening")).show()

In [None]:
# Openings whose games all received the same prediction: count the distinct predictions
# per opening and keep the openings where that count is exactly 1.
clusters_per_opening = F.countDistinct("prediction").alias("pred")
single_clus = (
    transformed
    .groupBy("Opening")
    .agg(clusters_per_opening)
    .filter(F.col("pred") == 1)
)

In [None]:
# Inner-join the single-cluster openings back onto the per-game rows by opening name.
joined = single_clus.join(transformed, "Opening", "inner")

In [None]:
# Distinct openings among the joined rows whose prediction is 2 (up to 50 rows, untruncated).
in_cluster_two = joined.filter(F.col("prediction") == 2)
res = in_cluster_two.select("opening").distinct()
res.show(50, truncate=False)

In [None]:
# From the un-indexed games, keep those with more than ten move entries and project the
# opening name together with the first ten moves.
long_games = temp.filter(F.size(F.col("arr")) > 10)
first_ten_moves = ["m{}".format(n) for n in range(10)]
move_opening = long_games.select("opening", *first_ten_moves)

In [None]:
def simulator(moves):
    """Replay SAN moves from the starting position and count attackers on squares 24-39.

    Args:
        moves: Iterable of SAN move strings, applied in order via ``board.push_san``.
            Any error raised by ``push_san`` propagates to the caller.

    Returns:
        tuple: ``(white, black)`` dense vectors, one entry per square in ``range(24, 40)``
        in ascending order, holding the number of attackers of that colour on the square.
        ``True`` selects white and ``False`` black; in python-chess square numbering
        24-39 are a4..h5.
    """
    board = chess.Board()
    for san in moves:
        board.push_san(san)

    def attacker_counts(colour):
        # One attacker count per square for the given colour.
        return [len(list(board.attackers(colour, square))) for square in range(24, 40)]

    return Vectors.dense(attacker_counts(True)), Vectors.dense(attacker_counts(False))

# Register simulator as a Spark UDF whose declared return type is array<vector>.
sim_udf = F.udf(simulator, ArrayType(VectorUDT()))

In [None]:
# Run the board simulation over each game's first ten moves (packed into one array column)
# and keep only the opening name and the resulting `attacks` column.
move_cols = ["m{}".format(n) for n in range(10)]
attacks = (
    move_opening
    .withColumn("attacks", sim_udf(F.array(*move_cols)))
    .select("opening", "attacks")
)

In [None]:
# Split the `attacks` array into its first (white) and second (black) element columns.
white_att = F.col("attacks").getItem(0).alias("white_att")
black_att = F.col("attacks").getItem(1).alias("black_att")
final = attacks.select("opening", white_att, black_att)

In [None]:
# Index the opening names, then assemble the two attack vectors and the opening index
# into a single `features` column.
# NOTE(review): this rebinds `indexed`, a name an earlier cell also assigns; re-running
# earlier cells after this one will see this frame instead.
indexer2 = StringIndexer(inputCol="opening", outputCol="openingIndex")
opening_index_model = indexer2.fit(final)
indexed = opening_index_model.transform(final)

feature_inputs = ["white_att", "black_att", "openingIndex"]
assembler = VectorAssembler(inputCols=feature_inputs, outputCol="features")
assembled = assembler.transform(indexed)

In [None]:
# Elbow search: for k = 2..19, fit KMeans on a fixed-seed 10% sample and record the cost
# computed over the full `assembled` frame. cost[0] and cost[1] stay 0 so the array index
# equals k.
cost = np.zeros(20)
fit_sample = assembled.sample(False, 0.1, seed=42)
for k in range(2, 20):
    kmeans = KMeans(k=k, seed=1, featuresCol="features")
    model = kmeans.fit(fit_sample)
    # NOTE(review): computeCost is deprecated in newer Spark releases — confirm it exists
    # in the target Spark version.
    cost[k] = model.computeCost(assembled)

In [None]:
import matplotlib.pyplot as plt
# Elbow plot: recorded cost against k for k = 2..19.
k_values = range(2, 20)
fig, ax = plt.subplots(figsize=(20, 16))
ax.plot(k_values, cost[k_values.start:k_values.stop])
ax.set_xlabel('k')
ax.set_ylabel('cost')

In [None]:
# Fit the final KMeans model on the assembled attack/opening features with a fixed seed.
kmeans = KMeans(k=8, seed=1)  # 8 clusters
model = kmeans.fit(assembled.select("features"))

In [107]:
# Apply the fitted model to the assembled frame and preview each game's opening next to
# its `prediction`.
transformed = model.transform(assembled)
transformed.select(F.col("Opening"), F.col("prediction")).show(truncate=False)

+----------------------------------------------------------+----------+
|Opening                                                   |prediction|
+----------------------------------------------------------+----------+
|Nimzowitsch Defense: Kennedy Variation, Paulsen Attack    |6         |
|Queen's Indian Defense: Fianchetto Traditional            |6         |
|Sicilian Defense: Kan Variation, Knight Variation         |5         |
|Colle System: Traditional Colle                           |2         |
|Modern Defense: Standard Line                             |5         |
|Gedult's Opening                                          |0         |
|Four Knights Game: Spanish Variation, Rubinstein Variation|4         |
|Indian Game: Anti-Nimzo-Indian                            |7         |
|Italian Game: Classical Variation, Giuoco Pianissimo      |5         |
|Sicilian Defense: Canal-Sokolsky Attack                   |5         |
|Catalan Opening: Closed Variation                         |5   

In [108]:
# Openings whose games all share one prediction, joined back onto the per-game rows;
# then the distinct openings among the rows whose prediction is 2 (up to 50, untruncated).
single_clus = (
    transformed
    .groupBy("Opening")
    .agg(F.countDistinct("prediction").alias("pred"))
    .filter(F.col("pred") == 1)
)
joined = single_clus.join(transformed, "Opening", "inner")
res = joined.filter(F.col("prediction") == 2).select("opening").distinct()
res.show(50, truncate=False)

+-------------------------------------------------------------------------------------+
|opening                                                                              |
+-------------------------------------------------------------------------------------+
|Queen's Gambit Accepted: Classical Defense, Steinitz Variation, Development Variation|
|Caro-Kann Defense: Accelerated Panov Attack, Open Variation                          |
|Alekhine Defense: Modern Variation, Larsen-Haakert Variation                         |
|King's Indian Defense: Averbakh Variation, Flexible Defense                          |
|Grünfeld Defense: Three Knights Variation, Burille Variation, Reversed Tarrasch      |
|Sicilian Defense: Kupreichik Variation                                               |
|Queen's Gambit Refused: Albin Countergambit, Modern Line                             |
|Queen's Gambit Declined: Orthodox Defense, Rubinstein Variation                      |
|Sicilian Defense: Scheveningen 

In [109]:
# Distinct openings among the joined rows whose prediction is 1 (up to 50 rows, untruncated).
in_cluster_one = joined.filter(F.col("prediction") == 1)
res = in_cluster_one.select("opening").distinct()
res.show(50, truncate=False)

+------------------------------------------------------------------------+
|opening                                                                 |
+------------------------------------------------------------------------+
|Tarrasch Defense: Rubinstein System                                     |
|Evans Gambit, Morphy Attack                                             |
|Modern Defense: Randspringer Variation                                  |
|Evans Gambit, Leonhardt Countergambit                                   |
|Ruy Lopez: Classical Variation, Modern Main Line                        |
|King's Gambit, Falkbeer Countergambit, Blackburne Attack                |
|Queen's Gambit Declined: Semi-Tarrasch, San Sebastian Variation         |
|Ruy Lopez: Berlin Defense, Tarrasch Trap                                |
|Blackmar-Diemer Gambit Declined, Elbert Countergambit                   |
|Queen's Gambit Declined: Lasker Defense, Main Line                      |
|Sicilian Defense: O'Kell