In [None]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import col, when

# Load the January 2023 dump of rated Lichess games (Parquet) and print its
# column names and types.
chess1 = spark.read.parquet(
    "/data/doina/Lichess/lichess_db_standard_rated_2023-01.parquet"
)

chess1.printSchema()
'''
root
 |-- Black: string (nullable = true)
 |-- BlackElo: integer (nullable = true)
 |-- BlackRatingDiff: string (nullable = true)
 |-- ECO: string (nullable = true)
 |-- Event: string (nullable = true)
 |-- Moves: string (nullable = true)
 |-- Opening: string (nullable = true)
 |-- Result: string (nullable = true)
 |-- Site: string (nullable = true)
 |-- Termination: string (nullable = true)
 |-- TimeControl: string (nullable = true)
 |-- UTCDate: string (nullable = true)
 |-- UTCTime: string (nullable = true)
 |-- White: string (nullable = true)
 |-- WhiteElo: integer (nullable = true)
 |-- WhiteRatingDiff: string (nullable = true)
'''


# Per-game blunder count: splitting the move text on the literal "??" marker
# yields (number of markers + 1) pieces, hence the "- 1".
chessBlunders = chess1.select(
    "Moves",
    "Site",
    "TimeControl",
    "BlackElo",
    "WhiteElo",
    (F.size(F.split(F.col("Moves"), r"\?\?")) - 1).alias("BlunderCount"),
)

# Keep only games with at least one blunder, most blunders first.
# Fixed: the filter used to reference "blunder_count", which is not a column of
# chessBlunders (the alias above is "BlunderCount"), so Spark could not resolve it.
chessBlundersMin1 = (
    chessBlunders
    .filter(F.col("BlunderCount") > 0)
    .orderBy(F.col("BlunderCount").desc())
)
chessBlundersMin1.show(10)

'''
+--------------------+--------------------+-----------+--------+--------+------------+
|               Moves|                Site|TimeControl|BlackElo|WhiteElo|BlunderCount|
+--------------------+--------------------+-----------+--------+--------+------------+
|1. a4 { [%eval -0...|https://lichess.o...|       15+0|    2030|    1957|         267|
|1. b4 { [%eval 0....|https://lichess.o...|       15+0|    1967|    1935|         242|
|1. b3 { [%eval 0....|https://lichess.o...|       15+0|    1972|    1929|         240|
|1. a4 { [%eval -0...|https://lichess.o...|       15+0|    1946|    2039|         227|
|1. d4 { [%eval 0....|https://lichess.o...|       15+0|    1946|    2038|         211|
|1. a4 { [%eval -0...|https://lichess.o...|       15+0|    1948|    1956|         204|
|1. a4 { [%eval -0...|https://lichess.o...|       15+0|    1959|    2030|         196|
|1. d4 { [%eval 0....|https://lichess.o...|       15+0|    1959|    2029|         193|
|1. a3 { [%eval 0....|https://lichess.o...|       15+0|    2032|    1956|         183|
|1. g3 { [%eval 0....|https://lichess.o...|       15+0|    2036|    1949|         168|
+--------------------+--------------------+-----------+--------+--------+------------+

'''


'''
June 2023
chessBlundersMin2.show(10)
+--------------------+--------------------+-----------+--------+--------+------------+
|               Moves|                Site|TimeControl|BlackElo|WhiteElo|BlunderCount|
+--------------------+--------------------+-----------+--------+--------+------------+
|1. Nh3?! { [%eval...|https://lichess.o...|      180+2|    2323|    2398|         199|
|1. e4 { [%eval 0....|https://lichess.o...|      180+3|    1356|    1601|         156|
|1. e4 { [%eval 0....|https://lichess.o...|      90+45|    1004|    1035|         136|
|1. d4 { [%eval 0....|https://lichess.o...|     300+14|    2044|    1885|         134|
|1. e3 { [%eval 0....|https://lichess.o...|      300+0|     893|     897|         129|
|1. e4 { [%eval 0....|https://lichess.o...|      300+4|    1682|    1500|         127|
|1. d3 { [%eval 0....|https://lichess.o...|       30+0|     942|     988|         126|
|1. h4?! { [%eval ...|https://lichess.o...|      180+0|    1061|    1070|         114|
|1. d4 { [%eval 0....|https://lichess.o...|          -|    1500|    1500|         100|
|1. d4 { [%eval 0....|https://lichess.o...|     1500+0|     911|    1041|          98|
+--------------------+--------------------+-----------+--------+--------+------------+

'''



In [None]:


from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import col, when

# Load the January 2023 dump of rated Lichess games.
chess23_01 = spark.read.parquet("/data/doina/Lichess/lichess_db_standard_rated_2023-01.parquet")

# Fixed: the original selected from `chess23_1`, a name that is never defined
# (NameError); the DataFrame loaded above is `chess23_01`.
chessMain = (
    chess23_01
    .select("Moves", "Site", "TimeControl", "WhiteElo", "BlackElo")
    # WhiteMoves: strip every "N... <black move>" segment, up to the next "N. " or end of text.
    .withColumn(
        "WhiteMoves",
        F.trim(F.regexp_replace(F.col("Moves"), r"\d+\.{3}.*?(?=\d+\.\s|\Z)", "")),
    )
    # BlackMoves: strip every "N. <white move>" segment, up to the next "N... " or end of text.
    .withColumn(
        "BlackMoves",
        F.trim(F.regexp_replace(F.col("Moves"), r"\d+\.\s+.*?(?=\d+\.{3}\s|\Z)", "")),
    )
)

# Count "??" markers per colour and overall: splitting on the marker yields
# (number of markers + 1) pieces, hence the "- 1".
chessBlunders = chessMain.select(
    "Moves",
    "Site",
    "TimeControl",
    "WhiteElo",
    "WhiteMoves",
    (F.size(F.split(F.col("WhiteMoves"), r"\?\?")) - 1).alias("WhiteBlunderCount"),
    "BlackElo",
    "BlackMoves",
    (F.size(F.split(F.col("BlackMoves"), r"\?\?")) - 1).alias("BlackBlunderCount"),
    (F.size(F.split(F.col("Moves"), r"\?\?")) - 1).alias("TotalBlunderCount"),
)

# Games with at least one blunder, most blunders first.
chessBlundersOrdered = (
    chessBlunders
    .filter(F.col("TotalBlunderCount") > 0)
    .orderBy(F.col("TotalBlunderCount").desc())
)

chessBlundersOrdered.show(10)

'''
+--------------------+--------------------+-----------+--------+--------------------+-----------------+--------+--------------------+-----------------+-----------------+
|               Moves|                Site|TimeControl|WhiteElo|          WhiteMoves|WhiteBlunderCount|BlackElo|          BlackMoves|BlackBlunderCount|TotalBlunderCount|
+--------------------+--------------------+-----------+--------+--------------------+-----------------+--------+--------------------+-----------------+-----------------+
|1. a4 { [%eval -0...|https://lichess.o...|       15+0|    1957|1. a4 { [%eval -0...|              135|    2030|1... d5 { [%eval ...|              132|              267|
|1. b4 { [%eval 0....|https://lichess.o...|       15+0|    1935|1. b4 { [%eval 0....|              122|    1967|1... h5 { [%eval ...|              120|              242|
|1. b3 { [%eval 0....|https://lichess.o...|       15+0|    1929|1. b3 { [%eval 0....|              118|    1972|1... d5 { [%eval ...|              122|              240|
|1. a4 { [%eval -0...|https://lichess.o...|       15+0|    2039|1. a4 { [%eval -0...|              113|    1946|1... h6 { [%eval ...|              114|              227|
|1. d4 { [%eval 0....|https://lichess.o...|       15+0|    2038|1. d4 { [%eval 0....|              109|    1946|1... h5?! { [%eva...|              102|              211|
|1. a4 { [%eval -0...|https://lichess.o...|       15+0|    1956|1. a4 { [%eval -0...|              102|    1948|1... g5? { [%eval...|              102|              204|
|1. a4 { [%eval -0...|https://lichess.o...|       15+0|    2030|1. a4 { [%eval -0...|               95|    1959|1... g5? { [%eval...|              101|              196|
|1. d4 { [%eval 0....|https://lichess.o...|       15+0|    2029|1. d4 { [%eval 0....|               98|    1959|1... h5?! { [%eva...|               95|              193|
|1. a3 { [%eval 0....|https://lichess.o...|       15+0|    1956|1. a3 { [%eval 0....|               93|    2032|1... d5 { [%eval ...|               90|              183|
|1. g3 { [%eval 0....|https://lichess.o...|       15+0|    1949|1. g3 { [%eval 0....|               85|    2036|1... d5 { [%eval ...|               83|              168|
+--------------------+--------------------+-----------+--------+--------------------+-----------------+--------+--------------------+-----------------+-----------------+

'''


# Load the April 2023 dump of rated Lichess games.
chess23_04 = spark.read.parquet("/data/doina/Lichess/lichess_db_standard_rated_2023-04.parquet")

# Same per-colour blunder pipeline as used for the January data. The original
# cell only said "#same things", so chessBlundersOrdered4 was never defined and
# the show() call below raised a NameError.
chessMain4 = (
    chess23_04
    .select("Moves", "Site", "TimeControl", "WhiteElo", "BlackElo")
    # WhiteMoves: strip every "N... <black move>" segment.
    .withColumn(
        "WhiteMoves",
        F.trim(F.regexp_replace(F.col("Moves"), r"\d+\.{3}.*?(?=\d+\.\s|\Z)", "")),
    )
    # BlackMoves: strip every "N. <white move>" segment.
    .withColumn(
        "BlackMoves",
        F.trim(F.regexp_replace(F.col("Moves"), r"\d+\.\s+.*?(?=\d+\.{3}\s|\Z)", "")),
    )
)

# Splitting on "??" yields (number of markers + 1) pieces, hence the "- 1".
chessBlunders4 = chessMain4.select(
    "Moves",
    "Site",
    "TimeControl",
    "WhiteElo",
    "WhiteMoves",
    (F.size(F.split(F.col("WhiteMoves"), r"\?\?")) - 1).alias("WhiteBlunderCount"),
    "BlackElo",
    "BlackMoves",
    (F.size(F.split(F.col("BlackMoves"), r"\?\?")) - 1).alias("BlackBlunderCount"),
    (F.size(F.split(F.col("Moves"), r"\?\?")) - 1).alias("TotalBlunderCount"),
)

# Games with at least one blunder, most blunders first.
chessBlundersOrdered4 = (
    chessBlunders4
    .filter(F.col("TotalBlunderCount") > 0)
    .orderBy(F.col("TotalBlunderCount").desc())
)

chessBlundersOrdered4.show(10)

'''
+--------------------+--------------------+-----------+--------+--------------------+-----------------+--------+--------------------+-----------------+-----------------+
|               Moves|                Site|TimeControl|WhiteElo|          WhiteMoves|WhiteBlunderCount|BlackElo|          BlackMoves|BlackBlunderCount|TotalBlunderCount|
+--------------------+--------------------+-----------+--------+--------------------+-----------------+--------+--------------------+-----------------+-----------------+
|1. f3?! { [%eval ...|https://lichess.o...|  10800+180|    1500|1. f3?! { [%eval ...|               96|    1500|1... e5 { [%eval ...|               97|              193|
|1. e4 { [%eval 0....|https://lichess.o...|          -|    1500|1. e4 { [%eval 0....|               82|    1286|1... e5 { [%eval ...|               77|              159|
|1. e4 { [%eval 0....|https://lichess.o...|      600+0|    1568|1. e4 { [%eval 0....|               74|    1247|1... e5 { [%eval ...|               72|              146|
|1. e4 { [%eval 0....|https://lichess.o...|          -|    1536|1. e4 { [%eval 0....|               63|    1482|1... e6 { [%eval ...|               65|              128|
|1. f3?! { [%eval ...|https://lichess.o...|       60+0|     815|1. f3?! { [%eval ...|               55|     870|1... e5 { [%eval ...|               55|              110|
|1. e4 { [%eval 0....|https://lichess.o...|       60+0|     961|1. e4 { [%eval 0....|               45|     893|1... b5? { [%eval...|               45|               90|
|1. f3?! { [%eval ...|https://lichess.o...|       15+0|    1500|1. f3?! { [%eval ...|               41|    1546|1... d5 { [%eval ...|               42|               83|
|1. e4 { [%eval 0....|https://lichess.o...|      180+0|    1082|1. e4 { [%eval 0....|               39|    1500|1... e5 { [%eval ...|               39|               78|
|1. d4 { [%eval 0....|https://lichess.o...|      300+0|     870|1. d4 { [%eval 0....|               37|     922|1... e5? { [%eval...|               37|               74|
|1. e4 { [%eval 0....|https://lichess.o...|      600+5|    1772|1. e4 { [%eval 0....|               33|    2057|1... c5 { [%eval ...|               34|               67|
+--------------------+--------------------+-----------+--------+--------------------+-----------------+--------+--------------------+-----------------+-----------------+
'''

# Load the July 2023 dump of rated Lichess games.
chess23_07 = spark.read.parquet("/data/doina/Lichess/lichess_db_standard_rated_2023-07.parquet")

# Same per-colour blunder pipeline as used for the January data. The original
# cell never defined chessBlundersOrdered7, so the show() call below raised a
# NameError.
chessMain7 = (
    chess23_07
    .select("Moves", "Site", "TimeControl", "WhiteElo", "BlackElo")
    # WhiteMoves: strip every "N... <black move>" segment.
    .withColumn(
        "WhiteMoves",
        F.trim(F.regexp_replace(F.col("Moves"), r"\d+\.{3}.*?(?=\d+\.\s|\Z)", "")),
    )
    # BlackMoves: strip every "N. <white move>" segment.
    .withColumn(
        "BlackMoves",
        F.trim(F.regexp_replace(F.col("Moves"), r"\d+\.\s+.*?(?=\d+\.{3}\s|\Z)", "")),
    )
)

# Splitting on "??" yields (number of markers + 1) pieces, hence the "- 1".
chessBlunders7 = chessMain7.select(
    "Moves",
    "Site",
    "TimeControl",
    "WhiteElo",
    "WhiteMoves",
    (F.size(F.split(F.col("WhiteMoves"), r"\?\?")) - 1).alias("WhiteBlunderCount"),
    "BlackElo",
    "BlackMoves",
    (F.size(F.split(F.col("BlackMoves"), r"\?\?")) - 1).alias("BlackBlunderCount"),
    (F.size(F.split(F.col("Moves"), r"\?\?")) - 1).alias("TotalBlunderCount"),
)

# Games with at least one blunder, most blunders first.
chessBlundersOrdered7 = (
    chessBlunders7
    .filter(F.col("TotalBlunderCount") > 0)
    .orderBy(F.col("TotalBlunderCount").desc())
)

chessBlundersOrdered7.show(10)

'''
+--------------------+--------------------+-----------+--------+--------------------+-----------------+--------+--------------------+-----------------+-----------------+
|               Moves|                Site|TimeControl|WhiteElo|          WhiteMoves|WhiteBlunderCount|BlackElo|          BlackMoves|BlackBlunderCount|TotalBlunderCount|
+--------------------+--------------------+-----------+--------+--------------------+-----------------+--------+--------------------+-----------------+-----------------+
|1. a3 { [%eval -0...|https://lichess.o...|      600+5|    2419|1. a3 { [%eval -0...|              113|    2311|1... h6?! { [%eva...|              117|              230|
|1. g4? { [%eval -...|https://lichess.o...|      600+5|    1773|1. g4? { [%eval -...|               96|    1773|1... f5? { [%eval...|               94|              190|
|1. f3?! { [%eval ...|https://lichess.o...|      600+5|    1691|1. f3?! { [%eval ...|               52|    1096|1... Nf6 { [%eval...|               50|              102|
|1. e4 { [%eval 0....|https://lichess.o...|       60+0|    1422|1. e4 { [%eval 0....|               45|    1355|1... f6? { [%eval...|               45|               90|
|1. h3 { [%eval -0...|https://lichess.o...|       60+0|    1869|1. h3 { [%eval -0...|               33|    1817|1... a6 { [%eval ...|               34|               67|
|1. d4 { [%eval 0....|https://lichess.o...|      300+3|    1496|1. d4 { [%eval 0....|               32|    1422|1... e5? { [%eval...|               33|               65|
|1. f4 { [%eval -0...|https://lichess.o...|      240+4|     916|1. f4 { [%eval -0...|               31|     923|1... e5? { [%eval...|               32|               63|
|1. e4 { [%eval 0....|https://lichess.o...|       60+0|     791|1. e4 { [%eval 0....|               31|     749|1... d6 { [%eval ...|               30|               61|
|1. e4 { [%eval 0....|https://lichess.o...|      300+3|     578|1. e4 { [%eval 0....|               27|     594|1... e5 { [%eval ...|               32|               59|
|1. g4? { [%eval -...|https://lichess.o...|      660+0|    1111|1. g4? { [%eval -...|               28|    1046|1... e5 { [%eval ...|               30|               58|
+--------------------+--------------------+-----------+--------+--------------------+-----------------+--------+--------------------+-----------------+-----------------+
'''
# Load the October 2023 dump of rated Lichess games.
chess23_10 = spark.read.parquet("/data/doina/Lichess/lichess_db_standard_rated_2023-10.parquet")

# Same per-colour blunder pipeline as used for the January data. The original
# cell never defined chessBlundersOrdered10, so the show() call below raised a
# NameError.
chessMain10 = (
    chess23_10
    .select("Moves", "Site", "TimeControl", "WhiteElo", "BlackElo")
    # WhiteMoves: strip every "N... <black move>" segment.
    .withColumn(
        "WhiteMoves",
        F.trim(F.regexp_replace(F.col("Moves"), r"\d+\.{3}.*?(?=\d+\.\s|\Z)", "")),
    )
    # BlackMoves: strip every "N. <white move>" segment.
    .withColumn(
        "BlackMoves",
        F.trim(F.regexp_replace(F.col("Moves"), r"\d+\.\s+.*?(?=\d+\.{3}\s|\Z)", "")),
    )
)

# Splitting on "??" yields (number of markers + 1) pieces, hence the "- 1".
chessBlunders10 = chessMain10.select(
    "Moves",
    "Site",
    "TimeControl",
    "WhiteElo",
    "WhiteMoves",
    (F.size(F.split(F.col("WhiteMoves"), r"\?\?")) - 1).alias("WhiteBlunderCount"),
    "BlackElo",
    "BlackMoves",
    (F.size(F.split(F.col("BlackMoves"), r"\?\?")) - 1).alias("BlackBlunderCount"),
    (F.size(F.split(F.col("Moves"), r"\?\?")) - 1).alias("TotalBlunderCount"),
)

# Games with at least one blunder, most blunders first.
chessBlundersOrdered10 = (
    chessBlunders10
    .filter(F.col("TotalBlunderCount") > 0)
    .orderBy(F.col("TotalBlunderCount").desc())
)

chessBlundersOrdered10.show(10)

'''
+--------------------+--------------------+-----------+--------+--------------------+-----------------+--------+--------------------+-----------------+-----------------+
|               Moves|                Site|TimeControl|WhiteElo|          WhiteMoves|WhiteBlunderCount|BlackElo|          BlackMoves|BlackBlunderCount|TotalBlunderCount|
+--------------------+--------------------+-----------+--------+--------------------+-----------------+--------+--------------------+-----------------+-----------------+
|1. e4 { [%eval 0....|https://lichess.o...|    1800+30|    1483|1. e4 { [%eval 0....|              116|    1695|1... e5 { [%eval ...|              116|              232|
|1. e4 { [%eval 0....|https://lichess.o...|      600+0|    2117|1. e4 { [%eval 0....|               69|    1776|1... e5 { [%eval ...|               70|              139|
|1. e3 { [%eval 0....|https://lichess.o...|       30+0|    2078|1. e3 { [%eval 0....|               63|    2038|1... d6 { [%eval ...|               60|              123|
|1. Nf3 { [%eval 0...|https://lichess.o...|      600+0|     446|1. Nf3 { [%eval 0...|               34|     577|1... Nc6 { [%eval...|               38|               72|
|1. e4 { [%eval 0....|https://lichess.o...|      600+0|     941|1. e4 { [%eval 0....|               30|     895|1... e5 { [%eval ...|               32|               62|
|1. f4 { [%eval -0...|https://lichess.o...|      300+3|    1742|1. f4 { [%eval -0...|               26|    1258|1... c5 { [%eval ...|               32|               58|
|1. e4 { [%eval 0....|https://lichess.o...|      600+5|    2055|1. e4 { [%eval 0....|               29|    2186|1... c5 { [%eval ...|               28|               57|
|1. g4? { [%eval -...|https://lichess.o...|      240+0|     983|1. g4? { [%eval -...|               28|    1320|1... a5? { [%eval...|               28|               56|
|1. e4 { [%eval 0....|https://lichess.o...|      180+0|     902|1. e4 { [%eval 0....|               28|     999|1... a5?! { [%eva...|               27|               55|
|1. e4 { [%eval 0....|https://lichess.o...|      180+0|    1183|1. e4 { [%eval 0....|               26|     977|1... e5 { [%eval ...|               28|               54|
+--------------------+--------------------+-----------+--------+--------------------+-----------------+--------+--------------------+-----------------+-----------------+
'''
# Load the December 2023 dump of rated Lichess games.
chess23_12 = spark.read.parquet("/data/doina/Lichess/lichess_db_standard_rated_2023-12.parquet")

# Same per-colour blunder pipeline as used for the January data. The original
# cell never defined chessBlundersOrdered12, so the show() call below raised a
# NameError.
chessMain12 = (
    chess23_12
    .select("Moves", "Site", "TimeControl", "WhiteElo", "BlackElo")
    # WhiteMoves: strip every "N... <black move>" segment.
    .withColumn(
        "WhiteMoves",
        F.trim(F.regexp_replace(F.col("Moves"), r"\d+\.{3}.*?(?=\d+\.\s|\Z)", "")),
    )
    # BlackMoves: strip every "N. <white move>" segment.
    .withColumn(
        "BlackMoves",
        F.trim(F.regexp_replace(F.col("Moves"), r"\d+\.\s+.*?(?=\d+\.{3}\s|\Z)", "")),
    )
)

# Splitting on "??" yields (number of markers + 1) pieces, hence the "- 1".
chessBlunders12 = chessMain12.select(
    "Moves",
    "Site",
    "TimeControl",
    "WhiteElo",
    "WhiteMoves",
    (F.size(F.split(F.col("WhiteMoves"), r"\?\?")) - 1).alias("WhiteBlunderCount"),
    "BlackElo",
    "BlackMoves",
    (F.size(F.split(F.col("BlackMoves"), r"\?\?")) - 1).alias("BlackBlunderCount"),
    (F.size(F.split(F.col("Moves"), r"\?\?")) - 1).alias("TotalBlunderCount"),
)

# Games with at least one blunder, most blunders first.
chessBlundersOrdered12 = (
    chessBlunders12
    .filter(F.col("TotalBlunderCount") > 0)
    .orderBy(F.col("TotalBlunderCount").desc())
)

chessBlundersOrdered12.show(10)

'''
+--------------------+--------------------+-----------+--------+--------------------+-----------------+--------+--------------------+-----------------+-----------------+
|               Moves|                Site|TimeControl|WhiteElo|          WhiteMoves|WhiteBlunderCount|BlackElo|          BlackMoves|BlackBlunderCount|TotalBlunderCount|
+--------------------+--------------------+-----------+--------+--------------------+-----------------+--------+--------------------+-----------------+-----------------+
|1. g3 { [%eval 0....|https://lichess.o...|       60+0|    2115|1. g3 { [%eval 0....|              287|    2073|1... b6 { [%eval ...|              287|              574|
|1. d4 { [%eval 0....|https://lichess.o...|       60+1|    1392|1. d4 { [%eval 0....|              257|    1375|1... d5 { [%eval ...|              258|              515|
|1. d4 { [%eval 0....|https://lichess.o...|      180+0|    1895|1. d4 { [%eval 0....|              254|    1905|1... Nf6 { [%eval...|              254|              508|
|1. e3 { [%eval 0....|https://lichess.o...|       0+12|    1501|1. e3 { [%eval 0....|              207|    1825|1... d5 { [%eval ...|              207|              414|
|1. e4 { [%eval 0....|https://lichess.o...|      180+0|    1983|1. e4 { [%eval 0....|              184|    2052|1... e5 { [%eval ...|              184|              368|
|1. d4 { [%eval 0....|https://lichess.o...|      180+2|    2383|1. d4 { [%eval 0....|              156|    2340|1... Nf6 { [%eval...|              155|              311|
|1. e4 { [%eval 0....|https://lichess.o...|      180+2|    1665|1. e4 { [%eval 0....|              154|    1582|1... e5 { [%eval ...|              156|              310|
|1. d4 { [%eval 0....|https://lichess.o...|      900+0|    1588|1. d4 { [%eval 0....|              155|    1832|1... d5 { [%eval ...|              154|              309|
|1. e4 { [%eval 0....|https://lichess.o...|     1800+0|    2156|1. e4 { [%eval 0....|              154|    2197|1... e5 { [%eval ...|              153|              307|
|1. e4 { [%eval 0....|https://lichess.o...|      180+0|    2503|1. e4 { [%eval 0....|              139|    2461|1... c5 { [%eval ...|              140|              279|
+--------------------+--------------------+-----------+--------+--------------------+-----------------+--------+--------------------+-----------------+-----------------+
'''
