# Analytics
 
The final data model includes one fact table:

1) `games`

— and four dimension tables:

1) `player`
2) `opening`
3) `platform`
4) `time_class`

These tables are loaded below so that we can run two sample analytics on them.


In [1]:
import pandas as pd
import yaml, os
import boto3
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark import SQLContext

In [2]:
#Load settings from yaml file
with open(r'config/dl-chessdotcom.yaml') as file:
    config = yaml.load(file,Loader=yaml.SafeLoader)

os.environ['AWS_ACCESS_KEY_ID']=config['aws_access_key_id']
os.environ['AWS_SECRET_ACCESS_KEY']=config['aws_secret_key_id']

In [3]:
spark = SparkSession \
    .builder \
    .appName("Run Analytics on Lichess/Chess.com fact/dim tables") \
    .getOrCreate()

spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.aws.credentials.provider","org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.access.key",config['aws_access_key_id'])
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.secret.key",config['aws_secret_key_id'])

In [4]:
# Set which outpath path we need - either local or s3.
# output_data = config['output_data_path_local']
output_data = "s3a://" + config['output_data_path_s3']

# Load Dimension Tables

### Load `player` dim table

In [5]:
df_player = spark.read.parquet(output_data + "/dim/player/")

In [6]:
df_player.limit(10).toPandas()

Unnamed: 0,id,username
0,473e6736e1f251d6a6d3220b5102d814a4c98975,daanbrandenburg
1,20aa1ce2708e1eeea4133a80c8e0ff7336ae8748,LPSupi
2,d942514369054b1b6b9f694d782d9c61741856a3,HansOnTwitch
3,04f40ab2a981346b6760409e0ea4f30c4687d3c1,Vasily0488
4,4a005c9b160128716ba471b6bdcf87867ceb2bcb,DAVIDZIROYAN
5,a2d71ac8ac3769900d45268f22bb544910111c43,bilelou
6,47f27131c75b2bbfbeebfb382c33915f9d15f747,UlvinGaming
7,71b6b06fef5d8aa08d2dca213409b8100c826aa4,Kruci96
8,7e06d2fe649c478e9b07770693ed35781b3feefa,sion2200
9,b62b5397905fb7bbe5294264049e64228c57ff33,MomchilPetkov05


In [12]:
df_player.createOrReplaceTempView("player")

### Load `opening` dim table

In [30]:
df_opening = spark.read.parquet(output_data + "/dim/opening/")


In [31]:
df_opening.limit(10).toPandas()

Unnamed: 0,opening_id,opening
0,52c1048b411bc4ce3885fa4406b07dfeaa8bafce,Zukertort Opening: Queen's Gambit Invitation
1,47a0d6bccb0cefe4a536c1eb88de687a7c9206f5,Kings Pawn Opening Owens Defense 2.d4 Bb7
2,42b1b40e0825eb249201d82259b80213899310fd,French Defense Exchange Variation...4.Bd3 Bd6 ...
3,c91c022eee9ca8017a62676c793d02dce9b3e138,Sicilian Defense Kan Variation 5.Be2 Nf6
4,4010a8e7dd2afd0f484560eec1a0434f4dce64a1,Kings Indian Attack 2...Nc6 3.Bg2 e5
...,...,...
95,6adb9d6cd2960e46ec3bb86248b8284e967575fe,King's Indian Defense: Orthodox Variation
96,8c2fb78511b8c0be50673aa2265f58994668ea4b,Benko Gambit Fully Accepted 5...g6
97,fc255df423a14bf3bc619cf955978cadca45f3d3,Ruy Lopez Opening Morphy Defense Chigorin Defe...
98,e8236206c70f6c7024a9ebc2c3174d2de8950847,Petrovs Defense Classical Cozio Attack


In [66]:
df_opening.createOrReplaceTempView("opening")

### Load `time_class` dim table

In [33]:
df_time_class = spark.read.parquet(output_data + "/dim/time_class/")


In [34]:
df_time_class.limit(10).toPandas()

Unnamed: 0,time_class_id,time_class
0,f915e10481634c0a44492699b2b8a3657c334106,correspondence
1,7bff1b790fcdfa5016c20a07a145631da3fe3cfa,ultraBullet
2,4a19573b7e72b7249d2271839c762ffe1f0452f3,classical
3,472da2b94e9fa87badd16a55e1eaec4f53ffc52a,bullet
4,dc94ac81aa982e5382b814d4d883bfdb2ed62ddf,rapid
5,2fe14b9b993ea6eb953f8129c7a1edede9792b77,daily
6,6d199aca996a9a8bff542e2bc10e3f0edc62cd07,blitz


In [35]:
df_time_class.createOrReplaceTempView("time_class")

### Load `platform` dim table

In [36]:
df_platform = spark.read.parquet(output_data + "/dim/platform/")


In [37]:
df_platform.limit(10).toPandas()

Unnamed: 0,platform_id,platform
0,224fccb5995aa2f6fd208d59570e06a95a438283,chessdotcom
1,e14d455ef2680fa21e1299985922a77b8cd28eaf,lichess


In [None]:
df_platform.createOrReplaceTempView("platform")

# Load `games` fact table

In [9]:
df_games = spark.read.parquet(output_data + "/fact/game")

In [10]:
df_games.createOrReplaceTempView("games")

In [63]:
df_games.limit(10).toPandas()

Unnamed: 0,game_id,game_end_time,game_end_date,white_id,white_rating,black_id,black_rating,winner,termination,opening_id,moves,platform_id,year,time_class_id
0,01147becfd556b9948605053d3693e37569d245f,2020-09-06 06:26:06,2020-09-06,bb3337775b465f203e7702d359f3cced75b036fb,2571,937839599d48147563fef669aa58c04e3ccce62d,2576,white,Evandro_Barbosa won by resignation,f5943448862a048431d01367524737241307283b,1. e4 {[%clk 0:02:59.8]} 1... d6 {[%clk 0:02:5...,224fccb5995aa2f6fd208d59570e06a95a438283,2020,6d199aca996a9a8bff542e2bc10e3f0edc62cd07
1,014602a19a14da661e5b0c1b451e5508dd33e58b,2020-05-03 09:43:41,2020-05-03,612a26f6f75868ead6fff98899bddacfae18aba7,2519,df8ccc9959d877b3e49830342781d9ef1d0cdc7e,2518,,Game drawn by repetition,4d64483ff0d318f1060962be9aff0779179c7885,1. d4 {[%clk 0:02:59.9]} 1... Nf6 {[%clk 0:02:...,224fccb5995aa2f6fd208d59570e06a95a438283,2020,6d199aca996a9a8bff542e2bc10e3f0edc62cd07
2,01561a638e95ca0a52031a4101426ad2c5964877,2020-03-04 02:26:00,2020-03-04,47c906f9068fb2df4eb91dca14fa2da23d2c1128,2508,758dd33dce8c9846e69de910d01a709153f866cf,2429,white,Akshayraj_Kore won by checkmate,86fd80e0196b4168c0b25006b668a932833a18c3,1. d4 {[%clk 0:03:01.9]} 1... Nf6 {[%clk 0:03:...,224fccb5995aa2f6fd208d59570e06a95a438283,2020,6d199aca996a9a8bff542e2bc10e3f0edc62cd07
3,016146eec71a4105a0a5202b5b648365ffe91f19,2020-12-12 05:36:59,2020-12-12,4316b167bca6aaea2b407b31427793db07404188,2309,8d4758c815beec66e7a663d10bdccfc2bd500c84,2255,white,,,,224fccb5995aa2f6fd208d59570e06a95a438283,2020,6d199aca996a9a8bff542e2bc10e3f0edc62cd07
4,01bb216a0a190a6634c5fcd773aea3aba6fa7fd4,2020-11-16 12:38:16,2020-11-16,08268f2c529d874f0e3b4a698e170ce3fa22ce16,2919,324b6adc52d1ded85ae8e6bb1ab81151277e593b,2806,,Game drawn by insufficient material,01e332f824ba36196b0ec09d3e1359d4def5463a,1. d4 {[%clk 0:02:59.9]} 1... Nf6 {[%clk 0:02:...,224fccb5995aa2f6fd208d59570e06a95a438283,2020,6d199aca996a9a8bff542e2bc10e3f0edc62cd07
5,01d35a3dde171d535c1b2d94d0b04868822e67e7,2020-04-28 03:47:53,2020-04-28,99fa53bdcf3207d12968bbf3e96af07b43f86692,2705,1e86421a0b47fad9fe5b3d6050005badc932d5d0,2612,black,chessintuit won by resignation,0aa7785ec8e2b0e1be7355aa25aa13e16e9b50ea,1. c4 {[%clk 0:02:59.9]} 1... e5 {[%clk 0:02:5...,224fccb5995aa2f6fd208d59570e06a95a438283,2020,6d199aca996a9a8bff542e2bc10e3f0edc62cd07
6,01d4ca16de68d4904cfa83a7cb01d507c5c9d787,2020-05-08 01:58:09,2020-05-08,103a98c4cc9a3f4e75f27dfe3ced547faf4fa797,2568,e5ae674fd91eb4a6e7588a723fb7f934dc4a7ebb,2535,white,Problematique_99 won by checkmate,69071d12988709f75b2a072a1e067dc0bf78c185,1. e4 {[%clk 0:02:59.9]} 1... c5 {[%clk 0:02:5...,224fccb5995aa2f6fd208d59570e06a95a438283,2020,6d199aca996a9a8bff542e2bc10e3f0edc62cd07
7,01df210bb83bd22b3d66031b6eb73a104ca85963,2020-04-02 04:17:06,2020-04-02,397547b94c443978ad4dbcdfb2e984fce99e00e7,2795,a3070278a0ff5df85573ab6d652f29dad53b0b4d,2629,white,Alexander_Donchenko won on time,03ebe9dd96faf77533cbe1239f82d5940c1c98c2,1. e4 {[%clk 0:02:59.9]} 1... d6 {[%clk 0:02:5...,224fccb5995aa2f6fd208d59570e06a95a438283,2020,6d199aca996a9a8bff542e2bc10e3f0edc62cd07
8,0411ad1f1f734050a31f9395ff1d3b86bb7e6193,2020-02-16 02:04:31,2020-02-16,5d59b88a729f5b20615bd75e80c646b7d102e89c,2991,030cfb9b8f2f7509a7e0c4f2b8aa21fe03438670,2950,black,Nowaybacksasha won on time,aeaa9c54dc1dfa5a9c61aa1d96c39b2e54db69c4,1. d4 {[%clk 0:02:59.3]} 1... Nf6 {[%clk 0:02:...,224fccb5995aa2f6fd208d59570e06a95a438283,2020,6d199aca996a9a8bff542e2bc10e3f0edc62cd07
9,0452826aa67e6b7039f7f15d429ffedb0a45a11b,2020-10-28 12:22:31,2020-10-28,bb50cfa6c061744032d4d2332946c25ac964feda,2770,ac8cfa3ee8f7976596c44b59c80b3b6392754080,2606,white,A-Fier won by resignation,acedb53328743cc863cc871362b0ed779a961da7,1. d4 {[%clk 0:03:00.9]} 1... Nf6 {[%clk 0:02:...,224fccb5995aa2f6fd208d59570e06a95a438283,2020,6d199aca996a9a8bff542e2bc10e3f0edc62cd07


# Run Analytics

### Most popular openings

In [76]:
most_popular_openings = spark.sql("""

    select
        games.opening_id,
        opening.opening,
        count(*) as game_count
    from games
    
    left join opening on games.opening_id = opening.opening_id
    
    group by games.opening_id, opening
    
    order by game_count desc

""")

In [77]:
most_popular_openings.limit(10).toPandas()

Unnamed: 0,opening_id,opening,game_count
0,,,240
1,815e009981ef6db3e5668790f3208fe03f6da802,Queens Pawn Opening Zukertort Variation,66
2,4966d04e85a1417fcbffa7c6ee7b6102c29ce8b5,Indian Game,65
3,bb4d72d80d54f9690ff399572990aa4cc82753c3,English Opening Anglo Indian Kings Knight Vari...,65
4,2e3798c3a0b2cf626920352064a88f6a43e9ac96,Reti Opening Kingside Fianchetto Variation,51
5,4d64483ff0d318f1060962be9aff0779179c7885,Queens Gambit Declined 3.Nf3 Nf6,47
6,8116eace38eeef8db24626be42b3d4ef1fb7c3e6,Indian Game Knights Variation,43
7,cf29c0886d77da5349732659848f36b7bac70353,Nimzowitsch Larsen Attack,39
8,30c36e0b5b97e18e04f1856926f76b53fed07195,Queens Pawn Opening Chigorin Variation 2...Nf6,32
9,c0f673b9126e38247ac8f6febea3229944f5c8fa,Nimzowitsch Larsen Attack Classical Variation ...,31


### Players with most wins

In [None]:
players_most_wins = spark.sql("""

    with white_wins as (
    
    select
        player.username,
        sum(case when winner = "white" then 1 else 0 end) as white_wins,
        count(*) as white_games
    from games
    
    left join player on games.white_id = player.id
    
    group by username

    ),
    
    black_wins as(
    
    select
        player.username,
        sum(case when winner = "black" then 1 else 0 end) as black_wins,
        count(*) as black_games
    from games
        
    left join player on games.black_id = player.id
    group by username

    )
    
    select
        white_wins.username,
        white_wins.white_wins,
        white_wins.white_games,
        black_wins.black_wins,
        black_wins.black_games
    from white_wins
    
    inner join black_wins on white_wins.username = black_wins.username
    
    group by white_wins.username, white_wins, white_games, black_wins, black_games
    
    order by white_wins desc
    

""")

In [15]:
players_most_wins.limit(10).toPandas()

Unnamed: 0,username,white_wins,white_games,black_wins,black_games
0,DanielNaroditsky,101,167,81,132
1,abykhovsky,78,158,70,140
2,erichansen,74,124,60,120
3,Byniolus,70,102,76,158
4,nihalsarin2004,56,78,53,76
5,DryCounty,52,88,48,98
6,Firouzja2003,50,85,46,88
7,Bigfish1995,50,102,52,96
8,Duhless,44,58,34,52
9,BrandonJacobson,42,70,30,62
