## Imports and data loading

In [1]:
import pandas as pd
import os

# Step 1: Locate the dataset downloaded by kagglehub.
# The default is the original local cache location; set the CHESS_DATASET_PATH
# environment variable to run the notebook elsewhere without editing this cell.
dataset_path = os.environ.get(
    "CHESS_DATASET_PATH",
    "/Users/husnainsyed/.cache/kagglehub/datasets/arevel/chess-games/versions/1",
)

# Find the CSV file.
# - sorted(): os.listdir returns entries in arbitrary order, so sorting makes
#   the pick deterministic if the folder ever holds more than one CSV.
# - default of None: turns a bare StopIteration into an actionable error
#   when the folder contains no CSV at all.
csv_file = next(
    (f for f in sorted(os.listdir(dataset_path)) if f.endswith(".csv")),
    None,
)
if csv_file is None:
    raise FileNotFoundError(f"No .csv file found in {dataset_path!r}")
csv_path = os.path.join(dataset_path, csv_file)

# Load a memory-friendly subset (all columns): only the first N_ROWS rows.
N_ROWS = 3_000_000
df = pd.read_csv(csv_path, nrows=N_ROWS)




In [2]:
# Show the dtype pandas assigned to each column of the loaded frame.
df.dtypes

Event               object
White               object
Black               object
Result              object
UTCDate             object
UTCTime             object
WhiteElo             int64
BlackElo             int64
WhiteRatingDiff    float64
BlackRatingDiff    float64
ECO                 object
Opening             object
TimeControl         object
Termination         object
AN                  object
dtype: object

### Checking for any duplicates

In [3]:
# Rows that exactly repeat an earlier row (the first occurrence is not flagged).
duplicates = df.loc[df.duplicated(keep='first')]
print(duplicates)


          Event     White      Black Result     UTCDate   UTCTime  WhiteElo  \
369577   Blitz   toinhoxd  Alajandro    0-1  2016.07.02  19:17:26      1482   

        BlackElo  WhiteRatingDiff  BlackRatingDiff  ECO            Opening  \
369577      1667             -5.0              5.0  C00  French Defense #2   

       TimeControl   Termination            AN  
369577       300+0  Time forfeit  1. e4 e6 0-1  


In [4]:
# Drop exact repeats (the first occurrence of each row is kept), then run the
# duplicate check again: an empty frame confirms nothing is left to remove.
df = df.drop_duplicates(keep='first')
duplicates = df.loc[df.duplicated(keep='first')]
print(duplicates)

Empty DataFrame
Columns: [Event, White, Black, Result, UTCDate, UTCTime, WhiteElo, BlackElo, WhiteRatingDiff, BlackRatingDiff, ECO, Opening, TimeControl, Termination, AN]
Index: []


### I noticed leading and trailing whitespace in some string values, so I am stripping it from all string columns

In [5]:
# Labels of the columns stored with dtype 'object' (the string columns)
str_cols = df.select_dtypes(include='object').columns

# Trim leading and trailing whitespace, one string column at a time
for col in str_cols:
    df[col] = df[col].str.strip()


In [6]:
# List the distinct values of the Event column (checked after whitespace stripping).
df['Event'].unique()


array(['Classical', 'Blitz', 'Blitz tournament', 'Correspondence',
       'Classical tournament', 'Bullet tournament', 'Bullet'],
      dtype=object)

### So what is chess's most important format? Everyone that ESPN has spoken to on the sidelines of the Chennai Grand Masters has a unanimous verdict - classical. That's where every chess player wants to be world champion. (https://www.espn.co.uk/chess/story/_/id/45963789/which-chess-most-important-format-frequent-switching-raises-questions, Aug 13, 2025, 03:10 PM)

In [7]:
# Classical is the standard chess format, so it is the reference for this
# analysis: keep only rows whose Event is exactly 'Classical'.
df_event = df.loc[df['Event'].eq('Classical')]

### Because this analysis looks for openings that help White win, only games with a White-win result are kept

- Result: Game result (1-0 White wins) (0-1 Black wins) (1/2-1/2 draw) (* no result recorded)

In [8]:
# Count how many Classical games ended with each Result value.
df_event['Result'].value_counts()

Result
1-0        361261
0-1        334151
1/2-1/2     29693
*             200
Name: count, dtype: int64

In [9]:
# Keep only the Classical games that White won (Result is '1-0'), then
# display (rows, columns) to confirm the size of the subset.
white_df = df_event.loc[df_event['Result'].eq('1-0')]
white_df.shape

(361261, 15)

In [10]:
# Average rating of the White player across the games White won.
white_df_means = white_df.loc[:, 'WhiteElo'].mean()
white_df_means

np.float64(1722.2931398628693)

| Rating Range | Category |
|--------------|----------------------------------------------------|
| 2700+        | Informally termed as ‘Super Grandmasters’       |
| 2500-2700    | Most Grandmasters (GM)                           |
| 2400-2500    | Most International Masters (IMs) and some GMs    |
| 2300-2400    | Most FIDE Masters (FMs) and some IMs            |
| 2200-2300    | FIDE Candidate Masters (CMs), some National Masters (NMs) |
| 2000-2200    | Candidate Masters (CMs)                          |
| 1800-2000    | Class A, category 1                               |
| 1600-1800    | Class B, category 2                               |
| 1400-1600    | Class C, category 3                               |
| 1200-1400    | Class D, category 4                               |
| Below 1200   | Novices                                           |
(https://squareoffnow.com/blog/chess-ranking-system/?srsltid=AfmBOooSHqtiJMI7quZTvrIdvd08mv15nIz4a6OUrN2_jXXeyeuwIfhE, 2023)

### For this analysis I will keep games where White is rated Class D or higher (WhiteElo >= 1200), as Class D is the most basic level above novice

In [14]:
# Keep only White wins where White's rating is at or above the Class D floor.
MIN_WHITE_ELO = 1200  # lower bound of Class D in the rating table above

# Filter and renumber the rows 0..n-1 in one chained expression. This builds a
# fresh frame instead of calling reset_index(inplace=True) on the result of a
# boolean filter, so re-running the cell never mutates a derived frame.
final_df = white_df.loc[white_df['WhiteElo'] >= MIN_WHITE_ELO].reset_index(drop=True)
final_df.shape

(357367, 15)

In [15]:
# Preview the first 10 rows of the filtered frame.
final_df.head(10)

Unnamed: 0,Event,White,Black,Result,UTCDate,UTCTime,WhiteElo,BlackElo,WhiteRatingDiff,BlackRatingDiff,ECO,Opening,TimeControl,Termination,AN
0,Classical,eisaaaa,HAMID449,1-0,2016.06.30,22:00:01,1901,1896,11.0,-11.0,D10,Slav Defense,300+5,Time forfeit,1. d4 d5 2. c4 c6 3. e3 a6 4. Nf3 e5 5. cxd5 e...
1,Classical,fabikim,sereno,1-0,2016.06.30,22:00:02,1630,1500,7.0,-7.0,C41,Philidor Defense #3,420+5,Normal,1. e4 e5 2. Nf3 d6 3. Bc4 Nc6 4. d3 Na5 5. Na3...
2,Classical,dvs15lsa,xrap35,1-0,2016.06.30,22:00:03,1635,1572,9.0,-10.0,C54,"Italian Game: Classical Variation, Greco Gambi...",300+8,Normal,1. e4 e5 2. Bc4 Nc6 3. Nf3 Bc5 4. c3 Nf6 5. d4...
3,Classical,sarraff12,AL3103,1-0,2016.06.30,22:00:14,1751,1467,4.0,-4.0,C40,King's Pawn Game: Busch-Gass Gambit,900+0,Time forfeit,1. e4 e5 2. Nf3 Bc5 3. Nc3 d6 4. h3 a6 5. d3 f...
4,Classical,xxrobertoxx,mama29,1-0,2016.06.30,22:00:11,1577,1377,12.0,-6.0,B40,Sicilian Defense: Paulsen-Basman Defense,480+8,Normal,1. e4 { [%eval 0.23] } 1... e6 { [%eval 0.2] }...
5,Classical,daan1982,Veska,1-0,2016.06.30,22:00:20,1734,1500,4.0,-87.0,C20,Bishop's Opening: Boi Variation,120+10,Time forfeit,1. e4 e5 2. Bc4 Bc5 3. Nc3 Qf6 4. Qf3 Nh6 5. Q...
6,Classical,n00bi,nedmac101,1-0,2016.06.30,22:00:20,1537,1433,8.0,-8.0,B07,Pirc Defense #5,420+8,Time forfeit,1. e4 d6 2. d4 c6 3. d5 c5 4. c4 Nf6 5. f3 g6 ...
7,Classical,niubo,cnv,1-0,2016.06.30,22:00:22,1903,1951,14.0,-14.0,C22,Center Game: Berger Variation,120+12,Normal,1. e4 e5 2. d4 exd4 3. Qxd4 Nc6 4. Qe3 Nf6 5. ...
8,Classical,delavoro,ONeologista,1-0,2016.06.30,22:00:16,1621,1281,4.0,-6.0,C42,Russian Game: Three Knights Game,900+0,Normal,1. e4 { [%eval 0.31] } 1... Nf6 { [%eval 0.56]...
9,Classical,Nutsynuts,vilder,1-0,2016.06.30,22:00:23,1523,1531,12.0,-11.0,C00,French Defense: Normal Variation,600+0,Normal,1. e4 e6 2. d4 Qh4 3. e5 d6 4. Nf3 Qe4+ 5. Be3...
