In [76]:
#!/usr/bin/env python
# coding: utf-8

import requests
import json
import os
import urllib.request
import pandas as pd
import re
#import cx_Oracle
from datetime import datetime, timedelta

folder_path = os.path.abspath("./")
PGNfolder = folder_path + "/PGN"

#print(PGNfolder)

def get_PGN(player, pgn = False):
    
    if pgn == False:
        pgn = ''
        
    elif pgn != False:
        pgn = '/pgn'
    
    #Gets the pgn files from the chesscom api and saves it locally by month file
    pgn_archives = requests.get('https://api.chess.com/pub/player/'+player.lower()+'/games/archives')
    #garantees that the player will always be lowercase, in case the user writes it differently
    print("Solving %s online data..." % player)
    try:
        #print(json.loads(pgn_archives.content)["archives"])
        skip_refiller = True #this variable helps to check if I have to remove the last month and redownload it
        if not os.path.exists(PGNfolder):
            #check if the main /pgn folder exist, if not it creates
            os.makedirs(PGNfolder)
            
        for month_url in json.loads(pgn_archives.content)["archives"]:
            #goes through all the archives and check each month
            user_folder = PGNfolder + '/' + month_url.split("/")[-4]
            folderpath = user_folder + "/" + month_url.split("/")[-2] + "-" + month_url.split("/")[-1]

            if not os.path.exists(user_folder):
                #checks if the name folder existis, if not it will create
                os.makedirs(user_folder)
            
            if not os.path.exists(folderpath+".txt"):
                #check if the month file exists, if not it will create
                urllib.request.urlretrieve(month_url + pgn, folderpath+".txt")
                print("New folder found %s" % folderpath)
                skip_refiller = False #it's a completely new month, it won't redownload it
        
        if skip_refiller:
            os.remove(folderpath + ".txt")
            urllib.request.urlretrieve(month_url + pgn, folderpath+".txt")
            print("Refilling folder %s" % folderpath)
        
        print("%s data solved." % player)
        return(True)
    except KeyError: #if the given player doesn't have an archive nor exists, it'll return as player not found and boolean False
        print("player not found")
        return(False)
    
def extract_data(filepath, pgn = True):
    #Loads the PGN files from the local folders
    #print("Reading PGN")
    with open(filepath) as f:
        #print(f.readlines())
        if pgn == True:
            return(f.readlines())
        elif pgn == False
            data = []
            data1 = f.readlines()
            for i in range(len(json.loads(data1[0])['games'])):
                try:
                    data.append(json.loads(data1[0])['games'][i]['pgn'].split('\n'))
                    data[i][-1] = json.loads(data1[0])['games'][i]['time_class'] #=data[i][:-1]
                
                except:
                    pass
            return(data)

def data_delimiter(data):
    #Returns two lists: One with the beginnings and another with the endings of each game in a data list
    start = []
    end = []
    
    for i,j in enumerate(data):
        
        if j.startswith("[Event"):
            start.append(i)
            if i != 0:
                end.append(i - 2)

    end.append(len(data))

    return(start, end)

def PGNExtract(data):
    s = data.split(" ")
    
    game = "1."
    
    for i, j in enumerate(s):
        
        if j != "1.":
            if ("1-0" in j) or ("0-1" in j) or ("1/2-1/2" in j):
                j = j[:-1]
                game = game + " " + j

            elif ("{" not in j) and ("}" not in j) and ("..." not in j):
                game = game + " " + j            
            
    return(game)

def pieceMoveCounter(moves, playerColor, timeControl_is, id_):
    #This function counts how many time I moved each piece
    s = moves.split(" ")
    
    pieceMoves[id_] = {"Q" : 0,
                       "N" : 0,
                       "R" : 0,
                       "K" : 0,
                       "B" : 0,
                       "p" : 0,
                       "O_O" : 0,
                       "O_O_O" : 0,
                       "x" : 0,
                       "check" : 0}
    
    if playerColor == "White":
        somador = 1
        
    elif playerColor == "Black":
        somador = 2
    
    for i, move in enumerate(s):
        if "." in move:
            if ("1-0" in s[i + somador]) or ("1/2-1/2" in s[i+somador]) or ("0-1" in s[i+somador]):
                break
            else:
                #print(s[i+somador])
                if "+" in s[i+somador]:
                    pieceMoves[id_]["check"] += 1

                if "x" in s[i+somador]:
                    pieceMoves[id_]["x"] += 1

                if "Q" in s[i+somador]:
                    pieceMoves[id_]["Q"] += 1

                elif "N" in s[i+somador]:
                    pieceMoves[id_]["N"] += 1

                elif "R" in s[i+somador]:
                    pieceMoves[id_]["R"] += 1

                elif "K" in s[i+somador]:
                    pieceMoves[id_]["K"] += 1

                elif "B" in s[i+somador]:
                    pieceMoves[id_]["B"] += 1

                elif "O-O-O" in s[i+somador]:
                    pieceMoves[id_]["O_O_O"] += 1

                elif "O-O" in s[i+somador]:
                    pieceMoves[id_]["O_O"] += 1

                else:
                    pieceMoves[id_]["p"] += 1

def transform_data(data, player, start = False, end = False):
    #print(data)
    
    pattern = "\"(.*?)\"" #pattern for regular expression delimiting data between ""
    allGames = []
    if start == False: #if .txt has more than the pgn itself
        counter = data
        pgn_list_spot = -2
    else: #if the .txt has only the pgn
        counter = start
        pgn_list_spot = -1
        
    for i in range(0, len(counter)):
        
        inGame = []
        #game = data[start[i]].split('\n')
        if start == False: #if .txt has more than the pgn itself
            game = data[i] #game delimitation  
            print(game)
            print('---')
        else: #if the .txt has only the pgn
            game = data[start[i]:end[i]] #game delimitation  

        if game[10].startswith("[ECOUrl"):
            whitePlayer = re.search(pattern, game[4]).group(1)
            if whitePlayer == player: #se o player estiver de brancas
                inGame.append(whitePlayer) #append player
                inGame.append("White") #append player color
                inGame.append(re.search(pattern, game[5]).group(1)) #append black player

            else:
                inGame.append(re.search(pattern, game[5]).group(1)) #append black player
                inGame.append("Black") #append player color
                inGame.append(whitePlayer) #append opponent

            winner = re.search(pattern, game[16]).group(1) #get winner and winning reason
            winner = winner.split(" ")

            if winner[0] == player:
                inGame.append('Winner') #indicates that the player won
                inGame.append(winner[-1]) #winning reason

            elif winner[0] == 'Game':
                inGame.append('Draw') #indicates that the player drew
                inGame.append(winner[-1]) #winning reason

            else:
                inGame.append('Loser') #indicates that the player lost
                inGame.append(winner[-1]) #winning reason

            if whitePlayer == player: #se o player estiver de brancas
                inGame.append(re.search(pattern, game[13]).group(1)) #player ELO
                inGame.append(re.search(pattern, game[14]).group(1)) #opponent ELO

            else: #Se o jogador estiver de pretas
                inGame.append(re.search(pattern, game[14]).group(1)) #Player ELO
                inGame.append(re.search(pattern, game[13]).group(1)) #Opponent ELO

            inGame.append(re.search(pattern, game[15]).group(1)) #Defines time control
            #print(game)
            dateObject = datetime.strptime(re.search(pattern, game[2]).group(1), "%Y.%m.%d") #Defines date in UTC
            time_utc = re.search(pattern, game[12]).group(1) #get the time in UTC

            #Here converts time UTC to UTC-3, since I'm brazilian
            utc_dt = dateObject.strftime("%d/%m/%Y") + " " + time_utc
            utc_dt = datetime.strptime(utc_dt, "%d/%m/%Y %H:%M:%S")
            utc_br = utc_dt - timedelta(hours=3)

            inGame.append(utc_br.strftime("%d/%m/%Y")) #Set date as dd/mm/YYYY

            inGame.append(utc_br.strftime("%H:%M:%S")) #insert time into the list

            pattern2 = "\"https://www.chess.com/openings/(.*?)\"" #gets the link our of the equation
            substring = re.search(pattern2, game[10]).group(1) #same
            
            pattern2 = "(.*?)\."
            #substring2 = re.search(pattern, substring).group(1)
            try: #this will try to remove the ...nf3 stuff
                substring = re.search(pattern2, substring).group(1)
                substring = substring.replace("-1", "")
                substring = substring.replace("-2", "")
                substring = substring.replace("-3", "")
                substring = substring.replace("-4", "")
                substring = substring.replace("-5", "")
                substring = substring.replace("-6", "")
            except: #if there's no stuff like this, continues normally
                substring = substring.replace("-1", "")
                substring = substring.replace("-2", "")
                substring = substring.replace("-3", "")
                substring = substring.replace("-4", "")
                substring = substring.replace("-5", "")
                substring = substring.replace("-6", "")
                
                        
            if "Opening" in substring:
                pattern2 = "(.*?)-Opening" #gets the link our of the equation
                substring = re.search(pattern2, substring).group(1) + "-Opening"#same
                #print(substring)
                
            if "Defense" in substring:
                pattern2 = "(.*?)-Defense" #gets the link our of the equation
                substring = re.search(pattern2, substring).group(1) + "-Defense"#same
                #print(substring)
                
            if "Game" in substring:
                pattern2 = "(.*?)-Game" #gets the link our of the equation
                substring = re.search(pattern2, substring).group(1) + "-Game"#same
                #print(substring)
            
            if "Attack" in substring:
                pattern2 = "(.*?)-Attack" #gets the link our of the equation
                substring = re.search(pattern2, substring).group(1) + "-Attack"#same
                #print(substring)
            
            #print(substring)
            inGame.append(substring)

            gamePGN = PGNExtract(game[pgn_list_spot])

            inGame.append(gamePGN) #pgn transformed into a string
            
            try: #the id from the game link is the id in the database
                pattern2 = "\"https://www.chess.com/game/live/(.*?)\"" #gets the link our of the equation
                inGame.append(re.search(pattern2, game[20]).group(1)) #same
            except:
                pattern2 = "\"https://www.chess.com/game/daily/(.*?)\"" #gets the link our of the equation
                inGame.append(re.search(pattern2, game[20]).group(1)) #same
            
            
            if start == False:
                inGame.append(game[-1])
                
            else:
                inGame.append("-")
                    
            #Here it goes to the function to count how many times I moved each piece
            pieceMoveCounter(gamePGN, inGame[1], inGame[-6], inGame[-2])

            #print("---")
            #print(inGame)
            allGames.append(inGame)

            
    return(allGames)
     
    
#Get some information to print later by a given dataframe
def get_print_info(dataFrame):
    
    #Total games played
    total_games = dataFrame.shape[0]
    
    #How many games tha player won
    games_won = dataFrame[dataFrame.result == 'Winner'].shape[0]
    #How many games tha player won
    games_lost = dataFrame[dataFrame.result == 'Loser'].shape[0]
    #How many games tha player won
    games_drew = dataFrame[dataFrame.result == 'Draw'].shape[0]
    
    return(games_won, games_lost, games_drew, total_games)

#Returns a percentage. To recude verbosity
def percent(numerator, denominator):
    return(round(((numerator / denominator)*100), 2))

#returns a dataframe with only rows where the player used a specific color and the most played opening
def get_color_opening(df_in, color):
    df_Games = df_in[df_in.playerColor == color] #Gets the chosen color
    df_Games = df_Games[df_Games.opening == df_Games.opening.mode()[0]] #Returns the most played opening in that color
    return(df_Games)

#Print some data from a given dataframe
def color_analysis_print(df_color):
    #Gets the player color
    color = df_color.playerColor.iloc[0]
    #get's some information to print
    games_won, games_lost, games_drew, total_games = get_print_info(df_color)

    percent_win = percent(games_won, total_games)
    percent_draw = percent(games_drew, total_games)
    percent_lost = percent(games_lost, total_games)
    #Print the information
    print("    %s: %s, with %s%% winrate" % (color, df_color.opening.mode()[0], percent_win))
    
        
def extract(player, pgn = False):
    dfColumns = ["player", "playerColor", "opponent", "result", "winningReason", "playerElo", "oponentElo", "timeControl", 'date', 'time', 'opening', 'pgn', 'id', 'timeClass']
    global pieceMoves
    pieceMoves = {}
    playerExists = get_PGN(player, pgn = pgn) #tries to download player PGN data
    if playerExists: #If the player exists, then:        

        df = pd.DataFrame(columns=dfColumns)
        #pieceMoves_df = pd.DataFrame(columns=pieceMovesColumns)
        with os.scandir(PGNfolder + "/" + player.lower()) as folders: #goes through all the player month files
            print("Creating dataframe")
            for file in folders:
                #print(file)

                data = extract_data(file, pgn = pgn) #load each month file into the memory

                if pgn == True:
                    start, end = data_delimiter(data) #defines the limits of each data part
                    allGames = transform_data(data, player, start, end)
                #print(data[start[k]:end[k]])
                else:
                    allGames = transform_data(data, player)#, start, end)
                
                #break
                df2 = pd.DataFrame(data=allGames, columns=dfColumns)
                df = df.append(df2, ignore_index=True)
                
                if pgn == True:
                    df.drop(columns=['timeClass'], inplace=True)
                
    else: #If the player does not exist, it stops
        print("There's no %s data on chess.com" % player)

    print("Done")   
    return(df)

def main():
    print('teste')

if __name__ == "__main__":
    print("-")
    #main("GMKrikor")


-


In [None]:

with os.scandir('PGN' + "/" + 'hallsand'.lower()) as folders: #goes through all the player month files
    print("Creating dataframe")
    for file in folders:

        data1 = extract_data(file) #load each month file into the memory
        data = []
        for i in range(len(json.loads(data1[0])['games'])):
            data.append(json.loads(data1[0])['games'][i]['pgn'])

        print(data)

In [27]:
json.loads(data1[0])['games'][0]['time_class']

'daily'

In [79]:
df = extract('Hallsand', pgn = False)

Solving Hallsand online data...
Refilling folder /home/julio/Desktop/Data_Projects/chess/chess-analysis/PGN/hallsand/2021-11
Hallsand data solved.
Creating dataframe


JSONDecodeError: Expecting value: line 1 column 2 (char 1)

In [74]:
df

Unnamed: 0,player,playerColor,opponent,result,winningReason,playerElo,oponentElo,timeControl,date,time,opening,pgn,id
0,Hallsand,White,Osvaldo937,Winner,time,1119,1000,1/86400,27/03/2020,00:29:33,Four-Knights-Game,1. e4 e5 2. Nf3 Nc6 3. Nc3 Nf6 4. Bc4 1-0,256178732
1,Hallsand,White,ClimatBR,Winner,resignation,1119,681,1/86400,14/03/2020,21:29:52,Kings-Pawn-Opening,1. e4 e5 2. Nf3 d5 3. exd5 Nf6 4. Nc3 e4 5. Nd...,254576218
2,Hallsand,Black,Osvaldo937,Winner,time,1081,1000,1/86400,24/03/2020,20:45:41,Kings-Pawn-Opening,1. e4 e5 0-1,255870284
3,Hallsand,Black,Danielhost19,Winner,checkmate,1081,800,1/86400,19/03/2020,10:29:07,Queens-Pawn-Opening,1. d4 d5 2. e4 dxe4 3. Bb5+ c6 4. Ba4 Nf6 5. c...,255102942
4,Hallsand,White,Osvaldo937,Loser,time,1081,1000,1/86400,21/03/2020,14:34:25,Pirc-Defense,1. e4 d6 2. d4 c5 3. Nf3 Nf6 4. Nc3 Bg4 0-1,255393656
...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,Hallsand,White,Danielhost19,Winner,checkmate,1008,800,1/86400,19/02/2020,22:21:57,Center-Game,1. e4 e5 2. d4 d5 3. Nc3 Nf6 4. Nf3 exd4 5. Nx...,251985654
516,Hallsand,White,ClimatBR,Winner,time,1008,792,1/86400,15/02/2020,10:22:05,Scandinavian-Defense,1. e4 d5 2. Nc3 dxe4 3. Nxe4 f5 4. Nc3 1-0,251467584
517,Hallsand,Black,ClimatBR,Winner,resignation,800,1000,1/86400,29/01/2020,23:31:14,Kings-Fianchetto-Opening,1. g3 d5 2. e3 Nc6 3. c3 Nf6 4. Qc2 e5 5. Nf3 ...,249707158
518,Hallsand,Black,francis90706063,Winner,checkmate,606,601,600,27/02/2020,21:36:08,Kings-Pawn-Opening,1. e4 e5 2. Qf3 Nc6 3. Bc4 Nf6 4. h4 Nd4 5. Qd...,4542171589


In [78]:
pieceMoves

{'256178732': {'Q': 0,
  'N': 2,
  'R': 0,
  'K': 0,
  'B': 1,
  'p': 1,
  'O_O': 0,
  'O_O_O': 0,
  'x': 0,
  'check': 0},
 '254576218': {'Q': 3,
  'N': 9,
  'R': 5,
  'K': 0,
  'B': 8,
  'p': 8,
  'O_O': 1,
  'O_O_O': 0,
  'x': 11,
  'check': 3},
 '255870284': {'Q': 0,
  'N': 0,
  'R': 0,
  'K': 0,
  'B': 0,
  'p': 1,
  'O_O': 0,
  'O_O_O': 0,
  'x': 0,
  'check': 0},
 '255102942': {'Q': 1,
  'N': 4,
  'R': 4,
  'K': 0,
  'B': 5,
  'p': 6,
  'O_O': 1,
  'O_O_O': 0,
  'x': 6,
  'check': 5},
 '255393656': {'Q': 0,
  'N': 2,
  'R': 0,
  'K': 0,
  'B': 0,
  'p': 2,
  'O_O': 0,
  'O_O_O': 0,
  'x': 0,
  'check': 0},
 '253102534': {'Q': 0,
  'N': 2,
  'R': 0,
  'K': 0,
  'B': 0,
  'p': 1,
  'O_O': 0,
  'O_O_O': 0,
  'x': 0,
  'check': 0},
 '4448191402': {'Q': 3,
  'N': 9,
  'R': 2,
  'K': 1,
  'B': 5,
  'p': 6,
  'O_O': 0,
  'O_O_O': 1,
  'x': 9,
  'check': 6},
 '4448139568': {'Q': 3,
  'N': 4,
  'R': 8,
  'K': 0,
  'B': 4,
  'p': 7,
  'O_O': 0,
  'O_O_O': 1,
  'x': 9,
  'check': 4},
 '444

In [None]:
data[]

In [None]:
play.df

In [None]:
timeFormat = data[0]

In [None]:
data[0].split('\n')