In [None]:
import os
from dotenv import load_dotenv
import re
import math
import sys
import csv
import numpy as np
import pandas as pd
from decimal import Decimal, ROUND_HALF_UP
import time
import os.path

import berserk
from requests_oauthlib import OAuth2Session

# Read the Lichess API token from the project-level .env file.
load_dotenv(r"../.env")
api_key = os.environ.get("API_KEY")

# berserk authenticates with a bearer token via TokenSession.  The previous
# OAuth2Session(api_key) call passed the key as the OAuth *client id*, so no
# Authorization header was attached to any request.  When no key is configured
# fall back to berserk's default (unauthenticated) session instead of sending
# a bogus token.
session = berserk.TokenSession(api_key) if api_key else None
client = berserk.Client(session)

In [None]:
def clock_to_secs(time):
    """Convert a colon-separated clock string into a number of whole seconds.

    The fields are read as base-60 digits, most significant first, so both
    ``h:mm:ss`` (e.g. ``'0:03:00'`` -> 180) and ``mm:ss`` are accepted and the
    hour field may have any number of digits.

    Parameters
    ----------
    time : str
        Clock reading such as ``'1:02:03'``.  (The name shadows the ``time``
        module inside this function only; it is kept for compatibility with
        keyword callers.)

    Returns
    -------
    int
        Total seconds.

    Raises
    ------
    ValueError
        If any field is not an integer.
    """
    seconds = 0
    for field in time.split(':'):
        seconds = seconds * 60 + int(field)
    return seconds

def parse_moves(raw_moves):
    """Extract the engine evaluations and clock readings from a movetext line.

    Parameters
    ----------
    raw_moves : str
        Movetext containing ``[%eval ...]`` and/or ``[%clk ...]`` annotations.

    Returns
    -------
    dict
        ``{'clocks': [...], 'evals': [...]}`` where ``clocks`` holds each
        ``[%clk ...]`` value converted to seconds by ``clock_to_secs`` and
        ``evals`` holds each ``[%eval ...]`` value as the raw string found in
        the text.  Both lists are empty when no annotation is present.
    """
    # Raw strings: the previous non-raw patterns relied on invalid escape
    # sequences ('\[', '\%', '\ '), which newer Python versions warn about.
    evals = re.findall(r'\[%eval ([^\]]+)\]', raw_moves)
    clock_texts = re.findall(r'\[%clk ([^\]]+)\]', raw_moves)
    clocks = [clock_to_secs(clock_text) for clock_text in clock_texts]
    return {'clocks': clocks, 'evals': evals}

def parse_line(string):
    """Parse one line of a PGN file.

    Parameters
    ----------
    string : str
        A single line, with or without its trailing newline.

    Returns
    -------
    dict or None
        * For a tag line (starts with ``[``): ``{tag_name: value}`` where the
          value is the text between the first and the last double quote on the
          line.  Empty values (``[Tag ""]``) yield an empty string.
        * For a line starting with ``1``: the result of ``parse_moves``.
        * ``None`` for every other line, including tag lines that do not have
          the ``[Name "value"]`` shape.
    """
    if string.startswith('['):
        # Explicit character class: the former `A-z` range also matched the
        # punctuation that sits between 'Z' and 'a' in ASCII.
        header = re.match(r'\[([A-Za-z0-9_]+)[^"]*"(.*)"', string)
        if header is None:
            return None
        return {header.group(1): header.group(2)}
    if string.startswith('1'):
        return parse_moves(string)
    return None
    
def _rating_before_game(points, game_date):
    """Pick the rating a player held just before ``game_date``.

    ``points`` is a sequence of history entries exposing ``year``, ``month``,
    ``day`` and ``rating`` attributes, assumed to be ordered oldest first.
    ``game_date`` is a ``(year, month, day)`` tuple with a 1-based month.

    Returns 0 for an empty history, the rating of the most recent entry dated
    strictly before the game, or the oldest entry's rating when no entry
    predates the game.
    """
    if len(points) == 0:
        return 0
    for point in reversed(points):
        # Lichess documents the month of a rating-history entry as zero-based
        # (January == 0), whereas PGN dates are one-based: shift before comparing.
        if (point.year, point.month + 1, point.day) < game_date:
            return point.rating
    return points[0].rating


def add_ratings(game_dict, perfs, ret):
    """Add each player's rating per performance category at game time to ``ret``.

    For both players and every entry of ``perfs`` a key
    ``'<white|black>_<perf>_rating'`` is written into ``ret``:

    * the rating from the player's Lichess rating history that most recently
      precedes the game's ``Date`` (full year/month/day comparison),
    * ``0`` when the history for that category has no points,
    * ``None`` when the category is absent from the history, or when anything
      goes wrong (unparseable date, failed API call, unexpected payload); in
      that case *all* rating keys are ``None``.

    Parameters
    ----------
    game_dict : dict
        Parsed PGN headers; reads ``'Date'`` (``YYYY.MM.DD``), ``'White'`` and
        ``'Black'``.
    perfs : list of str
        Category names to record.  They are matched case-insensitively against
        the names in the rating history; the caller's spelling is used in the
        resulting keys.
    ret : dict
        Output record, modified in place.

    Returns
    -------
    None
    """
    players = ('white', 'black')

    # Start from "unknown" so every game produces the same set of keys no
    # matter which categories the API happens to return.
    for player in players:
        for perf in perfs:
            ret[player + '_' + perf + '_rating'] = None

    wanted = {perf.lower(): perf for perf in perfs}
    found = {}
    try:
        game_date = tuple(int(part) for part in game_dict['Date'].split('.'))

        histories = []
        for colour in ('White', 'Black'):
            histories.append(client.users.get_rating_history(game_dict[colour]))
            # Short pause after every request (presumably to stay under the
            # API rate limit).
            time.sleep(.25)

        for player, history in zip(players, histories):
            for entry in history:
                perf = wanted.get(entry['name'].lower())
                if perf is None:
                    continue
                found[player + '_' + perf + '_rating'] = _rating_before_game(
                    entry['points'], game_date)
    except Exception:
        # Best effort: a failed lookup must not abort the whole conversion.
        # The ratings simply stay None for this game.
        return

    ret.update(found)
    
def parse_game(game_dict):
    """Turn the parsed headers and movetext of one game into a flat record.

    Parameters
    ----------
    game_dict : dict
        Must contain ``'Event'``, ``'White'``, ``'Black'``, ``'WhiteElo'``,
        ``'BlackElo'``, ``'TimeControl'``, ``'Result'``, ``'Site'``,
        ``'clocks'`` and ``'evals'``; ``add_ratings`` additionally reads
        ``'Date'``.  Header values may be ``None``.

    Returns
    -------
    dict
        Keys, in order: ``perf`` (first known category whose name occurs in
        the event, matched case-insensitively, or ``None``), ``white_username``,
        ``black_username``, the per-category rating keys written by
        ``add_ratings``, ``white_rating``, ``black_rating``, ``clock_initial``
        and ``clock_increment`` (strings from ``'initial+increment'``, or
        ``None`` when the time control has no ``+``), ``winner`` (``0`` for
        ``1-0``, ``1`` for ``0-1``, ``0.5`` for ``1/2-1/2``, otherwise
        ``None``), ``clocks``, ``evals`` and ``id`` (trailing alphanumeric run
        of the site URL, or ``None``).
    """
    # 'ultrabullet' must stay ahead of 'bullet': the first substring hit wins.
    perfs = ['ultrabullet', 'bullet', 'blitz', 'rapid', 'classical', 'correspondence']
    ret = {}

    # Event names are capitalised in the PGN ("Rated Blitz game"), so compare
    # in lower case.
    event = (game_dict['Event'] or '').lower()
    ret['perf'] = next((perf for perf in perfs if perf in event), None)

    ret['white_username'] = game_dict['White']
    ret['black_username'] = game_dict['Black']

    add_ratings(game_dict, perfs, ret)

    ret['white_rating'] = game_dict['WhiteElo']
    ret['black_rating'] = game_dict['BlackElo']

    times = (game_dict['TimeControl'] or '').split('+')
    if len(times) > 1:
        ret['clock_initial'] = times[0]
        ret['clock_increment'] = times[1]
    else:
        ret['clock_initial'] = None
        ret['clock_increment'] = None

    # Explicit lookup instead of eval() on text read from the file; yields the
    # same values the arithmetic `1 - <white score>` produced.
    winner_by_result = {'1-0': 0, '0-1': 1, '1/2-1/2': 0.5}
    ret['winner'] = winner_by_result.get(game_dict['Result'])

    ret['clocks'] = game_dict['clocks']
    ret['evals'] = game_dict['evals']

    game_id = re.search(r'[0-9A-Za-z]+$', game_dict['Site'] or '')
    ret['id'] = game_id.group(0) if game_id else None

    return ret
    
def add_dic(complete, current):
    """Append the values of one record to a dict of column lists, in place.

    When ``complete`` is empty, every key of ``current`` starts a new
    one-element list.  Otherwise each value of ``current`` is appended to the
    list already stored under the same key (a key that is missing from
    ``complete`` raises ``KeyError``).
    """
    if not complete:
        # First record: create the columns.
        for column, value in current.items():
            complete[column] = [value]
        return

    for column, value in current.items():
        complete[column].append(value)

In [None]:
# Input PGN dump and output CSV.  Built with os.path.join so the notebook also
# runs on systems where the path separator is not a backslash.
PGN_PATH = os.path.join('..', 'data', 'lichess-july.pgn')
CSV_PATH = os.path.join('..', 'data', 'data2.csv')
# Number of parsed games collected in memory before they are written out.
BATCH_SIZE = 500

complete = {}
# Keys every game dict is guaranteed to have before it is parsed.  The last two
# are lower case because that is how parse_moves names them.
keys = ['Event', 'Site', 'Date', 'Round', 'White', 'Black', 'Result', 'UTCDate', 'UTCTime', 'WhiteElo', 'BlackElo',
        'WhiteRatingDiff', 'BlackRatingDiff', 'ECO', 'Opening', 'TimeControl', 'Termination', 'clocks', 'evals']
# True while everything parsed so far has been written to CSV_PATH.
saved = False


def _store_game(current, complete):
    """Parse the finished game `current` into `complete`; return True if it was kept.

    Games without any engine evaluation are skipped.
    """
    for key in keys:
        current.setdefault(key, None)
    if not current['evals']:
        return False
    add_dic(complete, parse_game(current))
    return True


def _save_batch(batch):
    """Write the column lists in `batch` to CSV_PATH, after any rows already there."""
    df = pd.DataFrame(batch)
    if os.path.isfile(CSV_PATH):
        previous = pd.read_csv(CSV_PATH)
        print(len(previous))  # progress: rows written before this batch
        # concat aligns on column names, so column order may differ per batch.
        df = pd.concat((previous, df), axis=0)
    df.to_csv(CSV_PATH, index=False)


with open(PGN_PATH) as f:
    current = {}
    for nextLine in f:
        parsed = parse_line(nextLine)
        if parsed is None:
            continue

        # An 'Event' tag opens a new game, so the one collected so far is done.
        if 'Event' in parsed and current:
            if _store_game(current, complete):
                saved = False
            current = {}
            if len(complete.get('perf', ())) >= BATCH_SIZE:
                _save_batch(complete)
                complete = {}
                saved = True

        current.update(parsed)

    # The last game in the file has no following 'Event' tag to trigger it.
    if current and _store_game(current, complete):
        saved = False

# Write whatever is left over from the final, partial batch.
if complete:
    _save_batch(complete)
    complete = {}
    saved = True