In [79]:
import json
import os
from difflib import SequenceMatcher
import shutil

In [71]:
def nameSimilar(name_a, name_b):
    """Report whether two names are close enough to be treated as the same.

    Similarity is the ``SequenceMatcher`` ratio of the two strings. A ratio
    above 0.8 counts as a match; matches whose ratio is below 0.9 are also
    printed to stdout so borderline pairings can be checked by eye.

    Args:
        name_a: First name to compare.
        name_b: Second name to compare.

    Returns:
        bool: True when the ratio is greater than 0.8, otherwise False.
    """
    score = SequenceMatcher(None, name_a, name_b).ratio()
    if score <= 0.8:
        return False
    if score < 0.9:
        # Borderline match: still accepted, but surfaced for manual review.
        print(name_a, name_b)
    return True

In [2]:
# List the entry names of every raw-data directory, one listing per source.
(circuit_files,
 constructor_files,
 driver_files,
 race_control_files,
 tfeed_files) = map(os.listdir, ('./Ergast_Data_Circuit',
                                 './Ergast_Data_Constructors',
                                 './Ergast_Data_Drivers',
                                 './LiveTiming_Data',
                                 './TFeed_Data'))

# The five listings bundled together, in the same order as above.
all_files = [circuit_files, constructor_files, driver_files,
             race_control_files, tfeed_files]

In [3]:
# Collect the circuit file names, leaving out any entry whose name contains
# 'ipynb' (presumably Jupyter checkpoint folders -- confirm).
filenames = []

for file in circuit_files:
    if 'ipynb' in file:
        continue
    filenames.append(file)

In [83]:
def getNumberOfLaps(fn):
    """Count the entries stored for the first driver of a timing-feed file.

    The file ``./TFeed_Data/<fn>`` is parsed as JSON. Only the first
    top-level key is inspected; the number of items stored under it is the
    result (presumably one item per lap -- confirm against the feed format).

    Args:
        fn: File name inside ``./TFeed_Data/``.

    Returns:
        int: Item count for the first top-level key, or 0 when the file
        holds no top-level keys.

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    with open('./TFeed_Data/' + fn, 'r') as handle:
        feed = json.load(handle)
    # Returning inside the loop means only the first driver is ever counted.
    for first_driver in feed:
        return len(feed[first_driver])
    return 0


def loadData(folder, fn):
    """Parse the JSON file located at ``folder + fn``.

    The two arguments are concatenated as-is, so ``folder`` must already end
    with a path separator.

    Args:
        folder: Directory prefix, including the trailing separator.
        fn: File name inside that directory.

    Returns:
        The decoded JSON content.
    """
    path = folder + fn
    with open(path, 'r') as source:
        return json.load(source)


def getAllData(df, fn):
    """Load the file ``fn`` from five data folders into a single dict.

    Args:
        df: Indexable collection of folder prefixes. Position 4 supplies
            "stats", 3 "race_control", 2 "driver_info", 1 "team_info" and
            0 "race_info".
        fn: File name to read from each folder.

    Returns:
        dict: The five keys listed above, each mapped to the content that
        ``loadData`` returns for the matching folder.
    """
    # Listed in load order (highest folder position first); the resulting
    # dict keeps the same key order.
    sources = (
        ("stats", 4),
        ("race_control", 3),
        ("driver_info", 2),
        ("team_info", 1),
        ("race_info", 0),
    )
    return {key: loadData(df[position], fn) for key, position in sources}


def getDriverId(tfeed_name, driver_info):
    """Find the driver id that corresponds to a timing-feed driver name.

    The surname is taken as the last whitespace-separated word of the last
    non-blank '.'-separated part of ``tfeed_name``, lower-cased (so
    "L. HAMILTON", "J.P. MONTOYA" and "ALONSO" all yield their surname).
    Each id in ``driver_info`` is compared by the part after its last '_'.

    An exact surname match anywhere in ``driver_info`` always wins; only
    when there is none is the first id accepted by ``nameSimilar`` used.

    Args:
        tfeed_name: Driver name as it appears in the timing feed.
        driver_info: Iterable of driver id strings (e.g. a dict keyed by id).

    Returns:
        str: The matching driver id, or '' when no id matches or the name
        contains no usable surname.
    """
    # Ignore blank segments so a missing or trailing '.' cannot raise
    # IndexError or leave an empty surname.
    segments = [part for part in tfeed_name.split('.') if part.strip()]
    if not segments:
        return ''
    last_name = segments[-1].lower().split()[-1]

    # Pair every id with the surname portion it is compared by.
    candidates = [(driver, driver.split('_')[-1]) for driver in driver_info]

    # Pass 1: exact matches take priority over any fuzzy match.
    for driver, surname in candidates:
        if surname == last_name:
            return driver

    # Pass 2: fall back to the first sufficiently similar surname.
    for driver, surname in candidates:
        if nameSimilar(last_name, surname):
            return driver
    return ''

In [84]:
# Folder prefixes for the raw inputs. getAllData reads them by position:
# 0 race info, 1 team info, 2 driver info, 3 race control, 4 stats.
data_folders = ['./Ergast_Data_Circuit/',
                './Ergast_Data_Constructors/',
                './Ergast_Data_Drivers/',
                './LiveTiming_Data/',
                './TFeed_Data/']

# Every entry of ./DataSet names one race: its inputs are '<entry>.json' in
# each folder above, and one JSON file per lap is written into the entry.
for folder in os.listdir('./DataSet'):
    if 'ipynb' in folder:
        continue
    race_dir = './DataSet/' + folder
    fn = folder + '.json'
    try:
        nr_laps = getNumberOfLaps(fn)
    except FileNotFoundError:
        # No timing feed for this race: discard its output folder.
        shutil.rmtree(race_dir, ignore_errors=True)
        continue
    data = getAllData(data_folders, fn)

    # Resolve the timing-feed names once per race instead of once per lap.
    # Doing it before any file is written also keeps the discard decision
    # tied to this race only, even when it has no laps or no drivers.
    driver_ids = {driver: getDriverId(driver, data["driver_info"])
                  for driver in data["stats"]}
    if '' in driver_ids.values():
        # At least one driver could not be matched: discard the race.
        shutil.rmtree(race_dir, ignore_errors=True)
        continue

    for lap in range(1, nr_laps + 1):
        lap_key = str(lap)
        # NOTE(review): raises KeyError if a driver has no entry for this
        # lap -- confirm whether the feed can contain shorter lap lists.
        lap_stats = {driver_id: data["stats"][driver][lap_key]
                     for driver, driver_id in driver_ids.items()}
        lap_data = {
            "stats": lap_stats,
            # Laps without race-control messages get an empty list.
            "rc_messages": data["race_control"].get(lap_key, []),
            "driver_info": data["driver_info"],
            "team_info": data["team_info"],
            "race_info": data["race_info"],
        }
        lap_fn = f'{race_dir}/lap{lap:02}.json'
        with open(lap_fn, 'w') as f:
            json.dump(lap_data, f)