# Packages and Helper Functions

In [272]:
import warnings
warnings.filterwarnings("ignore")
import os

import pandas as pd
import numpy as np
import polars as pl
import scipy.stats as stats
import seaborn as sns
import statsmodels.formula.api as smf
import statsmodels.api as sm
import matplotlib.pyplot as plt
import math

import requests
import re
import pdfplumber

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('display.float_format', lambda x: "%.4f" % x)
# pd.options.plotting.backend = "plotly"

plt.style.use('ggplot')
sns.set_style('darkgrid')

# Reading in Data

## Pulling PDFs

In [273]:
isu_year = 2122
isu_event = "owg2022"
dir_name = f"./{isu_event}"
isu_url = "https://results.isu.org/results/season2122/owg2022/"
OUTPUT_CSV = dir_name + f"/{isu_event}.csv"

# Get the HTML content of the page
response = requests.get(isu_url)
html_content = response.text

# find all pdf extension files
regex_pattern = r'href=([^\s>]+\.pdf)'

# Find all matches
matches = re.findall(regex_pattern, html_content, re.IGNORECASE)

print(matches)
print(len(matches))

['FSKMSINGLES-----------QUAL000100--_JudgesDetailsperSkater.pdf', 'FSKMSINGLES-----------FNL-000100--_JudgesDetailsperSkater.pdf', 'FSKWSINGLES-----------QUAL000100--_JudgesDetailsperSkater.pdf', 'FSKWSINGLES-----------FNL-000100--_JudgesDetailsperSkater.pdf', 'FSKXPAIRS-------------QUAL000100--_JudgesDetailsperSkater.pdf', 'FSKXPAIRS-------------FNL-000100--_JudgesDetailsperSkater.pdf', 'FSKXICEDANCE----------QUAL000100--_JudgesDetailsperSkater.pdf', 'FSKXICEDANCE----------FNL-000100--_JudgesDetailsperSkater.pdf', 'FSKXTEAM--------------------------_EntryListbyEvent.pdf', 'FSKXTEAM--------------QUAL0001MN--_JudgesDetailsperSkater.pdf', 'FSKXTEAM--------------FNL-0001MN--_JudgesDetailsperSkater.pdf', 'FSKXTEAM--------------QUAL0002LD--_JudgesDetailsperSkater.pdf', 'FSKXTEAM--------------FNL-0002LD--_JudgesDetailsperSkater.pdf', 'FSKXTEAM--------------QUAL0003PR--_JudgesDetailsperSkater.pdf', 'FSKXTEAM--------------FNL-0003PR--_JudgesDetailsperSkater.pdf', 'FSKXTEAM--------------QUAL000

In [274]:
# find all score sheets in the pdf files
score_sheet_file_names = [m for m in matches if "judge" in m.lower()]
score_sheet_urls = [isu_url + m for m in score_sheet_file_names]
print(score_sheet_urls)
print(len(score_sheet_urls))

['https://results.isu.org/results/season2122/owg2022/FSKMSINGLES-----------QUAL000100--_JudgesDetailsperSkater.pdf', 'https://results.isu.org/results/season2122/owg2022/FSKMSINGLES-----------FNL-000100--_JudgesDetailsperSkater.pdf', 'https://results.isu.org/results/season2122/owg2022/FSKWSINGLES-----------QUAL000100--_JudgesDetailsperSkater.pdf', 'https://results.isu.org/results/season2122/owg2022/FSKWSINGLES-----------FNL-000100--_JudgesDetailsperSkater.pdf', 'https://results.isu.org/results/season2122/owg2022/FSKXPAIRS-------------QUAL000100--_JudgesDetailsperSkater.pdf', 'https://results.isu.org/results/season2122/owg2022/FSKXPAIRS-------------FNL-000100--_JudgesDetailsperSkater.pdf', 'https://results.isu.org/results/season2122/owg2022/FSKXICEDANCE----------QUAL000100--_JudgesDetailsperSkater.pdf', 'https://results.isu.org/results/season2122/owg2022/FSKXICEDANCE----------FNL-000100--_JudgesDetailsperSkater.pdf', 'https://results.isu.org/results/season2122/owg2022/FSKXTEAM-----------

In [275]:
# create data store
# if dir_name exists, skip this
if os.path.isdir(dir_name):
    print(f"data directory already exists: {dir_name}")
else:
    print(f"making data directory: {dir_name}")
    os.makedirs(dir_name, exist_ok=True)

    for url in score_sheet_urls:
        response = requests.get(url)
        filename = url.split('/')[-1]
        filepath = os.path.join(dir_name, filename)
        with open(filepath, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded: {filename}")

data directory already exists: ./owg2022


## Regex Parsing of PDFs

Below is testing on 1 file. For complete pipeline, go to Applying Data Pipeline section

In [276]:
PDF_PATH = dir_name + "/FSKMSINGLES-----------QUAL000100--_JudgesDetailsperSkater.pdf"

# use pdfplumber to get the complete text string of score pdf
pages_text = []
with pdfplumber.open(PDF_PATH) as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if text:
            pages_text.append(text)

full_text = "\n".join(pages_text)

# regex pattern for finding each skater 
skater_header_pattern = re.compile(
    r"""
    ^(\d+)\s+                  # 1 rank
    (.+?)\s+                   # 2 name
    ([A-Z]{3})\s+              # 3 NOC code
    (\d+)\s+                   # 4 starting num
    (\d+\.\d{2})\s+            # 5 total segment score
    (\d+\.\d{2})\s+            # 6 total element score
    (\d+\.\d{2})\s+            # 7 total program score
    (-?\d+\.\d{2})$            # 8 total deductions
    """,
    re.VERBOSE | re.MULTILINE
)

# regex pattern for finding each skater's element
element_pattern = re.compile(
    r"""
    ^\s*(\d+)\s+                 # 1 element number
    ([A-Za-z0-9+!*<>q]+)\s+      # 2 element code
    (?:(\S+)\s+)?                # 3 optional info column (x, q, !, etc.)
    ([\d.]+)\s+                  # 4 base value
    (?:\b(x)\b\s+)?              # 5 optional extra points column (x)
    ([\-\d.]+)\s+                # 6 GOE
    ((?:(?:-?\d+)|-)(?:\s+(?:(?:-?\d+)|-)){8}\s+)  # 7 judges scores 
    ([\d.]+)$                    # 8 final score
    """,
    re.VERBOSE | re.MULTILINE
)

# regex pattern for finding each skater's program component
program_components_pattern = re.compile(
    r"""
    ^(Skating\s+Skills|Transitions|Performance|Composition|Interpretation\s+of\s+the\s+Music)\s+  # 1 component
    (\d+\.\d{2})\s+                     # 2 factor
    ((?:\d+\.\d{2}\s+){9})              # 3 judge scores
    (\d+\.\d{2})$                       # 4 final score
    """,
    re.VERBOSE | re.MULTILINE
)


In [277]:
# find all skater header matches
matches = list(skater_header_pattern.finditer(full_text))

# for each skater header match, get the complete text between next match
skater_blocks = []
for i, m in enumerate(matches):
    start = m.start()
    end = matches[i + 1].start() if i + 1 < len(matches) else len(full_text)
    skater_blocks.append((m, full_text[start:end]))

# create elements df and program components df row by row
elements_rows = []
program_rows = []

for header, block in skater_blocks:
    # pull skater info
    rank = int(header.group(1))
    name = header.group(2).title()
    noc = header.group(3)
    starting_number = int(header.group(4))
    tss = float(header.group(5))
    tes = float(header.group(6))
    tpcs = float(header.group(7))
    deductions = float(header.group(8))

    for m in element_pattern.finditer(block):
        row = {
            "rank": rank,
            "name": name,
            "noc": noc,
            "starting_number": starting_number,
            "tss": tss,
            "tes": tes,
            "tpcs": tpcs,
            "deductions": deductions,
            "element_no": int(m.group(1)),
            "element": m.group(2),
            "info": m.group(3),
            "base_value": float(m.group(4)),
            "extra_points": 1 if m.group(5) else 0,
            "goe": float(m.group(6)),
            "final_score": float(m.group(8)),
        }

        judges = m.group(7).split()
        for i, j in enumerate(judges):
            try:
                row[f"J{i+1}"] = int(j)
            except ValueError:
                row[f"J{i+1}"] = 0
        
        elements_rows.append(row)
    
    for m in program_components_pattern.finditer(block):
        row = {
            "rank": rank,
            "name": name,
            "noc": noc,
            "starting_number": starting_number,
            "tss": tss,
            "tes": tes,
            "tpcs": tpcs,
            "deductions": deductions,
            "program_component": m.group(1),
            "factor": float(m.group(2)),
            "final_score": float(m.group(4)),
        }

        judges = m.group(3).split()
        for i, j in enumerate(judges):
            try:
                row[f"J{i+1}"] = float(j)
            except ValueError:
                row[f"J{i+1}"] = 0
        
        program_rows.append(row)

element_df = pd.DataFrame(elements_rows)
element_df.sort_values(["rank", "element_no"], inplace=True)

program_df = pd.DataFrame(program_rows)
program_df.sort_values(["rank", "program_component"], inplace=True)


In [278]:
print(full_text)

Capital Indoor Stadium Figure Skating
首都体育馆 花样滑冰 / Patinage artistique
Palais omnisports de la capitale Men Single Skating
男子单人滑 / Patinage individuel hommes
TUE 8 FEB 2022 Short Program
短节目 / Programme court
Judges Details per Skater
裁判员对每位运动员的详细分数 / Notation détaillée des juges par patineur
Total Total Total Program
NOC Starting Total
Rank Name Segment Element Component Score
Code Number Deductions
Score Score (factored)
1 CHEN Nathan USA 28 113.97 65.98 47.99 0.00
# Executed Elements
FSKMSINGLES-----------QUAL000100--_77B v1.0 Report Created TUE 8 FEB 2022 13:33
ofnI Base Scores of GOE J1 J2 J3 J4 J5 J6 J7 J8 J9 Ref.
Value Panel
1 4F 11.00 4.40 4 4 5 4 4 4 4 4 4 15.40
2 3A 8.00 2.29 3 3 3 3 4 2 2 2 4 10.29
3 CCSp4 3.20 1.05 4 4 3 4 3 3 3 3 3 4.25
4 4Lz+3T 17.27 x 3.94 4 4 4 3 4 2 2 3 4 21.21
5 StSq4 3.90 1.95 5 5 5 5 5 5 4 5 5 5.85
6 FSSp4 3.00 1.03 3 4 4 3 4 3 3 3 4 4.03
7 CCoSp4 3.50 1.45 4 4 4 5 4 4 4 5 4 4.95
49.87 65.98
Program Components Factor
Skating Skills 1.00 9.75 9.25 9.

### Sense checking element df

In [279]:
print(element_df.shape)

(203, 24)


In [280]:
# in the short, expect each person to do 7 elements
element_df["name"].value_counts()

name
Chen Nathan             7
Vasiljevs Deniss        7
Selevko Aleksandr       7
Lee Sihyeong            7
Bychenko Alexei         7
Brezina Michal          7
Britschgi Lukas         7
Mozalev Andrei          7
Shmuratko Ivan          7
Milyukov Konstantin     7
Majorov Nikolaj         7
Carrillo Donovan        7
Litvintsev Vladimir     7
Kerry Brendan           7
Kondratiuk Mark         7
Kagiyama Yuma           7
Siao Him Fa Adam        7
Rizzo Matteo            7
Grassl Daniel           7
Jin Boyang              7
Aymoz Kevin             7
Messing Keegan          7
Hanyu Yuzuru            7
Semenenko Evgeni        7
Brown Jason             7
Kvitelashvili Morisi    7
Cha Junhwan             7
Uno Shoma               7
Sadovsky Roman          7
Name: count, dtype: int64

In [281]:
# expect 29 skaters in the short
len(element_df["name"].unique())

29

In [282]:
element_df.head()

Unnamed: 0,rank,name,noc,starting_number,tss,tes,tpcs,deductions,element_no,element,info,base_value,extra_points,goe,final_score,J1,J2,J3,J4,J5,J6,J7,J8,J9
0,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,1,4F,,11.0,0,4.4,15.4,4,4,5,4,4,4,4,4,4
1,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,2,3A,,8.0,0,2.29,10.29,3,3,3,3,4,2,2,2,4
2,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,3,CCSp4,,3.2,0,1.05,4.25,4,4,3,4,3,3,3,3,3
3,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,4,4Lz+3T,,17.27,1,3.94,21.21,4,4,4,3,4,2,2,3,4
4,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,5,StSq4,,3.9,0,1.95,5.85,5,5,5,5,5,5,4,5,5


### Sense checking program df

In [283]:
print(program_df.shape)

(145, 20)


In [284]:
# expect 5 program components per skater
program_df["name"].value_counts()

name
Chen Nathan             5
Vasiljevs Deniss        5
Selevko Aleksandr       5
Lee Sihyeong            5
Bychenko Alexei         5
Brezina Michal          5
Britschgi Lukas         5
Mozalev Andrei          5
Shmuratko Ivan          5
Milyukov Konstantin     5
Majorov Nikolaj         5
Carrillo Donovan        5
Litvintsev Vladimir     5
Kerry Brendan           5
Kondratiuk Mark         5
Kagiyama Yuma           5
Siao Him Fa Adam        5
Rizzo Matteo            5
Grassl Daniel           5
Jin Boyang              5
Aymoz Kevin             5
Messing Keegan          5
Hanyu Yuzuru            5
Semenenko Evgeni        5
Brown Jason             5
Kvitelashvili Morisi    5
Cha Junhwan             5
Uno Shoma               5
Sadovsky Roman          5
Name: count, dtype: int64

In [285]:
(
    program_df["name"].value_counts()
    .reset_index()
    ["count"].unique()
)

array([5])

In [286]:
program_df.head(10)

Unnamed: 0,rank,name,noc,starting_number,tss,tes,tpcs,deductions,program_component,factor,final_score,J1,J2,J3,J4,J5,J6,J7,J8,J9
3,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,Composition,1.0,9.68,9.75,9.5,10.0,9.75,9.75,9.75,9.25,9.5,9.75
4,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,Interpretation of the Music,1.0,9.64,10.0,9.75,10.0,9.75,9.5,9.5,9.5,9.5,9.5
2,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,Performance,1.0,9.71,10.0,9.5,9.75,10.0,9.5,9.75,9.5,9.5,10.0
0,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,Skating Skills,1.0,9.57,9.75,9.25,9.5,9.75,9.5,9.75,9.5,9.5,9.5
1,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,Transitions,1.0,9.39,9.75,9.25,9.25,9.5,9.5,9.5,9.25,9.0,9.5
8,2,Kagiyama Yuma,JPN,27,108.12,60.91,47.21,0.0,Composition,1.0,9.46,9.5,9.25,9.5,9.5,9.5,9.5,9.5,9.75,9.25
9,2,Kagiyama Yuma,JPN,27,108.12,60.91,47.21,0.0,Interpretation of the Music,1.0,9.46,9.25,9.25,9.75,9.5,9.5,9.5,9.5,9.75,9.25
7,2,Kagiyama Yuma,JPN,27,108.12,60.91,47.21,0.0,Performance,1.0,9.54,9.5,9.5,9.75,9.5,9.5,9.5,9.5,10.0,9.5
5,2,Kagiyama Yuma,JPN,27,108.12,60.91,47.21,0.0,Skating Skills,1.0,9.5,9.5,9.5,9.75,9.25,9.25,9.5,9.5,10.0,9.5
6,2,Kagiyama Yuma,JPN,27,108.12,60.91,47.21,0.0,Transitions,1.0,9.25,9.25,9.25,9.5,9.25,9.0,9.25,9.25,9.5,9.0


In [287]:
program_df.tail(10)

Unnamed: 0,rank,name,noc,starting_number,tss,tes,tpcs,deductions,program_component,factor,final_score,J1,J2,J3,J4,J5,J6,J7,J8,J9
138,28,Selevko Aleksandr,EST,4,65.29,28.79,36.5,0.0,Composition,1.0,7.5,7.25,7.25,7.5,6.75,7.5,8.0,7.75,8.0,7.25
139,28,Selevko Aleksandr,EST,4,65.29,28.79,36.5,0.0,Interpretation of the Music,1.0,7.43,7.0,7.25,7.5,6.75,7.5,8.0,7.75,7.75,7.25
137,28,Selevko Aleksandr,EST,4,65.29,28.79,36.5,0.0,Performance,1.0,7.21,6.75,7.0,7.25,7.0,7.25,7.75,7.25,7.5,7.25
135,28,Selevko Aleksandr,EST,4,65.29,28.79,36.5,0.0,Skating Skills,1.0,7.36,7.0,7.0,7.5,7.25,7.25,7.75,7.5,7.5,7.5
136,28,Selevko Aleksandr,EST,4,65.29,28.79,36.5,0.0,Transitions,1.0,7.0,6.75,6.75,7.0,6.25,7.0,7.5,7.25,7.5,6.75
143,29,Sadovsky Roman,CAN,1,62.77,24.99,37.78,0.0,Composition,1.0,7.68,7.25,7.75,7.5,8.0,7.5,8.0,7.5,7.5,8.0
144,29,Sadovsky Roman,CAN,1,62.77,24.99,37.78,0.0,Interpretation of the Music,1.0,7.64,7.25,7.75,7.5,8.25,7.5,7.75,7.25,7.5,8.25
142,29,Sadovsky Roman,CAN,1,62.77,24.99,37.78,0.0,Performance,1.0,7.36,6.75,7.5,7.25,8.0,6.75,7.5,7.0,7.5,8.0
140,29,Sadovsky Roman,CAN,1,62.77,24.99,37.78,0.0,Skating Skills,1.0,7.71,7.5,7.75,7.75,7.75,7.75,7.75,7.5,7.75,8.0
141,29,Sadovsky Roman,CAN,1,62.77,24.99,37.78,0.0,Transitions,1.0,7.39,7.0,7.5,7.25,7.75,7.5,7.5,7.0,7.25,7.75


In [288]:
program_df["starting_number"].value_counts().reset_index().sort_values(by="starting_number")

Unnamed: 0,starting_number,count
28,1,5
10,2,5
6,3,5
2,4,5
19,5,5
12,6,5
3,7,5
11,8,5
8,9,5
4,10,5


## Merging Element Components with Program Components

In [289]:
element_df_renamed = (
                        element_df
                            .set_index(
                                ['rank', 'name', 'noc', 'starting_number', 'tss', 'tes', 'tpcs', 'deductions']
                                )
                    ) 

program_df_renamed = (
                        program_df
                            .set_index(
                                ['rank', 'name', 'noc', 'starting_number', 'tss', 'tes', 'tpcs', 'deductions']
                                )
                    ) 

data_df = pd.concat([element_df_renamed, program_df_renamed])

cols_at_end = ['J1', 'J2', 'J3', 'J4', 'J5', 'J6', 'J7', 'J8', 'J9']
cols_not_at_end = data_df.columns.difference(cols_at_end).to_list()
new_column_order = cols_not_at_end + cols_at_end
data_df = data_df[new_column_order]

data_df = (
    data_df
        .reset_index()
        .sort_values(by=['rank', 'name', 'noc', 'starting_number', 'tss', 'tes', 'tpcs', 'deductions'])
        .reset_index(drop=True)
)

## Sense Checks on Final data_df

In [290]:
data_df.shape

(348, 26)

In [291]:
# each participant should have 7 element + 5 program rows = 12 rows total
data_df["name"].value_counts().reset_index()["count"].unique()

array([12])

In [292]:
# there should be 29 participants
len(data_df["name"].unique())

29

In [293]:
data_df.head(12)

Unnamed: 0,rank,name,noc,starting_number,tss,tes,tpcs,deductions,base_value,element,element_no,extra_points,factor,final_score,goe,info,program_component,J1,J2,J3,J4,J5,J6,J7,J8,J9
0,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,11.0,4F,1.0,0.0,,15.4,4.4,,,4.0,4.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0
1,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,8.0,3A,2.0,0.0,,10.29,2.29,,,3.0,3.0,3.0,3.0,4.0,2.0,2.0,2.0,4.0
2,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,3.2,CCSp4,3.0,0.0,,4.25,1.05,,,4.0,4.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0
3,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,17.27,4Lz+3T,4.0,1.0,,21.21,3.94,,,4.0,4.0,4.0,3.0,4.0,2.0,2.0,3.0,4.0
4,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,3.9,StSq4,5.0,0.0,,5.85,1.95,,,5.0,5.0,5.0,5.0,5.0,5.0,4.0,5.0,5.0
5,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,3.0,FSSp4,6.0,0.0,,4.03,1.03,,,3.0,4.0,4.0,3.0,4.0,3.0,3.0,3.0,4.0
6,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,3.5,CCoSp4,7.0,0.0,,4.95,1.45,,,4.0,4.0,4.0,5.0,4.0,4.0,4.0,5.0,4.0
7,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,,,,,1.0,9.68,,,Composition,9.75,9.5,10.0,9.75,9.75,9.75,9.25,9.5,9.75
8,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,,,,,1.0,9.64,,,Interpretation of the Music,10.0,9.75,10.0,9.75,9.5,9.5,9.5,9.5,9.5
9,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,,,,,1.0,9.71,,,Performance,10.0,9.5,9.75,10.0,9.5,9.75,9.5,9.5,10.0


## Applying Data Pipeline to All Score Files

### Helper Functions

In [294]:
skater_header_pattern = re.compile(
        r"""
        ^(\d+)\s+                  # 1 rank
        (.+?)\s+                   # 2 name
        ([A-Z]{3})\s+              # 3 NOC code
        (\d+)\s+                   # 4 starting num
        (\d+\.\d{2})\s+            # 5 total segment score
        (\d+\.\d{2})\s+            # 6 total element score
        (\d+\.\d{2})\s+            # 7 total program score
        (-?\d+\.\d{2})$            # 8 total deductions
        """,
        re.VERBOSE | re.MULTILINE
    )

element_pattern = re.compile(
    r"""
    ^\s*(\d+)\s+                 # 1 element number
    ([A-Za-z0-9+!*<>q]+)\s+      # 2 element code
    (?:(\S+)\s+)?                # 3 optional info column (x, q, !, etc.)
    ([\d.]+)\s+                  # 4 base value
    (?:\b(x)\b\s+)?              # 5 optional extra points column (x)
    ([\-\d.]+)\s+                # 6 GOE
    ((?:(?:-?\d+)|-)(?:\s+(?:(?:-?\d+)|-)){8}\s+)  # 7 judges scores 
    ([\d.]+)$                    # 8 final score
    """,
    re.VERBOSE | re.MULTILINE
)

program_components_pattern = re.compile(
    r"""
    ^(Skating\s+Skills|Transitions|Performance|Composition|Interpretation\s+of\s+the\s+Music)\s+  # 1 component
    (\d+\.\d{2})\s+                     # 2 factor
    ((?:\d+\.\d{2}\s+){9})              # 3 judge scores
    (\d+\.\d{2})$                       # 4 final score
    """,
    re.VERBOSE | re.MULTILINE
)

In [295]:
def _get_full_pdf_text(pdf_path):
    pages_text = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                pages_text.append(text)

    full_text = "\n".join(pages_text)
    # full_text = full_text.replace(" x ", " ")

    return full_text

def _get_skater_blocks(full_text):
    matches = list(skater_header_pattern.finditer(full_text))
    skater_blocks = []
    for i, m in enumerate(matches):
        start = m.start()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(full_text)
        skater_blocks.append((m, full_text[start:end]))
    return skater_blocks

def _process_skater_block_element(header, block):
    # read header
    rank = int(header.group(1))
    name = header.group(2).title()
    noc = header.group(3)
    starting_number = int(header.group(4))
    tss = float(header.group(5))
    tes = float(header.group(6))
    tpcs = float(header.group(7))
    deductions = float(header.group(8))

    elements_rows = []
    for m in element_pattern.finditer(block):
        row = {
            "rank": rank,
            "name": name,
            "noc": noc,
            "starting_number": starting_number,
            "tss": tss,
            "tes": tes,
            "tpcs": tpcs,
            "deductions": deductions,
            "element_no": int(m.group(1)),
            "element": m.group(2),
            "info": m.group(3),
            "base_value": float(m.group(4)),
            "extra_points": 1 if m.group(5) else 0,
            "goe": float(m.group(6)),
            "final_score": float(m.group(8)),
        }

        judges = m.group(7).split()
        for i, j in enumerate(judges):
            try:
                row[f"J{i+1}"] = int(j)
            except ValueError:
                row[f"J{i+1}"] = 0
        
        elements_rows.append(row)

    return elements_rows

def _process_skater_block_program(header, block):
    # read header
    rank = int(header.group(1))
    name = header.group(2).title()
    noc = header.group(3)
    starting_number = int(header.group(4))
    tss = float(header.group(5))
    tes = float(header.group(6))
    tpcs = float(header.group(7))
    deductions = float(header.group(8))

    program_rows = []
    for m in program_components_pattern.finditer(block):
        row = {
            "rank": rank,
            "name": name,
            "noc": noc,
            "starting_number": starting_number,
            "tss": tss,
            "tes": tes,
            "tpcs": tpcs,
            "deductions": deductions,
            "program_component": m.group(1),
            "factor": float(m.group(2)),
            "final_score": float(m.group(4)),
        }

        judges = m.group(3).split()
        for i, j in enumerate(judges):
            try:
                row[f"J{i+1}"] = float(j)
            except ValueError:
                row[f"J{i+1}"] = 0
        
        program_rows.append(row)

    return program_rows

def parsing_fsk_score_sheet(pdf_path):
    full_text = _get_full_pdf_text(pdf_path)
    skater_blocks = _get_skater_blocks(full_text)

    elements_rows = []
    program_rows = []

    for header, block in skater_blocks:
        processed_rows = _process_skater_block_element(header, block)
        elements_rows.extend(processed_rows)
        
        processed_rows = _process_skater_block_program(header, block)
        program_rows.extend(processed_rows)

    element_df = pd.DataFrame(elements_rows)
    element_df.sort_values(["rank", "element_no"], inplace=True)

    program_df = pd.DataFrame(program_rows)
    program_df.sort_values(["rank", "program_component"], inplace=True)

    return element_df, program_df

def _add_file_features(pdf_path, data_df):
    is_short_program = 1 if "QUAL" in pdf_path else 0
    data_df["is_short_program"] = is_short_program

    category = "men" if "FSKM" in pdf_path else "women" if "FSKW" in pdf_path else "pairs" if "PAIRS" in pdf_path else ""
    event_type = "team" if "TEAM" in pdf_path else "individual"
    if event_type == "team":
        category = "men" if "MN" in pdf_path else "women" if "LD" in pdf_path else "pairs"
    data_df["category"] = category
    data_df["event_type"] = event_type

    return data_df

def get_fsk_df(pdf_path):
    element_df, program_df = parsing_fsk_score_sheet(pdf_path)
    element_df_renamed = (
                        element_df
                            .set_index(
                                ['rank', 'name', 'noc', 'starting_number', 'tss', 'tes', 'tpcs', 'deductions']
                                )
                    ) 

    program_df_renamed = (
                            program_df
                                .set_index(
                                    ['rank', 'name', 'noc', 'starting_number', 'tss', 'tes', 'tpcs', 'deductions']
                                    )
                        ) 

    data_df = pd.concat([element_df_renamed, program_df_renamed])
    data_df = data_df.assign(
            year = isu_year,
            event = isu_event,
            is_element = lambda x: (~x.element_no.isna()).astype(int)
        )
    cols_at_end = ['J1', 'J2', 'J3', 'J4', 'J5', 'J6', 'J7', 'J8', 'J9']
    cols_not_at_end = data_df.columns.difference(cols_at_end).to_list()
    new_column_order = cols_not_at_end + cols_at_end
    data_df = data_df[new_column_order]

    data_df = (
        data_df
            .reset_index()
            .sort_values(by=['rank', 'name', 'noc', 'starting_number', 'tss', 'tes', 'tpcs', 'deductions'])
            .reset_index(drop=True)
    )
    data_df = _add_file_features(pdf_path, data_df)
    
    return data_df

### Testing Helpers

In [296]:
data_df = get_fsk_df(PDF_PATH)
data_df.head(30)

Unnamed: 0,rank,name,noc,starting_number,tss,tes,tpcs,deductions,base_value,element,element_no,event,extra_points,factor,final_score,goe,info,is_element,program_component,year,J1,J2,J3,J4,J5,J6,J7,J8,J9,is_short_program,category,event_type
0,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,11.0,4F,1.0,owg2022,0.0,,15.4,4.4,,1,,2122,4.0,4.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0,1,men,individual
1,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,8.0,3A,2.0,owg2022,0.0,,10.29,2.29,,1,,2122,3.0,3.0,3.0,3.0,4.0,2.0,2.0,2.0,4.0,1,men,individual
2,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,3.2,CCSp4,3.0,owg2022,0.0,,4.25,1.05,,1,,2122,4.0,4.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,1,men,individual
3,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,17.27,4Lz+3T,4.0,owg2022,1.0,,21.21,3.94,,1,,2122,4.0,4.0,4.0,3.0,4.0,2.0,2.0,3.0,4.0,1,men,individual
4,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,3.9,StSq4,5.0,owg2022,0.0,,5.85,1.95,,1,,2122,5.0,5.0,5.0,5.0,5.0,5.0,4.0,5.0,5.0,1,men,individual
5,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,3.0,FSSp4,6.0,owg2022,0.0,,4.03,1.03,,1,,2122,3.0,4.0,4.0,3.0,4.0,3.0,3.0,3.0,4.0,1,men,individual
6,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,3.5,CCoSp4,7.0,owg2022,0.0,,4.95,1.45,,1,,2122,4.0,4.0,4.0,5.0,4.0,4.0,4.0,5.0,4.0,1,men,individual
7,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,,,,owg2022,,1.0,9.68,,,0,Composition,2122,9.75,9.5,10.0,9.75,9.75,9.75,9.25,9.5,9.75,1,men,individual
8,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,,,,owg2022,,1.0,9.64,,,0,Interpretation of the Music,2122,10.0,9.75,10.0,9.75,9.5,9.5,9.5,9.5,9.5,1,men,individual
9,1,Chen Nathan,USA,28,113.97,65.98,47.99,0.0,,,,owg2022,,1.0,9.71,,,0,Performance,2122,10.0,9.5,9.75,10.0,9.5,9.75,9.5,9.5,10.0,1,men,individual


In [297]:
data_df[lambda x: x.name == "Lee Sihyeong"]

Unnamed: 0,rank,name,noc,starting_number,tss,tes,tpcs,deductions,base_value,element,element_no,event,extra_points,factor,final_score,goe,info,is_element,program_component,year,J1,J2,J3,J4,J5,J6,J7,J8,J9,is_short_program,category,event_type
312,27,Lee Sihyeong,KOR,7,65.69,30.75,35.94,-1.0,9.7,4S,1.0,owg2022,0.0,,7.21,-2.49,,1,,2122,-3.0,-3.0,-3.0,-2.0,-3.0,-1.0,-2.0,-3.0,-2.0,1,men,individual
313,27,Lee Sihyeong,KOR,7,65.69,30.75,35.94,-1.0,8.0,3Aq,2.0,owg2022,0.0,,6.86,-1.14,q,1,,2122,-1.0,-2.0,-1.0,0.0,-2.0,-1.0,-3.0,-2.0,-1.0,1,men,individual
314,27,Lee Sihyeong,KOR,7,65.69,30.75,35.94,-1.0,3.0,FSSp4,3.0,owg2022,0.0,,3.43,0.43,,1,,2122,1.0,3.0,1.0,0.0,2.0,2.0,1.0,1.0,2.0,1,men,individual
315,27,Lee Sihyeong,KOR,7,65.69,30.75,35.94,-1.0,5.19,3Lz<+COMBO,4.0,owg2022,1.0,,2.83,-2.36,<,1,,2122,-5.0,-5.0,-5.0,-5.0,-5.0,-5.0,-5.0,-5.0,-5.0,1,men,individual
316,27,Lee Sihyeong,KOR,7,65.69,30.75,35.94,-1.0,2.6,StSq2,5.0,owg2022,0.0,,2.9,0.3,,1,,2122,2.0,1.0,0.0,1.0,2.0,1.0,1.0,1.0,1.0,1,men,individual
317,27,Lee Sihyeong,KOR,7,65.69,30.75,35.94,-1.0,3.2,CCSp4,6.0,owg2022,0.0,,3.57,0.37,,1,,2122,1.0,3.0,0.0,1.0,2.0,1.0,2.0,0.0,1.0,1,men,individual
318,27,Lee Sihyeong,KOR,7,65.69,30.75,35.94,-1.0,3.5,CCoSp4,7.0,owg2022,0.0,,3.95,0.45,,1,,2122,2.0,2.0,1.0,0.0,2.0,1.0,1.0,1.0,1.0,1,men,individual
319,27,Lee Sihyeong,KOR,7,65.69,30.75,35.94,-1.0,,,,owg2022,,1.0,7.29,,,0,Composition,2122,7.5,7.0,6.5,7.25,7.25,7.25,7.75,7.25,7.5,1,men,individual
320,27,Lee Sihyeong,KOR,7,65.69,30.75,35.94,-1.0,,,,owg2022,,1.0,7.14,,,0,Interpretation of the Music,2122,7.25,7.0,6.5,7.25,7.25,7.0,7.5,7.0,7.25,1,men,individual
321,27,Lee Sihyeong,KOR,7,65.69,30.75,35.94,-1.0,,,,owg2022,,1.0,7.11,,,0,Performance,2122,7.25,6.75,6.25,7.5,7.0,7.0,7.25,7.0,7.5,1,men,individual


In [298]:
data_df["name"].value_counts().reset_index()["count"].unique()

array([12])

### All Scoresheet Data

In [299]:
from pathlib import Path

data_dfs_dict = {} # path to df dict
data_df = pd.DataFrame()

file_names = [item.name for item in Path(dir_name).iterdir() if item.is_file()]
valid_score_files = []
skip = False
output_file_path = ""
for f in file_names:
    if ".csv" in f:
        skip = True
        output_file_path = dir_name + "/" +f
        break
if skip:
    print(f"Data Directory: {dir_name} already processed")
    data_df = pd.read_csv(output_file_path)
else:
    for f in file_names:
        print(f"PROCESSING {f}...")
        if "DANCE" not in f:
            print(f"VALID SCORE SHEET...")
            valid_score_files.append(f)
            
            full_path = dir_name + "/" + f
            df = get_fsk_df(full_path)
            data_dfs_dict[f] = df
        print(f"DONE")
        print("-------------------")

PROCESSING FSKXTEAM--------------FNL-0002LD--_JudgesDetailsperSkater.pdf...
VALID SCORE SHEET...
DONE
-------------------
PROCESSING FSKXTEAM--------------QUAL0002LD--_JudgesDetailsperSkater.pdf...
VALID SCORE SHEET...
DONE
-------------------
PROCESSING FSKXTEAM--------------QUAL0001MN--_JudgesDetailsperSkater.pdf...
VALID SCORE SHEET...
DONE
-------------------
PROCESSING FSKXTEAM--------------FNL-0001MN--_JudgesDetailsperSkater.pdf...
VALID SCORE SHEET...
DONE
-------------------
PROCESSING FSKMSINGLES-----------FNL-000100--_JudgesDetailsperSkater.pdf...
VALID SCORE SHEET...
DONE
-------------------
PROCESSING FSKWSINGLES-----------FNL-000100--_JudgesDetailsperSkater.pdf...
VALID SCORE SHEET...
DONE
-------------------
PROCESSING FSKMSINGLES-----------QUAL000100--_JudgesDetailsperSkater.pdf...
VALID SCORE SHEET...
DONE
-------------------
PROCESSING FSKWSINGLES-----------QUAL000100--_JudgesDetailsperSkater.pdf...
VALID SCORE SHEET...
DONE
-------------------
PROCESSING FSKXICEDANCE-

In [300]:
len(data_dfs_dict)

14

In [301]:
if not skip:
    print(valid_score_files[0])
    data_dfs_dict[valid_score_files[0]].head()


FSKXTEAM--------------FNL-0002LD--_JudgesDetailsperSkater.pdf


In [302]:
if not skip:
    data_dfs_dict[valid_score_files[0]]["name"].value_counts()

In [303]:
if not skip:
    for f in valid_score_files:
        data_df = pd.concat([data_df, data_dfs_dict[f]])
    data_df.reset_index(drop=True, inplace=True)

    # output csv
    data_df.to_csv(OUTPUT_CSV, index=False)

    print(f"Saved {len(data_df)} rows to {OUTPUT_CSV}")

Saved 2756 rows to ./owg2022/owg2022.csv


In [304]:
data_df.shape

(2756, 32)

In [305]:
data_df

Unnamed: 0,rank,name,noc,starting_number,tss,tes,tpcs,deductions,base_value,element,element_no,event,extra_points,factor,final_score,goe,info,is_element,program_component,year,J1,J2,J3,J4,J5,J6,J7,J8,J9,is_short_program,category,event_type
0,1,Valieva Kamila,ROC,5,178.9200,105.2500,74.6700,-1.0000,9.7000,4S,1.0000,owg2022,0.0000,,13.7200,4.0200,,1,,2122,4.0000,4.0000,5.0000,4.0000,4.0000,3.0000,4.0000,4.0000,5.0000,0,women,team
1,1,Valieva Kamila,ROC,5,178.9200,105.2500,74.6700,-1.0000,8.0000,3A,2.0000,owg2022,0.0000,,11.3100,3.3100,,1,,2122,4.0000,4.0000,4.0000,3.0000,5.0000,3.0000,5.0000,4.0000,5.0000,0,women,team
2,1,Valieva Kamila,ROC,5,178.9200,105.2500,74.6700,-1.0000,13.7000,4T+3T,3.0000,owg2022,0.0000,,16.8200,3.1200,,1,,2122,3.0000,4.0000,3.0000,3.0000,4.0000,3.0000,3.0000,3.0000,4.0000,0,women,team
3,1,Valieva Kamila,ROC,5,178.9200,105.2500,74.6700,-1.0000,4.9000,3Lo,4.0000,owg2022,0.0000,,6.5100,1.6100,,1,,2122,3.0000,3.0000,4.0000,3.0000,5.0000,2.0000,3.0000,3.0000,4.0000,0,women,team
4,1,Valieva Kamila,ROC,5,178.9200,105.2500,74.6700,-1.0000,3.2000,FCSp4,5.0000,owg2022,0.0000,,4.7100,1.5100,,1,,2122,5.0000,5.0000,5.0000,5.0000,5.0000,4.0000,4.0000,5.0000,4.0000,0,women,team
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2751,5,Knierim Alexa / Frazier Brandon,USA,3,128.9700,62.7400,66.2300,0.0000,,,,owg2022,,1.6000,8.3900,,,0,Composition,2122,8.5000,8.7500,8.0000,8.5000,8.0000,8.2500,8.7500,8.2500,8.5000,0,pairs,team
2752,5,Knierim Alexa / Frazier Brandon,USA,3,128.9700,62.7400,66.2300,0.0000,,,,owg2022,,1.6000,8.3200,,,0,Interpretation of the Music,2122,8.2500,8.5000,6.7500,8.5000,8.0000,8.5000,8.5000,8.0000,8.5000,0,pairs,team
2753,5,Knierim Alexa / Frazier Brandon,USA,3,128.9700,62.7400,66.2300,0.0000,,,,owg2022,,1.6000,8.1100,,,0,Performance,2122,8.2500,8.2500,7.0000,8.2500,7.7500,8.0000,8.5000,8.0000,8.2500,0,pairs,team
2754,5,Knierim Alexa / Frazier Brandon,USA,3,128.9700,62.7400,66.2300,0.0000,,,,owg2022,,1.6000,8.3600,,,0,Skating Skills,2122,8.5000,8.5000,7.7500,8.5000,8.0000,8.5000,8.2500,8.2500,8.5000,0,pairs,team


In [306]:
data_df[lambda x: x.name == "Chen Nathan"]

Unnamed: 0,rank,name,noc,starting_number,tss,tes,tpcs,deductions,base_value,element,element_no,event,extra_points,factor,final_score,goe,info,is_element,program_component,year,J1,J2,J3,J4,J5,J6,J7,J8,J9,is_short_program,category,event_type
205,1,Chen Nathan,USA,8,111.71,63.85,47.86,0.0,11.0,4F,1.0,owg2022,0.0,,15.24,4.24,,1,,2122,3.0,4.0,4.0,4.0,4.0,3.0,4.0,4.0,4.0,1,men,team
206,1,Chen Nathan,USA,8,111.71,63.85,47.86,0.0,8.0,3A,2.0,owg2022,0.0,,9.83,1.83,,1,,2122,2.0,1.0,4.0,2.0,2.0,3.0,2.0,2.0,3.0,1,men,team
207,1,Chen Nathan,USA,8,111.71,63.85,47.86,0.0,3.2,CCSp4,3.0,owg2022,0.0,,4.07,0.87,,1,,2122,2.0,2.0,3.0,3.0,3.0,3.0,2.0,4.0,3.0,1,men,team
208,1,Chen Nathan,USA,8,111.71,63.85,47.86,0.0,17.27,4Lz+3T,4.0,owg2022,1.0,,20.72,3.45,,1,,2122,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,1,men,team
209,1,Chen Nathan,USA,8,111.71,63.85,47.86,0.0,3.9,StSq4,5.0,owg2022,0.0,,5.57,1.67,,1,,2122,4.0,4.0,4.0,5.0,4.0,4.0,5.0,5.0,4.0,1,men,team
210,1,Chen Nathan,USA,8,111.71,63.85,47.86,0.0,3.0,FSSp4,6.0,owg2022,0.0,,3.77,0.77,,1,,2122,2.0,3.0,3.0,2.0,2.0,2.0,3.0,3.0,3.0,1,men,team
211,1,Chen Nathan,USA,8,111.71,63.85,47.86,0.0,3.5,CCoSp4,7.0,owg2022,0.0,,4.65,1.15,,1,,2122,3.0,3.0,4.0,3.0,3.0,3.0,3.0,4.0,4.0,1,men,team
212,1,Chen Nathan,USA,8,111.71,63.85,47.86,0.0,,,,owg2022,,1.0,9.61,,,0,Composition,2122,9.75,9.5,9.75,9.75,9.75,9.0,9.5,9.5,9.5,1,men,team
213,1,Chen Nathan,USA,8,111.71,63.85,47.86,0.0,,,,owg2022,,1.0,9.68,,,0,Interpretation of the Music,2122,9.5,10.0,9.75,9.75,9.75,9.25,9.75,9.75,9.5,1,men,team
214,1,Chen Nathan,USA,8,111.71,63.85,47.86,0.0,,,,owg2022,,1.0,9.64,,,0,Performance,2122,9.5,9.75,9.75,9.75,9.5,9.5,9.5,9.75,9.75,1,men,team
