In [9]:
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
import requests
import io
import tabula
import pandas as pd
import numpy as np
from pandas import DataFrame

In [10]:
BASE_URL = "https://dijlovasok.hu/index.php/programgyujt"

In [11]:
def get_contents_of_page (url, headers = None) :
    req = requests.get(url, headers = headers if headers != None else {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"
    })
    
    return req.content

In [12]:
def soupify (content) :
    return BeautifulSoup(content, "html.parser")

In [13]:
def get_links_from_main_page ():
    soup = soupify(get_contents_of_page(BASE_URL))
    # print(soup.prettify())
    program_hrefs = list(
        set(
            map(
                lambda href: href["href"], 
                filter(lambda href: href["href"].startswith("/index.php/programgyujt/"), 
                    soup.find_all("a", href = True)
                )
            )
        )
    )
    return program_hrefs    

In [14]:
def get_pdf_links_from_page (url):
    soup: BeautifulSoup = soupify(get_contents_of_page(url))
    pdf_hrefs = list(
            set(
                map(
                    lambda href: href["href"], 
                    filter(lambda href: href["href"].endswith(".pdf"), 
                        soup.find_all("a", href = True)
                    )
                )
            )
        )
    return pdf_hrefs

In [15]:
def get_pdf_dataframes () :

    pdf_texts = {}

    for page_url in get_links_from_main_page() :
        
        print(f"Scraping page {page_url}...")
        
        page_content = {}
        
        for pdf_url in get_pdf_links_from_page(BASE_URL + page_url) :
            
            print(f"Downloading pdf {pdf_url}...")
                        
            user_agent= 'Mozilla/5.0 (X11; Windows; Windows x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36'

            dfs = tabula.read_pdf(
                pdf_url, 
                multiple_tables=True, 
                pages="all", 
                guess=True,
                encoding="cp1252",
                user_agent=user_agent,
                # output_format="dataframe",
                pandas_options={
                    # "names": ["ID", "Position", "Description", "Notes"]
                    "header": 0
                },
                lattice=True,
                silent=True
            )   
                
            if len(dfs) == 0 :
                continue
                
            page_content[pdf_url] = dfs
            
        pdf_texts[page_url] = page_content
        
    return pdf_texts

In [16]:
database = get_pdf_dataframes()

Scraping page /index.php/programgyujt/1008-fiatal-lovak-programjai...
Downloading pdf http://www.dijlovasok.hu/dokumentum/Hatarozatok/2018/11_2018.pdf...
Downloading pdf http://www.dijlovasok.hu/dokumentum/Programgyujtemeny_2018/FD/FEI_7_FD_MIN.pdf...
Downloading pdf http://www.dijlovasok.hu/dokumentum/Programgyujtemeny_2018/FD/FEI_6_FD_DONTO.pdf...
Downloading pdf http://www.dijlovasok.hu/dokumentum/Programgyujtemeny_2018/FD/FEI_4_FD.pdf...
Downloading pdf http://www.dijlovasok.hu/dokumentum/Programgyujtemeny_2018/FD/FEI_5_FD_DONTO.pdf...
Downloading pdf http://www.dijlovasok.hu/dokumentum/Programgyujtemeny_2018/FD/FEI_7_FD_DONTO.pdf...
Downloading pdf http://www.dijlovasok.hu/dokumentum/Hatarozatok/2018/10_2018.pdf...
Downloading pdf http://www.dijlovasok.hu/dokumentum/Programgyujtemeny_2018/FD/FEI_6_FD_MIN.pdf...
Downloading pdf http://www.dijlovasok.hu/dokumentum/Programgyujtemeny_2018/FD/5_FD_A4.pdf...
Downloading pdf http://www.dijlovasok.hu/dokumentum/Hatarozatok/2018/9_2018.pdf

In [17]:
def inspect_database(database) :
    for page_url in database:
        page = database[page_url]
        for program_url in page:
            program = page[program_url]
            for df in program:
                yield df

In [18]:
next(inspect_database(database))

Unnamed: 0.1,Unnamed: 0,Bajnokság megnevezése,Unnamed: 1,Unnamed: 2,Minõsítõ versenyszámok,Unnamed: 3
0,4 éves lovak bajnoksága,FEI 4 éves fiatal díjló program,,,,
1,5 éves lovak bajnoksága,5 éves fiatal díjló program A3\r5 éves fiatal ...,,,,
2,6 éves lovak bajnoksága,FEI 5 éves fiatal díjló minõsítõ program\rFEI ...,,,,


In [102]:
list(list(database.values())[0].values())[3][1]

Unnamed: 0.1,Unnamed: 0,Feladat
0,AXC\rC\rCMBFA\rAXC\rHK\rKA\rA és F között\rFXH...,Belovaglás munkaügetésben a középvonalon állj ...


In [103]:
from pandas import DataFrame
from typing import List

def convert_program_dfs_to_useful_df (dfs: List[DataFrame]) :
            
    dfs = list(filter(lambda df: len(df) != 0, dfs))
    
    if len(dfs) == 0 :
        return None
    
    if len(dfs) == 1 :
        if len(dfs[0]) != 0 :
            return dfs[0]
        else :
            return None
    
    # joining
    
    result = dfs[0]
    
    column_num = len(result.columns)
    
    for df in dfs[1:] :
        
        if len(df.columns) == column_num :
            result = pd.concat([result, df], ignore_index = True)
            continue
        
        if len(df.columns) == column_num + 2 :
            
            columns = df.columns
            
            df = df.drop(columns[0], axis=1)
            df = df.drop(columns[-1], axis=1)
            
            # rename unnamed columns
            
            def column_renamer(column):
                if str(column).startswith("Unnamed") == False :
                    return column
                num = int(column.split(" ")[1]) - 1
                return "Unnamed: " + str(num)
            
            df = df.rename(column_renamer, axis=1)
            
            result = pd.concat([result, df], ignore_index = True)
            continue
        
        # raise Exception(f"WARNING: df column width mismatch: cannot append {len(df.columns)} to {column_num}!")
        
    return result

In [104]:

def convert_database_to_useful_dfs (database) :
    for page in database.values() :
        for program_url in page :
            program = page[program_url]
            try :
                yield (program_url, convert_program_dfs_to_useful_df(program))
            except Exception as e :
                print(f"URL in question: {program_url}")
                raise e

In [105]:
from typing import Tuple

def is_df_program(program: Tuple[str, DataFrame]) :
    
    url, df = program
    
    if "Feladat" not in df.columns :
        return False
    
    if len(df) <= 5 :
        return False
    
    if "Négyszöget" not in df["Feladat"][pd.Series.last_valid_index(df["Feladat"])] :
        return False
    
    try :
        if int(df["Unnamed: 0"][0]) != 1 :
            return False
    except ValueError :
        return False
    
    return True

In [112]:
filtered_programs = list(filter(is_df_program, convert_database_to_useful_dfs(database)))

In [115]:
def rename_columns (program: Tuple[str, DataFrame]) :
    url, df = program
    df = df.rename({
        "Unnamed: 0": "Id",
        "Unnamed: 1": "Letter"
    }, axis=1, inplace=False)
    return (url, df)

In [158]:
programs = list(map(rename_columns, filtered_programs))

In [159]:
def keep_only_useful_columns (program: Tuple[str, DataFrame]) :
    url, df = program
    columns = df.columns
    columns = filter(lambda c: c not in ["Id", "Letter", "Feladat"], columns)
    for column in columns :
        df = df.drop(column, axis = 1)
    return (url, df)

In [160]:
programs = list(map(keep_only_useful_columns, programs))

In [161]:
def keep_only_useful_rows (program: Tuple[str, DataFrame]) :
    url, df = program
    return (url, df.dropna(axis=0, how="any", subset=["Id"]))

In [162]:
programs = list(map(keep_only_useful_rows, programs))

In [163]:
len(programs)

42

In [164]:
def tokenize_exercise (letters: str, exercise: str) :
    
    if str(exercise) != "nan" :
        rows = str(exercise).split("\r")
        words = [str(row).split() for row in rows]
    else :
        words = []
    
    if str(letters) != "nan" :
        letter_rows = str(letters).split('\r')
        letter_words = [str(letter_row).split() for letter_row in letter_rows]
    else :
        letter_words = []
    
    return (letter_words, words)

In [165]:
def tokenize_program(program: Tuple[str, DataFrame]) :
    url, df = program
    tokens = []
    for index, row in df.iterrows() :
        tokens.append(tokenize_exercise(row["Letter"], row["Feladat"]))
    return (url, tokens)

In [166]:
programs = [tokenize_program(program) for program in programs]

In [167]:
programs[3]

('https://www.dijlovasok.hu/dokumentum/Programgyujtemeny_2018/M/LSZ2.pdf',
 [([['A'], ['X'], [], ['C']],
   [['Belovaglás', 'összeszedett', 'ügetésben'],
    ['Állj,', 'köszönés,', 'elindulás', 'összeszedett'],
    ['ügetésbe'],
    ['Jobb', 'kézre']]),
  ([['B']], [['Kiskör,', '10m']]),
  ([['B-F']], [['Vállat', 'be']]),
  ([['A'], ['D', 'és', 'X-M', 'között']],
   [['Középbõl'], ['Jobbra', 'oldaljárás']]),
  ([['M-C-H-E'], ['E']], [['Összeszedett', 'ügetés'], ['Kiskör,', '10m']]),
  ([['E-K']], [['Vállat', 'be']]),
  ([['A'], ['D', 'ÉS', 'X-H', 'között']],
   [['Középbõl'], ['Balra', 'oldaljárás']]),
  ([['(H-M)'], ['M-K'], ['K']],
   [['(Összeszedett', 'ügetés)'],
    ['Átlóváltás,', 'középügetés'],
    ['Összeszedett', 'ügetés']]),
  ([],
   [['Átmenetek', 'összeszedett', 'ügetésbõl'],
    ['középügetésbe', 'és', 'középügetésbõl'],
    ['összeszedett', 'ügetésbe']]),
  ([['K-A-B'], ['B']],
   [['Összeszedett', 'ügetés'],
    ['Röviden', 'hátra', 'arc,', 'abból', 'középlépés']]),
  