In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
def drugi_etap_wyniki(url):
    """Download a stage-two results page and return place and full name per row.

    Parameters
    ----------
    url : str
        Address of a page whose first ``<table>`` has the columns ``imię``
        and ``nazwisko`` plus a place column titled ``lp.``, ``lp`` or ``Lp.``.

    Returns
    -------
    pandas.DataFrame
        An independent frame with two columns: ``lp.`` (copied from the place
        column) and ``name`` (``imię``, one space, ``nazwisko``).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page contains no ``<table>`` element.
    KeyError
        If the expected columns are missing from the table.
    """
    # Local import keeps this cell self-contained; pandas is moving away from
    # accepting literal HTML strings in read_html, a file-like object is safe.
    from io import StringIO

    r = requests.get(url, timeout=30)
    # Fail on 4xx/5xx here instead of trying to parse an error page.
    r.raise_for_status()
    # No parser is named, so BeautifulSoup picks the best one installed.
    soup = BeautifulSoup(r.text)
    tables = soup.find_all("table")
    if not tables:
        raise ValueError("no <table> found at {}".format(url))
    df = pd.read_html(StringIO(str(tables[0])), header=0)[0]
    # The place column is spelled differently between pages; copy whichever
    # alias exists into "lp." (a later alias overrides an earlier one).
    for alias in ("lp", "Lp."):
        if alias in df.columns:
            df["lp."] = df[alias]
    df["name"] = df["imię"] + " " + df["nazwisko"]
    # Return a copy so callers can add or overwrite columns without pandas
    # warning about writing to a slice.
    return df[["lp.", "name"]].copy()

In [3]:
def final_wyniki(url):
    """Download a finals results page and return place and name per row.

    Every ``<table class="results_table">`` on the page is read and the
    tables are stacked in page order. In the last table the place of the
    first row is copied to every row.
    NOTE(review): presumably the last table lists contestants who share one
    place that is printed only once - confirm against the page layout.

    Parameters
    ----------
    url : str
        Address of a page whose result tables have the columns ``lp.`` and
        ``imię i nazwisko``.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``lp.`` and ``name``, with a fresh 0..n-1 index.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page contains no ``results_table`` table.
    KeyError
        If the expected columns are missing from a table.
    """
    # Local import keeps this cell self-contained; pandas is moving away from
    # accepting literal HTML strings in read_html, a file-like object is safe.
    from io import StringIO

    r = requests.get(url, timeout=30)
    # Fail on 4xx/5xx here instead of trying to parse an error page.
    r.raise_for_status()
    # No parser is named, so BeautifulSoup picks the best one installed.
    soup = BeautifulSoup(r.text)
    tables = soup.find_all("table", {"class": "results_table"})
    if not tables:
        raise ValueError("no results_table found at {}".format(url))
    dfs = [pd.read_html(StringIO(str(table)), header=0)[0] for table in tables]

    # Give every row of the last table the place found in its first row
    # (positional lookup, so it does not depend on the index labels).
    last = dfs[-1]
    last["lp."] = last["lp."].iloc[0]

    final_df = pd.concat(
        [df[["lp.", "imię i nazwisko"]] for df in dfs],
        ignore_index=True,
    )
    return final_df.rename(columns={"imię i nazwisko": "name"})[["lp.", "name"]]

In [4]:
def longestCommonPrefix(strs):
    """Return the longest string that every element of ``strs`` starts with.

    An empty ``strs`` yields ``""``; so does any input in which the
    strings already differ at their first character.
    """
    if not strs:
        return ""
    reference = min(strs, key=len)
    matched = 0
    # Walk the strings column by column; zip stops at the shortest one.
    for column in zip(*strs):
        if len(set(column)) != 1:
            break
        matched += 1
    return reference[:matched]

In [5]:
def longestCommonSuffix(strs):
    """Return the longest string that every element of ``strs`` ends with.

    Implemented by reversing each string, taking the common prefix of the
    reversed strings, and reversing that prefix back.
    """
    mirrored = []
    for text in strs:
        mirrored.append(text[::-1])
    mirrored_prefix = longestCommonPrefix(mirrored)
    return mirrored_prefix[::-1]

In [6]:
def podobnosc(strs):
    """Score how alike the strings are: common-prefix length plus common-suffix length.

    The two lengths are simply added, so characters covered by both the
    prefix and the suffix are counted twice.
    """
    shared_prefix = longestCommonPrefix(strs)
    shared_suffix = longestCommonSuffix(strs)
    return len(shared_prefix) + len(shared_suffix)

In [7]:
def repair_names(df2, df3):
    """Rewrite names in ``df2`` in place so that they agree with ``df3``.

    Names present in only one of the frames are paired one-to-one by
    ``podobnosc`` (shared prefix length + shared suffix length), and each
    unmatched ``df2`` name is replaced by its ``df3`` partner.

    Pairs are accepted greedily from the highest score down, and a ``df3``
    name is used at most once. Picking the best candidate independently for
    every name could hand the same candidate to two names; whenever that
    independent choice is already one-to-one, this gives the same result.

    Parameters
    ----------
    df2, df3 : pandas.DataFrame
        Frames with a ``name`` column. ``df2`` is modified in place.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If the number of unmatched names differs between the frames, or if
        after the repair the name sets or the row counts still differ.
    """
    only_in_df2 = set(df2["name"]) - set(df3["name"])
    only_in_df3 = set(df3["name"]) - set(df2["name"])
    assert len(only_in_df2) == len(only_in_df3), (
        "unmatched names differ in count: {} vs {}".format(
            len(only_in_df2), len(only_in_df3)
        )
    )

    # Sort on the whole (score, candidate, original) tuple so the outcome
    # does not depend on set iteration order; equal scores fall to the
    # greater candidate name, as max() over (score, candidate) would pick.
    scored_pairs = sorted(
        (
            (podobnosc([original, candidate]), candidate, original)
            for original in only_in_df2
            for candidate in only_in_df3
        ),
        reverse=True,
    )

    replacement = {}
    used_candidates = set()
    for _score, candidate, original in scored_pairs:
        if original in replacement or candidate in used_candidates:
            continue
        replacement[original] = candidate
        used_candidates.add(candidate)

    df2["name"] = [replacement.get(name, name) for name in df2["name"]]
    assert set(df2["name"]) == set(df3["name"]), "name sets still differ after repair"
    assert len(df2) == len(df3), "frames have different numbers of rows"

In [8]:
def get_oi_table(i):
    """Build one table with each finalist's stage-two and stage-three place.

    Parameters
    ----------
    i : int
        Edition number, inserted into the results URLs on oi.edu.pl.

    Returns
    -------
    pandas.DataFrame
        Columns ``2etap``, ``name`` and ``3etap``: the inner join, on
        ``name``, of the stage-two and the finals results.

    Raises
    ------
    RuntimeError
        If either results page cannot be downloaded or parsed. The original
        exception is attached as the cause.
    AssertionError
        If the names in the two result sets cannot be reconciled.
    """
    # A failed download used to be printed and then ignored, which only
    # deferred the failure to an unbound-variable error a few lines later.
    # Stop here instead, keeping the original error as the cause.
    try:
        df2 = drugi_etap_wyniki("https://oi.edu.pl/l/{}oi_2etap_wyniki/".format(i))
    except Exception as e:
        raise RuntimeError("error drugi etap {}".format(i)) from e
    try:
        df3 = final_wyniki("https://oi.edu.pl/l/{}oi_3etap_wyniki/".format(i))
    except Exception as e:
        raise RuntimeError("error final {}".format(i)) from e

    # Align the spelling of names between the two pages before joining.
    repair_names(df2, df3)
    stage2 = df2.rename(columns={"lp.": "2etap"})[["2etap", "name"]]
    stage3 = df3.rename(columns={"lp.": "3etap"})[["3etap", "name"]]
    return stage2.merge(stage3, on="name")

In [9]:
# For editions 18..26, report how many of the contestants listed first in the
# stage-two results would not have had a laureate place in the finals.
for oi in range(18, 27):
    print("{} OI".format(oi))
    df = get_oi_table(oi)
    finalists = len(df)
    # Laureate count: one less than the largest finals place, capped at half
    # of the finalists.
    # NOTE(review): presumably the largest place is the one shared by the
    # non-laureates - confirm against final_wyniki.
    laureates = min(finalists // 2, max(df["3etap"]) - 1)
    # Disabled diagnostic: correlation between stage-two and finals places.
    #print(df[["2etap", "3etap"]].corr())
    # First `laureates` rows of the merged table.
    # NOTE(review): assumes rows are still in stage-two ranking order - verify.
    wannabe_laureates = df[:laureates]
    x = len(wannabe_laureates)
    # Of those rows, count the ones whose finals place is past the laureate cut-off.
    y = len(wannabe_laureates.loc[wannabe_laureates["3etap"] > laureates])
    # Message (Polish): "Among {x} would-be laureates, {y} would not have
    # become laureates. That is {pct}%."
    print("Wśród {} wannabe laureatów {} by nie dostało laureata. Jest to {:.2f}%".format(x, y, 100.0 * y / x))
    # Blank separator line; flush so output appears as each edition finishes.
    print("", flush=True)

18 OI
Wśród 35 wannabe laureatów 12 by nie dostało laureata. Jest to 34.29%

19 OI
Wśród 46 wannabe laureatów 17 by nie dostało laureata. Jest to 36.96%

20 OI
Wśród 48 wannabe laureatów 16 by nie dostało laureata. Jest to 33.33%

21 OI
Wśród 49 wannabe laureatów 16 by nie dostało laureata. Jest to 32.65%

22 OI
Wśród 49 wannabe laureatów 17 by nie dostało laureata. Jest to 34.69%

23 OI
Wśród 48 wannabe laureatów 14 by nie dostało laureata. Jest to 29.17%

24 OI
Wśród 47 wannabe laureatów 13 by nie dostało laureata. Jest to 27.66%

25 OI
Wśród 48 wannabe laureatów 15 by nie dostało laureata. Jest to 31.25%

26 OI
Wśród 47 wannabe laureatów 17 by nie dostało laureata. Jest to 36.17%

