In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.subplots as make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
import dash
from dash import dcc
from dash import html
from dash.dependencies import Input, Output, State
from dash import Dash, dash_table

In [2]:
# Passing a list to `sheet_name` makes read_excel return a dict of DataFrames
# keyed by sheet name, so each workbook is opened once.
data = pd.read_excel(
    "./data/InternationalPenalties.xlsx",
    sheet_name=["WorldCup", "Euros"],
)
# Second workbook, loaded the same way (one DataFrame per listed sheet).
output = pd.read_excel(
    "./output/Week28_output.xlsx",
    sheet_name=["Win %", "Penalty Position", "Score %"],
)

In [3]:
# Work on copies so the frames held in `data` are not modified by the cleaning cells.
world_cup, euros = (data[sheet].copy() for sheet in ("WorldCup", "Euros"))

### Data Preprocessing

In [4]:
def find_penalty_type(string):
    """Classify a raw penalty description as ``"scored"`` or ``"missed"``.

    Parameters
    ----------
    string : str or missing value
        Raw taker text, e.g. ``"Kaltz Penalty scored"``.

    Returns
    -------
    str or float
        ``"scored"`` when the text contains the substring ``"scored"``;
        ``numpy.nan`` when the text is exactly ``"Unknown"`` or is not a
        string at all (``None`` / ``NaN``); ``"missed"`` for any other string.
    """
    # Unfilled cells arrive as None/NaN; report them as unknown instead of
    # letting the substring test raise TypeError.
    if not isinstance(string, str) or string == "Unknown":
        return np.nan
    # A plain substring test is equivalent to searching for the literal
    # pattern "scored" and needs no regex compilation on each call.
    # NOTE(review): any other text, including placeholders such as
    # "No Kicker", is labelled "missed" — confirm that is intended.
    return "scored" if "scored" in string else "missed"

In [5]:
# world cup data
# Missing taker cells become the placeholder string "No Kicker" before classification.
world_cup["Winning team Taker"] = world_cup["Winning team Taker"].fillna("No Kicker")
world_cup["Losing team Taker"] = world_cup["Losing team Taker"].fillna("No Kicker")

# Classify the winning side and store the labels next to the raw text.
winner_penalty = world_cup["Winning team Taker"].map(find_penalty_type)
world_cup["Winner Penalty type"] = winner_penalty

# Same for the losing side.
loser_penalty = world_cup["Losing team Taker"].map(find_penalty_type)
world_cup["Loser Penalty type"] = loser_penalty

In [6]:
# Tally the winning side's labels; value_counts() leaves NaN entries out by default.
winner_penalty.value_counts()

scored    120
missed     21
Name: Winning team Taker, dtype: int64

In [7]:
# Tally the losing side's labels; value_counts() leaves NaN entries out by default.
loser_penalty.value_counts()

scored    76
missed    62
Name: Losing team Taker, dtype: int64

In [8]:
# euro data
# Missing taker cells become the placeholder string "No Kicker" before classification.
euros["Winning team Taker"] = euros["Winning team Taker"].fillna("No Kicker")
euros["Losing team Taker"] = euros["Losing team Taker"].fillna("No Kicker")

# Classify the winning side and store the labels next to the raw text.
euros_winner_penalty = euros["Winning team Taker"].map(find_penalty_type)
euros["Winner Penalty type"] = euros_winner_penalty

# Same for the losing side.
euros_loser_penalty = euros["Losing team Taker"].map(find_penalty_type)
euros["Loser Penalty type"] = euros_loser_penalty

In [9]:
# Tally the Euros winning side's labels; NaN entries are left out by default.
euros_winner_penalty.value_counts()

scored    105
missed     14
Name: Winning team Taker, dtype: int64

In [10]:
# Tally the Euros losing side's labels; NaN entries are left out by default.
euros_loser_penalty.value_counts()

scored    73
missed    40
Name: Losing team Taker, dtype: int64

#### Clean any fields, correctly format the date the penalty was taken, and group the two German teams (i.e., treat West Germany as Germany)

In [11]:
# Inspect column dtypes and non-null counts before cleaning.
world_cup.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146 entries, 0 to 145
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   No.                  146 non-null    int64         
 1   Penalty Number       146 non-null    int64         
 2   Event Year           146 non-null    object        
 3   Winner               146 non-null    object        
 4   Full Time Score      146 non-null    object        
 5   Loser                146 non-null    object        
 6   Winning Team GK      146 non-null    object        
 7   Winning team Taker   146 non-null    object        
 8   Losing team Taker    146 non-null    object        
 9   Losing Team GK       146 non-null    object        
 10  Round                146 non-null    object        
 11  Date                 146 non-null    datetime64[ns]
 12  Winner Penalty type  141 non-null    object        
 13  Loser Penalty type   138 non-null  

In [12]:
# world cup data
# Strip commas from the raw year text, parse it as a date and keep only the year.
# astype(str) keeps this cell re-runnable: once "Event Year" holds integers,
# the .str accessor would otherwise raise AttributeError on a second run.
world_cup["Event Year"] = pd.to_datetime(
    world_cup["Event Year"].astype(str).str.replace(",", "", regex=False)
).map(lambda ts: ts.year)

# Remove stray leading/trailing whitespace from the team names.
world_cup["Winner"] = world_cup["Winner"].str.strip()
world_cup["Loser"] = world_cup["Loser"].str.strip()

In [13]:
# Fold "West Germany" into "Germany" in both team columns so the two names are
# grouped wherever they appear, not only when the team is the winner.
world_cup[["Winner", "Loser"]] = world_cup[["Winner", "Loser"]].replace("West Germany", "Germany")
# Sanity check: shape of the rows where Germany is the winner.
world_cup[world_cup["Winner"] == "Germany"].shape

(19, 14)

In [14]:
# euro data
# Strip commas from the raw year text, parse it as a date and keep only the year.
# astype(str) keeps this cell re-runnable: once "Event Year" holds integers,
# the .str accessor would otherwise raise AttributeError on a second run.
euros["Event Year"] = pd.to_datetime(
    euros["Event Year"].astype(str).str.replace(",", "", regex=False)
).map(lambda ts: ts.year)

# Remove stray leading/trailing whitespace from the team names.
euros["Winner"] = euros["Winner"].str.strip()
euros["Loser"] = euros["Loser"].str.strip()

In [15]:
# Fold "West Germany" into "Germany" in both team columns; previously only
# "Loser" was normalised although the check below filters on "Winner".
euros[["Winner", "Loser"]] = euros[["Winner", "Loser"]].replace("West Germany", "Germany")
# Sanity check: shape of the rows where Germany is the winner.
euros[euros["Winner"] == "Germany"].shape

(15, 14)

In [16]:
# Preview the first rows after the year and team-name clean-up.
world_cup.head()

Unnamed: 0,No.,Penalty Number,Event Year,Winner,Full Time Score,Loser,Winning Team GK,Winning team Taker,Losing team Taker,Losing Team GK,Round,Date,Winner Penalty type,Loser Penalty type
0,1,1,1982,Germany,3–3,France,Schumacher,Kaltz Penalty scored,Penalty scored Giresse,Ettori,Semi-finals,2021-07-08,scored,scored
1,1,2,1982,Germany,3–3,France,Schumacher,Breitner Penalty scored,Penalty scored Amoros,Ettori,Semi-finals,2021-07-08,scored,scored
2,1,3,1982,Germany,3–3,France,Schumacher,Stielike Penalty missed,Penalty scored Rocheteau,Ettori,Semi-finals,2021-07-08,missed,scored
3,1,4,1982,Germany,3–3,France,Schumacher,Littbarski Penalty scored,Penalty missed Six,Ettori,Semi-finals,2021-07-08,scored,missed
4,1,5,1982,Germany,3–3,France,Schumacher,Rummenigge Penalty scored,Penalty scored Platini,Ettori,Semi-finals,2021-07-08,scored,scored


In [17]:
# Preview the first rows of the Euros frame after the same clean-up.
euros.head()

Unnamed: 0,No.,Penalty Number,Event Year,Winner,Full Time Score,Loser,Winning team GK,Winning team Taker,Losing team Taker,Losing team GK,Round,Date,Winner Penalty type,Loser Penalty type
0,1,1,1976,Czechoslovakia,2–2,Germany,Viktor,Masný Penalty scored,Penalty scored Bonhof,Maier,Final,2021-06-20,scored,scored
1,1,2,1976,Czechoslovakia,2–2,Germany,Viktor,Nehoda Penalty scored,Penalty scored Flohe,Maier,Final,2021-06-20,scored,scored
2,1,3,1976,Czechoslovakia,2–2,Germany,Viktor,Ondruš Penalty scored,Penalty scored Bongartz,Maier,Final,2021-06-20,scored,scored
3,1,4,1976,Czechoslovakia,2–2,Germany,Viktor,Jurkemik Penalty scored,Penalty missed Hoeneß,Maier,Final,2021-06-20,scored,missed
4,1,5,1976,Czechoslovakia,2–2,Germany,Viktor,Panenka Penalty scored,Unknown,Maier,Final,2021-06-20,scored,


In [18]:
# The winning side's text reads "<name> Penalty <outcome>", so the kicker is
# whatever precedes "Penalty". `.str[0]` picks that first piece directly;
# `.apply(pd.Series)[0]` produced the same value but built a Series per row.
world_cup["Winning team Kicker"] = (
    world_cup["Winning team Taker"].str.split("Penalty").str[0].str.strip()
)

In [19]:
# The losing side's text reads "Penalty <outcome> <name>"; removing the three
# fixed words and trimming whitespace leaves the kicker's name.
world_cup["Losing team Kicker"] = (
    world_cup["Losing team Taker"]
    .str.replace("Penalty", "")
    .str.replace("scored", "")
    .str.replace("missed", "")
    .str.strip()
)

In [20]:
# Check that the two new kicker columns were extracted as expected.
world_cup.head()

Unnamed: 0,No.,Penalty Number,Event Year,Winner,Full Time Score,Loser,Winning Team GK,Winning team Taker,Losing team Taker,Losing Team GK,Round,Date,Winner Penalty type,Loser Penalty type,Winning team Kicker,Losing team Kicker
0,1,1,1982,Germany,3–3,France,Schumacher,Kaltz Penalty scored,Penalty scored Giresse,Ettori,Semi-finals,2021-07-08,scored,scored,Kaltz,Giresse
1,1,2,1982,Germany,3–3,France,Schumacher,Breitner Penalty scored,Penalty scored Amoros,Ettori,Semi-finals,2021-07-08,scored,scored,Breitner,Amoros
2,1,3,1982,Germany,3–3,France,Schumacher,Stielike Penalty missed,Penalty scored Rocheteau,Ettori,Semi-finals,2021-07-08,missed,scored,Stielike,Rocheteau
3,1,4,1982,Germany,3–3,France,Schumacher,Littbarski Penalty scored,Penalty missed Six,Ettori,Semi-finals,2021-07-08,scored,missed,Littbarski,Six
4,1,5,1982,Germany,3–3,France,Schumacher,Rummenigge Penalty scored,Penalty scored Platini,Ettori,Semi-finals,2021-07-08,scored,scored,Rummenigge,Platini


In [21]:
# Winning side: text reads "<name> Penalty <outcome>", so keep what precedes
# "Penalty". `.str[0]` replaces `.apply(pd.Series)[0]`, which built a Series
# per row to obtain the same first piece.
euros["Winning team Kicker"] = (
    euros["Winning team Taker"].str.split("Penalty").str[0].str.strip()
)

# Losing side: text reads "Penalty <outcome> <name>", so remove the fixed
# words and trim the remainder.
euros["Losing team Kicker"] = (
    euros["Losing team Taker"]
    .str.replace("Penalty", "")
    .str.replace("scored", "")
    .str.replace("missed", "")
    .str.strip()
)

In [22]:
# Check that the two new kicker columns were extracted as expected.
euros.head()

Unnamed: 0,No.,Penalty Number,Event Year,Winner,Full Time Score,Loser,Winning team GK,Winning team Taker,Losing team Taker,Losing team GK,Round,Date,Winner Penalty type,Loser Penalty type,Winning team Kicker,Losing team Kicker
0,1,1,1976,Czechoslovakia,2–2,Germany,Viktor,Masný Penalty scored,Penalty scored Bonhof,Maier,Final,2021-06-20,scored,scored,Masný,Bonhof
1,1,2,1976,Czechoslovakia,2–2,Germany,Viktor,Nehoda Penalty scored,Penalty scored Flohe,Maier,Final,2021-06-20,scored,scored,Nehoda,Flohe
2,1,3,1976,Czechoslovakia,2–2,Germany,Viktor,Ondruš Penalty scored,Penalty scored Bongartz,Maier,Final,2021-06-20,scored,scored,Ondruš,Bongartz
3,1,4,1976,Czechoslovakia,2–2,Germany,Viktor,Jurkemik Penalty scored,Penalty missed Hoeneß,Maier,Final,2021-06-20,scored,missed,Jurkemik,Hoeneß
4,1,5,1976,Czechoslovakia,2–2,Germany,Viktor,Panenka Penalty scored,Unknown,Maier,Final,2021-06-20,scored,,Panenka,Unknown


#### Null value replacement

In [23]:
# Manual corrections to the winning side's outcome labels.
# Every row won by South Korea is labelled "scored".
world_cup.loc[world_cup["Winner"] == "South Korea", "Winner Penalty type"] = "scored"
# NOTE(review): the row labels below are hard-coded and rely on the frame's
# default integer index — re-verify them if the source sheet changes.
world_cup.loc[[19, 45, 48, 94, 95, 107, 117], "Winner Penalty type"] = "scored"
world_cup.loc[105, "Winner Penalty type"] = "missed"

In [24]:
# Kicker names and outcome labels were extracted above, so drop the raw taker
# text together with the "Date" column from both frames.
world_cup = world_cup.drop(
    columns=["Winning team Taker", "Losing team Taker", "Date"]
)
euros = euros.drop(
    columns=["Winning team Taker", "Losing team Taker", "Date"]
)

In [25]:
# Check the remaining columns and their non-null counts after the drop.
world_cup.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146 entries, 0 to 145
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   No.                  146 non-null    int64 
 1   Penalty Number       146 non-null    int64 
 2   Event Year           146 non-null    int64 
 3   Winner               146 non-null    object
 4   Full Time Score      146 non-null    object
 5   Loser                146 non-null    object
 6   Winning Team GK      146 non-null    object
 7   Losing Team GK       146 non-null    object
 8   Round                146 non-null    object
 9   Winner Penalty type  141 non-null    object
 10  Loser Penalty type   138 non-null    object
 11  Winning team Kicker  146 non-null    object
 12  Losing team Kicker   146 non-null    object
dtypes: int64(3), object(10)
memory usage: 15.0+ KB


In [26]:
# Check the remaining columns and their non-null counts after the drop.
euros.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   No.                  120 non-null    int64 
 1   Penalty Number       120 non-null    int64 
 2   Event Year           120 non-null    int64 
 3   Winner               120 non-null    object
 4   Full Time Score      120 non-null    object
 5   Loser                120 non-null    object
 6   Winning team GK      120 non-null    object
 7   Losing team GK       120 non-null    object
 8   Round                120 non-null    object
 9   Winner Penalty type  119 non-null    object
 10  Loser Penalty type   113 non-null    object
 11  Winning team Kicker  120 non-null    object
 12  Losing team Kicker   120 non-null    object
dtypes: int64(3), object(10)
memory usage: 12.3+ KB


In [27]:
# Display the cleaned World Cup frame (the rendered output is truncated by pandas).
world_cup

Unnamed: 0,No.,Penalty Number,Event Year,Winner,Full Time Score,Loser,Winning Team GK,Losing Team GK,Round,Winner Penalty type,Loser Penalty type,Winning team Kicker,Losing team Kicker
0,1,1,1982,Germany,3–3,France,Schumacher,Ettori,Semi-finals,scored,scored,Kaltz,Giresse
1,1,2,1982,Germany,3–3,France,Schumacher,Ettori,Semi-finals,scored,scored,Breitner,Amoros
2,1,3,1982,Germany,3–3,France,Schumacher,Ettori,Semi-finals,missed,scored,Stielike,Rocheteau
3,1,4,1982,Germany,3–3,France,Schumacher,Ettori,Semi-finals,scored,missed,Littbarski,Six
4,1,5,1982,Germany,3–3,France,Schumacher,Ettori,Semi-finals,scored,scored,Rummenigge,Platini
...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,30,1,2018,Croatia,2–2,Russia,Subašić,Akinfeev,Quarter-finals,scored,missed,Brozović,Smolov
142,30,2,2018,Croatia,2–2,Russia,Subašić,Akinfeev,Quarter-finals,missed,scored,Kovačić,Dzagoev
143,30,3,2018,Croatia,2–2,Russia,Subašić,Akinfeev,Quarter-finals,scored,missed,Modrić,Fernandes
144,30,4,2018,Croatia,2–2,Russia,Subašić,Akinfeev,Quarter-finals,scored,scored,Vida,Ignashevich


In [28]:
# Independent copies of two of the sheets loaded into `output`.
win_pct, score_pct = (output[sheet].copy() for sheet in ("Win %", "Score %"))

In [29]:
# Preview the name/id dicts built from the frame's columns (the same expression
# feeds the DataTable's `columns` argument). The rendered output shows that the
# 'Penalty Number ' column name carries a trailing space.
[{"name": i, "id": i} for i in world_cup.columns]

[{'name': 'No.', 'id': 'No.'},
 {'name': 'Penalty Number ', 'id': 'Penalty Number '},
 {'name': 'Event Year', 'id': 'Event Year'},
 {'name': 'Winner', 'id': 'Winner'},
 {'name': 'Full Time Score', 'id': 'Full Time Score'},
 {'name': 'Loser', 'id': 'Loser'},
 {'name': 'Winning Team GK', 'id': 'Winning Team GK'},
 {'name': 'Losing Team GK', 'id': 'Losing Team GK'},
 {'name': 'Round', 'id': 'Round'},
 {'name': 'Winner Penalty type', 'id': 'Winner Penalty type'},
 {'name': 'Loser Penalty type', 'id': 'Loser Penalty type'},
 {'name': 'Winning team Kicker', 'id': 'Winning team Kicker'},
 {'name': 'Losing team Kicker', 'id': 'Losing team Kicker'}]

In [51]:
# Schema check of the frame that is passed to the Dash DataTable.
world_cup.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146 entries, 0 to 145
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   No.                  146 non-null    int64 
 1   Penalty Number       146 non-null    int64 
 2   Event Year           146 non-null    int64 
 3   Winner               146 non-null    object
 4   Full Time Score      146 non-null    object
 5   Loser                146 non-null    object
 6   Winning Team GK      146 non-null    object
 7   Losing Team GK       146 non-null    object
 8   Round                146 non-null    object
 9   Winner Penalty type  141 non-null    object
 10  Loser Penalty type   138 non-null    object
 11  Winning team Kicker  146 non-null    object
 12  Losing team Kicker   146 non-null    object
dtypes: int64(3), object(10)
memory usage: 15.0+ KB


In [54]:
app = Dash(__name__)
# The whole layout is a single DataTable fed with the cleaned World Cup frame:
# one record dict per row and one name/id dict per column.
app.layout = dash_table.DataTable(
    data=world_cup.to_dict("records"),
    columns=[{"name": col, "id": col} for col in world_cup.columns],
    fixed_rows={"headers": True},
    style_data={"whiteSpace": "normal", "height": "auto"},
    style_cell={"minWidth": 160, "maxWidth": 160, "width": 160, "textAlign": "left"},
    style_as_list_view=True,
)

In [55]:
if __name__ == "__main__":
    # `debug` must be a real boolean: the string "True" only enabled debug mode
    # because any non-empty string is truthy (the string "False" would too).
    # NOTE(review): the reloader is switched off, presumably because it does
    # not cooperate with a notebook kernel — confirm.
    app.run_server(debug=True, use_reloader=False)

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: on


In [52]:
# Display the World Cup frame again (the rendered output is truncated by pandas).
world_cup

Unnamed: 0,No.,Penalty Number,Event Year,Winner,Full Time Score,Loser,Winning Team GK,Losing Team GK,Round,Winner Penalty type,Loser Penalty type,Winning team Kicker,Losing team Kicker
0,1,1,1982,Germany,3–3,France,Schumacher,Ettori,Semi-finals,scored,scored,Kaltz,Giresse
1,1,2,1982,Germany,3–3,France,Schumacher,Ettori,Semi-finals,scored,scored,Breitner,Amoros
2,1,3,1982,Germany,3–3,France,Schumacher,Ettori,Semi-finals,missed,scored,Stielike,Rocheteau
3,1,4,1982,Germany,3–3,France,Schumacher,Ettori,Semi-finals,scored,missed,Littbarski,Six
4,1,5,1982,Germany,3–3,France,Schumacher,Ettori,Semi-finals,scored,scored,Rummenigge,Platini
...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,30,1,2018,Croatia,2–2,Russia,Subašić,Akinfeev,Quarter-finals,scored,missed,Brozović,Smolov
142,30,2,2018,Croatia,2–2,Russia,Subašić,Akinfeev,Quarter-finals,missed,scored,Kovačić,Dzagoev
143,30,3,2018,Croatia,2–2,Russia,Subašić,Akinfeev,Quarter-finals,scored,missed,Modrić,Fernandes
144,30,4,2018,Croatia,2–2,Russia,Subašić,Akinfeev,Quarter-finals,scored,scored,Vida,Ignashevich
