In [37]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import altair as alt

In [38]:
countries_url = 'https://en.wikipedia.org/wiki/2024_Summer_Olympics'

In [39]:
response = requests.get(countries_url)
soup = BeautifulSoup(response.text, 'html.parser')

In [40]:
# Find all tables on the page
tables = soup.find_all('table', {'class': 'wikitable'})

In [41]:
# Filter for the table with the most matching classes
for i, table in enumerate(tables):
    if 'collapsible' in table.get('class', []) and 'collapsed' in table.get('class', []) and 'sortable' in table.get('class', []):
        target_table = table
        break
else:
    print("No matching table found.")
    target_table = None

# Read the table into a pandas dataframe if found
if target_table:
    olympic_teams = pd.read_html(str(target_table), flavor='html5lib')[0]
    print(olympic_teams)
else:
    print("No tables with the specified classes found.")

     Ranking            NOC  Athletes
0          1  United States       592
1          2         France       573
2          3      Australia       460
3          4        Germany       428
4          5          Japan       403
..       ...            ...       ...
201      196         Tuvalu         2
202      203         Belize         1
203      203  Liechtenstein         1
204      203          Nauru         1
205      203        Somalia         1

[206 rows x 3 columns]


In [42]:
incentives_url= 'https://en.wikipedia.org/wiki/Incentives_for_Olympic_medalists_by_country'

In [43]:
# Send a request to fetch the HTML content of the page
response = requests.get(incentives_url)
soup = BeautifulSoup(response.text, 'html.parser')

In [44]:
# Find the first table on the page
table = soup.find('table', {'class': 'wikitable'})

# Check if the table is found and then read it into a pandas dataframe
if table:
    incentives_df = pd.read_html(str(table), flavor='html5lib')[0]
    incentives_df
else:
    print("No table found on the page.")

In [45]:
# align the country column names
olympic_teams.rename(columns={'NOC': 'Country'}, inplace=True)
incentives_df.rename(columns={'Countries': 'Country'}, inplace=True)

In [46]:
olympic_teams = olympic_teams[['Country', 'Athletes']]
incentives_df = incentives_df[['Country', 'Gold', 'Silver', 'Bronze']]

In [47]:
# merge the datasets
olympics = pd.merge(olympic_teams, incentives_df, on='Country', how='left')
olympics

Unnamed: 0,Country,Athletes,Gold,Silver,Bronze
0,United States,592,"$37,500","$22,500","$15,000"
1,France,573,"$86,670","$43,335","$21,667"
2,Australia,460,"$20,000","$15,000","$10,000"
3,Germany,428,"$22,000","$17,000","$11,000"
4,Japan,403,"$45,000","$18,000","$9,000"
...,...,...,...,...,...
201,Tuvalu,2,,,
202,Belize,1,,,
203,Liechtenstein,1,"$27,602","$22,082","$16,561"
204,Nauru,1,,,


In [48]:
olympics.head(15)

Unnamed: 0,Country,Athletes,Gold,Silver,Bronze
0,United States,592,"$37,500","$22,500","$15,000"
1,France,573,"$86,670","$43,335","$21,667"
2,Australia,460,"$20,000","$15,000","$10,000"
3,Germany,428,"$22,000","$17,000","$11,000"
4,Japan,403,"$45,000","$18,000","$9,000"
5,China,388,,,
6,Spain,383,"$111,562","$56,968","$35,605"
7,Italy,371,"$213,000","$107,000","$71,000"
8,Great Britain,327,$0,$0,$0
9,Canada,315,"$16,000","$12,000","$8,000"


In [49]:
countries_order = ['United States', 'France', 'Australia', 'Germany', 'Japan', 'China', 'Spain', 'Italy', 'Great Britain', 'Canada', 'Brazil']

In [51]:
olympics_top10 = olympics.head(10)
#olympics_top10.to_csv('olympics_top10.csv', index=False)

In [52]:
# Convert the 'Gold', 'Silver', and 'Bronze' columns to numeric by removing the dollar signs and commas
olympics_top10['Gold'] = olympics_top10['Gold'].replace('[\$,]', '', regex=True).astype(float)
olympics_top10['Silver'] = olympics_top10['Silver'].replace('[\$,]', '', regex=True).astype(float)
olympics_top10['Bronze'] = olympics_top10['Bronze'].replace('[\$,]', '', regex=True).astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  olympics_top10['Gold'] = olympics_top10['Gold'].replace('[\$,]', '', regex=True).astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  olympics_top10['Silver'] = olympics_top10['Silver'].replace('[\$,]', '', regex=True).astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  olympics_to

In [53]:
# Melt the DataFrame to long format
olympics_top10_long = olympics_top10.melt(id_vars=['Country'], value_vars=['Gold', 'Silver', 'Bronze'], 
                                          var_name='Medal', value_name='Compensation')

In [54]:
# Define the colors for the medals
medal_colors = {
    'Gold': '#D6AF36',
    'Silver': '#D7D7D7',
    'Bronze': '#A77044'
}

In [55]:
olympics_top10_long

Unnamed: 0,Country,Medal,Compensation
0,United States,Gold,37500.0
1,France,Gold,86670.0
2,Australia,Gold,20000.0
3,Germany,Gold,22000.0
4,Japan,Gold,45000.0
5,China,Gold,
6,Spain,Gold,111562.0
7,Italy,Gold,213000.0
8,Great Britain,Gold,0.0
9,Canada,Gold,16000.0


In [56]:
countries_order = ['United States', 'France']

In [57]:
# themes

pallete = {
    "shadow": "rgba(24, 42, 56, 0.1)",
    "nominal_1": "#179fdb",
    "nominal_2": "#e6224b",
    "nominal_3": "#f4c245",
    "nominal_4": "#122b39",
    "nominal_5": "#eb5c2e",
    "nominal_6": "#36b7b4",
    "Other_3":"#d6d4d4",
    #"Deemphasize_Discrete": "rgb(173, 195, 215)",
    "Deemphasize_Discrete": "rgba(24, 42, 56)",
    "Deemphasize_Other": "rgba(93, 115, 102, 1)",
    "Deemphasize_Continuous": "rgba(24, 42, 56, 0.2)",
    "accent": "#179fdb",
    "domain": "#122b39",
    "bar": {
        "accent_1": "#122b39",
        "other" : "rgb(168, 192, 222)",
        "accent_2" : "#eb5c2e"
    }
}

pallete["GBR"] = pallete["nominal_1"]
pallete["USA"] = pallete["nominal_2"]
pallete["DEU"] = pallete["nominal_3"]
pallete["FRA"] = pallete["nominal_4"]
pallete["Other_1"] = pallete["nominal_5"]
pallete["Other_2"] = pallete["nominal_6"]

pallete["UK"] = pallete["GBR"]
pallete["United Kingdom"] = pallete["GBR"]
pallete["United States"] = pallete["USA"]
pallete["Germany"] = pallete["DEU"]
pallete["France"] = pallete["FRA"]

def report():
  return {"config": {
    "font": "Circular Std",
    "mark" : {
      "line": {
        "interpolate": "linear",
      }
    },
    "view" : {
      "stroke": "transparent",
      "width": 400,
      "height": 300
    },
    "range": {
      "category" : [
        pallete["nominal_1"],
        pallete["nominal_2"],
        pallete["nominal_3"],
        pallete["nominal_4"],
        pallete["nominal_5"],
        pallete["nominal_6"],
      ],
      "diverging" : ["#E6224B","#E54753","#C9C9C9","#179FDB","#122B39"],
      "heatmap" : ["#C9C9C9","#179FDB","#0063AF","#122B39"],
      "ordinal" : ["#00A767","#36B7B4","#179FDB","#0063AF","#243B5A"]
    },
    "axisX": {
      "labelColor": pallete["domain"],
      "tickColor": pallete["domain"],
      "domainColor": pallete["domain"],
      "domainOpacity": 0.5,
      "gridOpacity": 0,
      "labelFont": "Circular Std",
      "labelAngle": 0,
      "labelAlign": "center",
      "labelFontSize": 11,
      "labelPadding": 5,
      "tickCount": 10,
      "tickSize": 0,
      "title": ""
    },
    "axisY": {
      "labelColor": pallete["domain"],
      "labelFont": "Circular Std",
      "tickColor": pallete["domain"],
      "domainColor": pallete["domain"],
      "gridColor": pallete["domain"],
      "gridDash": [
        1,
        5
      ],
      "gridOpacity": 0.5,
      "labelPadding": 5,
      "labelFontSize": 11,
      "domainOpacity": 0.5,
      "tickSize": 0,
      "title": None,
      "titleAlign": "left",
      "titleAngle": 0,
      "titleBaseline": "bottom",
      "titleColor": pallete["domain"],
      "titleOpacity": 0.9,
      "titleX": 0,
      "titleY": -7
    }
  }}

alt.themes.register('report', report)        

def light():
  return {"config": {
    "font": "Circular Std",
    "mark" : {
      "line": {
        "interpolate": "monotone",
      }
    },
    "view" : {
      "stroke": "transparent",
      "width": 400,
      "height": 300
    },
    "range": {
      "category" : ["#36B7B4","#E6224B","#F4C245","#0063AF","#00A767","#179FDB","#EB5C2E"],
      "diverging" : ["#E6224B","#E54753","#C9C9C9","#179FDB","#122B39"],
      "heatmap" : ["#C9C9C9","#179FDB","#0063AF","#122B39"],
      "ordinal" : ["#00A767","#36B7B4","#179FDB","#0063AF","#243B5A"]
    },
    "axisX": {
      "domainColor": "#676A86",
      "domainOpacity": 0.5,
      "grid": False,
      "labelAngle": 0,
      "labelColor": "#676A86",
      "labelOpacity": 0.7,
      "orient": "bottom",
      "tickColor": "#676A86",
      "tickCount": 10,
      "tickOpacity": 0.5,
      "title" : False,
      "titleAlign": "center",
      "titleAnchor": "middle",
      "titleColor": "#676A86",
      "titleFontSize": 12,
      "titleOpacity": 0.8,
      "titleY": -15
    },
    "axisY": {
      "domainColor": "#676A86",
      "domainOpacity": 0.5,
      "gridColor": "#676A86",
      "gridDash": [1, 5],
      "gridOpacity": 0.5,
      "labelColor": "#676A86",
      "labelOpacity": 0.7,
      "labelPadding": 5,
      "tickColor": "#676A86",
      "tickCount": 8,
      "tickOpacity": 0.5,
      "ticks": False,
      "titleAlign": "left",
      "titleAngle": 0,
      "titleBaseline": "bottom",
      "titleColor": "#676A86",
      "titleFontSize": 12,
      "titleOpacity": 0.8,
      "titleX": 0,
      "titleY": -7
    }
  }}

alt.themes.register('light', light)

def dark():
    return {"config": {
        "background": "#122B39",
        "font": "Circular Std",
        "title": {
            "color": "#b4c8d8",
            "fontSize": 14,
            "fontWeight": 400,
        },
        "mark": {
            "line": {
                "interpolate": "monotone",
            }
        },
        "view": {
            "stroke": "transparent",
            "width": 400,
            "height": 300
        },
        "range": {
            "category": ["#36B7B4", "#E6224B", "#F4C245", "#0063AF", "#00A767", "#179FDB", "#EB5C2E"],
            "diverging": ["#E6224B", "#E54753", "#C9C9C9", "#179FDB", "#122B39"],
            "heatmap": ["#C9C9C9", "#179FDB", "#0063AF", "#122B39"],
            "ordinal": ["#00A767", "#36B7B4", "#179FDB", "#0063AF", "#243B5A"]
        },
        "axisX": {
            "domainColor": "#b4c8d8",
            "domainOpacity": 0.5,
            "grid": False,
            "labelAngle": 0,
            "labelColor": "#b4c8d8",
                "labelOpacity": 0.7,
                "orient": "bottom",
                "tickColor": "#b4c8d8",
                "tickCount": 10,
                "tickOpacity": 0.5,
                "title": "",
                "titleAlign": "center",
                "titleAnchor": "middle",
                "titleColor": "#b4c8d8",
                "titleFontSize": 12,
                "titleOpacity": 0.8,
                "titleX": 207,
                "titleY": -15
        },
        "axisY": {
            "domainColor": "#b4c8d8",
            "domainOpacity": 0.5,
            "format": ".0f",
            "gridColor": "#b4c8d8",
            "gridDash": [
                1,
                5
            ],
            "gridOpacity": 0.5,
            "labelColor": "#b4c8d8",
            "labelOpacity": 0.7,
            "labelPadding": 5,
            "tickColor": "#b4c8d8",
            "tickCount": 8,
            "tickOpacity": 0.5,
            "ticks": False,
            "title": "FAO price index",
            "titleAlign": "left",
            "titleAngle": 0,
            "titleBaseline": "bottom",
            "titleColor": "#b4c8d8",
            "titleFontSize": 12,
            "titleOpacity": 0.8,
            "titleX": 0,
            "titleY": -7

        }
    }}


alt.themes.register('dark', dark)

<function __main__.dark()>

In [58]:
alt.themes.enable('light')

medal_comp_chart = alt.Chart(olympics_top10_long, 
                             title = alt.Title('Medal Compensation',
                                               subtitle = ['Top 10 countries by 2024 Olympic roster size','Compensation in 2021 USD'],
                                               anchor = 'start',
                                               offset = 15)
                             ).mark_bar().encode(
    x = alt.X('Country',
              title='Country',
              sort=countries_order,
              axis=alt.Axis(title=None,
                            labelAngle=-45)),
    xOffset = alt.X('Medal', 
                    sort=alt.EncodingSortField(field='Compensation')),
    y = alt.Y('Compensation', 
              title=None,
              axis=alt.Axis(
                        format='$,.0f',
                        titleAngle=90,
                        orient='left'
        )),
    color = alt.Color('Medal',
                      scale=alt.Scale(domain=list(medal_colors.keys()),
                                      range=list(medal_colors.values())),
                                      legend=alt.Legend(orient='top',
                                                        direction='horizontal',
                                                        title=None)),
    tooltip = [
        alt.Tooltip('Country', title='Country'),
        alt.Tooltip('Medal', title='Medal'),
        alt.Tooltip('Compensation', title='Compensation', format='$,.0f')
    ]
                                      ).properties(width=600)

medal_comp_chart


In [59]:
# output json and png
medal_comp_chart.save('olympic_medal_comp_fig1.json')
medal_comp_chart.save('olympic_medal_comp_fig1.png')

In [60]:
olympics

Unnamed: 0,Country,Athletes,Gold,Silver,Bronze
0,United States,592,"$37,500","$22,500","$15,000"
1,France,573,"$86,670","$43,335","$21,667"
2,Australia,460,"$20,000","$15,000","$10,000"
3,Germany,428,"$22,000","$17,000","$11,000"
4,Japan,403,"$45,000","$18,000","$9,000"
...,...,...,...,...,...
201,Tuvalu,2,,,
202,Belize,1,,,
203,Liechtenstein,1,"$27,602","$22,082","$16,561"
204,Nauru,1,,,


In [61]:
olympics.dtypes

Country     object
Athletes     int64
Gold        object
Silver      object
Bronze      object
dtype: object

In [62]:
olympics['Gold'] = olympics['Gold'].replace('[\$,]', '', regex=True).astype(float)
olympics['Silver'] = olympics['Silver'].replace('[\$,]', '', regex=True).astype(float)
olympics['Bronze'] = olympics['Bronze'].replace('[\$,]', '', regex=True).astype(float)
olympics

Unnamed: 0,Country,Athletes,Gold,Silver,Bronze
0,United States,592,37500.0,22500.0,15000.0
1,France,573,86670.0,43335.0,21667.0
2,Australia,460,20000.0,15000.0,10000.0
3,Germany,428,22000.0,17000.0,11000.0
4,Japan,403,45000.0,18000.0,9000.0
...,...,...,...,...,...
201,Tuvalu,2,,,
202,Belize,1,,,
203,Liechtenstein,1,27602.0,22082.0,16561.0
204,Nauru,1,,,


In [63]:
# sort by gold
olympics_topcomp = olympics.sort_values(by='Gold', ascending=False).head(10)
olympics_topcomp

Unnamed: 0,Country,Athletes,Gold,Silver,Bronze
59,Hong Kong,36,769558.0,384279.0,192139.0
82,Singapore,23,737000.0,369000.0,184000.0
44,Chinese Taipei,60,720000.0,251000.0,178000.0
50,Thailand,51,365150.0,219090.0,146060.0
65,Indonesia,29,346000.0,138500.0,69250.0
38,Kazakhstan,79,250000.0,150000.0,75000.0
72,Malaysia,26,236000.0,71000.0,24000.0
53,Azerbaijan,48,235000.0,117500.0,58750.0
45,Morocco,60,225067.0,140667.0,84400.0
7,Italy,371,213000.0,107000.0,71000.0


In [64]:
# remane all values called 'Chinese Taipei' to 'Taiwan'
olympics_topcomp['Country'] = olympics_topcomp['Country'].replace('Chinese Taipei', 'Taiwan')

In [65]:
comp_order = ['Hong Kong', 'Singapore','Taiwan', 'Thailand', 'Indonesia', 'Kazakhstan','Malaysia', 'Azerbaijan', 'Morocco', 'Italy']

In [66]:
olympics_topcomp_long = olympics_topcomp.melt(id_vars=['Country'], value_vars=['Gold', 'Silver', 'Bronze'], 
                                          var_name='Medal', value_name='Compensation')

In [67]:
alt.themes.enable('light')

top_comp_chart = alt.Chart(olympics_topcomp_long, 
                             title = alt.Title('Medal Compensation',
                                               subtitle = ['Top 10 countries by Gold Medal Compensation','Compensation in 2021 USD'],
                                               anchor = 'start',
                                               offset = 15)
                             ).mark_bar().encode(
    x = alt.X('Country',
              title='Country',
              sort=comp_order,
              axis=alt.Axis(title=None,
                            labelAngle=-45)),
    xOffset = alt.X('Medal', 
                    sort=alt.EncodingSortField(field='Compensation')),
    y = alt.Y('Compensation', 
              title=None,
              axis=alt.Axis(
                        format='$,.0f',
                        titleAngle=90,
                        orient='left'
        )),
    color = alt.Color('Medal',
                      scale=alt.Scale(domain=list(medal_colors.keys()),
                                      range=list(medal_colors.values())),
                                      legend=alt.Legend(orient='top',
                                                        direction='horizontal',
                                                        title=None)),
    tooltip = [
        alt.Tooltip('Country', title='Country'),
        alt.Tooltip('Medal', title='Medal'),
        alt.Tooltip('Compensation', title='Compensation', format='$,.0f')
    ]
                                      ).properties(width=600)

top_comp_chart

In [68]:
# output json and png
top_comp_chart.save('olympic_medal_comp_fig2.json')
top_comp_chart.save('olympic_medal_comp_fig2.png')