## Loading Data

In [1]:
import utils
import pandas as pd
import altair as alt

# Lift Altair's row-count limit so charts can be built from large DataFrames.
# WARNING: with the limit disabled, the notebook becomes extremely large as
# the size of the plotted DataFrames grows.
alt.data_transformers.disable_max_rows()
# Alternative (currently disabled) that reduces the notebook size:
# alt.data_transformers.enable('json')

DataTransformerRegistry.enable('default')

In [2]:
# Load every feather file through the project helper in one pass; the target
# names on the left line up one-to-one with the file names on the right.
(
    players,
    player_attrs,
    teams,
    team_attrs,
    countries,
    leagues,
    matches,
) = map(
    utils.load_df,
    (
        'players.feather',
        'player_attrs.feather',
        'teams.feather',
        'team_attrs.feather',
        'countries.feather',
        'leagues.feather',
        'matches.feather',
    ),
)

In [3]:
# Preview the first five rows of the players table.
players.head(n=5)

Unnamed: 0,id,player_api_id,player_name,birthday,height,weight
0,1,505942,Aaron Appindangoye,1992-02-29 00:00:00,182.88,187
1,2,155782,Aaron Cresswell,1989-12-15 00:00:00,170.18,146
2,3,162549,Aaron Doran,1991-05-13 00:00:00,170.18,163
3,4,30572,Aaron Galindo,1982-05-08 00:00:00,182.88,198
4,5,23780,Aaron Hughes,1979-11-08 00:00:00,182.88,154


In [4]:
# Preview the first five rows of the per-player attribute table.
player_attrs.head(n=5)


Unnamed: 0,id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,finishing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,1,505942,2016-02-18 00:00:00,67.0,71.0,right,medium,medium,49.0,44.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,2,505942,2015-11-19 00:00:00,67.0,71.0,right,medium,medium,49.0,44.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,3,505942,2015-09-21 00:00:00,62.0,66.0,right,medium,medium,49.0,44.0,...,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0
3,4,505942,2015-03-20 00:00:00,61.0,65.0,right,medium,medium,48.0,43.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
4,5,505942,2007-02-22 00:00:00,61.0,65.0,right,medium,medium,48.0,43.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0


## Top X Players By Rating

In [5]:
X = 10  # number of players shown at each end of the ranking

# Mean overall rating and potential per player across all of their attribute rows.
player_avg_ratings = player_attrs.groupby('player_api_id')[['overall_rating', 'potential']].mean()
sorted_player_avg_ratings = player_avg_ratings.sort_values('overall_rating', ascending=False)

# head()/tail() instead of [:X] / [-X:]: the slice [-0:] returns every row when X == 0.
top_players_attrs = sorted_player_avg_ratings.head(X)
# sort_values() places NaN last, so drop players without any rating before
# taking the tail; otherwise they would be reported as the "worst" players.
worst_players_attrs = sorted_player_avg_ratings.dropna(subset=['overall_rating']).tail(X)

# Attach player names; player_api_id is the index level on the left-hand side
# and a regular column in `players`.
top_players = top_players_attrs.merge(players, on='player_api_id')
worst_players = worst_players_attrs.merge(players, on='player_api_id')


def _player_bar_chart(data: pd.DataFrame, metric: str, title: str, **mark_kwargs) -> alt.Chart:
    """Build a horizontal bar chart of `metric` per player name.

    Bars are ordered by descending `metric` (sort='-x'). Extra keyword
    arguments are forwarded to `mark_bar` (e.g. `color`).
    """
    return alt.Chart(data, title=title).mark_bar(**mark_kwargs).encode(
        y=alt.Y('player_name', sort='-x'),
        x=alt.X(metric),
    )


# Both player sets are ranked by average overall rating; the titles of the
# potential charts say so explicitly instead of claiming a ranking by potential.
rating_chart = _player_bar_chart(
    top_players, 'overall_rating', f'Top {X} Players By Rating'
)
potential_chart = _player_bar_chart(
    top_players, 'potential', f'Potential of Top {X} Players By Rating'
)
worst_chart = _player_bar_chart(
    worst_players, 'potential', f'Potential of Worst {X} Players By Rating', color='firebrick'
)

# Top charts side by side, worst players underneath.
chart = (rating_chart | potential_chart) & worst_chart

chart

## Heatmap

Unnamed: 0,id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,finishing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,1,505942,2016-02-18 00:00:00,67.0,71.0,right,medium,medium,49.0,44.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,2,505942,2015-11-19 00:00:00,67.0,71.0,right,medium,medium,49.0,44.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,3,505942,2015-09-21 00:00:00,62.0,66.0,right,medium,medium,49.0,44.0,...,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0
3,4,505942,2015-03-20 00:00:00,61.0,65.0,right,medium,medium,48.0,43.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
4,5,505942,2007-02-22 00:00:00,61.0,65.0,right,medium,medium,48.0,43.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183973,183974,39902,2009-08-30 00:00:00,83.0,85.0,right,medium,low,84.0,77.0,...,88.0,83.0,22.0,31.0,30.0,9.0,20.0,84.0,20.0,20.0
183974,183975,39902,2009-02-22 00:00:00,78.0,80.0,right,medium,low,74.0,76.0,...,88.0,70.0,32.0,31.0,30.0,9.0,20.0,73.0,20.0,20.0
183975,183976,39902,2008-08-30 00:00:00,77.0,80.0,right,medium,low,74.0,71.0,...,88.0,70.0,32.0,31.0,30.0,9.0,20.0,73.0,20.0,20.0
183976,183977,39902,2007-08-30 00:00:00,78.0,81.0,right,medium,low,74.0,64.0,...,88.0,53.0,28.0,32.0,30.0,9.0,20.0,73.0,20.0,20.0


In [39]:
def alt_corr_plot(df: pd.DataFrame, *, box_size=100, annot_size=30) -> alt.LayerChart:
    """Draw an annotated correlation heatmap for the numeric columns of `df`.

    Args:
        df: Input frame. Non-numeric columns are ignored when computing the
            correlation matrix.
        box_size: Step size in pixels of each heatmap cell (width and height).
        annot_size: Font size of the correlation value printed in each cell.

    Returns:
        A layered chart: coloured rectangles with the correlation value
        (two decimals) drawn on top.
    """
    # numeric_only=True: since pandas 2.0, corr() raises on text columns
    # instead of silently dropping them.
    # rename_axis pins the id column name, so the reshape does not depend on
    # the column axis being unnamed or on no column being called 'index'.
    corr_long = (
        df.corr(numeric_only=True)
        .rename_axis(index='var1', columns=None)
        .reset_index()
        .melt(id_vars='var1', var_name='var2', value_name='correlation')
    )

    # Keep each variable pair once and drop the diagonal: only rows where
    # var1 sorts before var2 survive the filter.
    base = alt.Chart(corr_long).transform_filter(
        alt.datum.var1 < alt.datum.var2
    ).encode(
        x=alt.X('var1', title=''),
        y=alt.Y('var2', title=''),
    ).properties(
        width=alt.Step(box_size),
        height=alt.Step(box_size),
    )

    rects = base.mark_rect().encode(
        color='correlation'
    )

    # White text on strongly correlated cells, black text elsewhere.
    text = base.mark_text(
        size=annot_size
    ).encode(
        text=alt.Text('correlation', format=".2f"),
        color=alt.condition(
            "datum.correlation > 0.5",
            alt.value('white'),
            alt.value('black')
        )
    )

    return rects + text

# Correlate only numeric attribute columns: DataFrame.corr() rejects text
# columns on pandas >= 2.0, and the row/player identifiers carry no meaningful
# correlation with the attributes.
numeric_player_attrs = (
    player_attrs
    .select_dtypes(include='number')
    .drop(columns=['id', 'player_api_id'], errors='ignore')
)
alt_corr_plot(numeric_player_attrs, box_size=50, annot_size=10)