In [1]:
import sqlite3
import pandas as pd
import numpy as np

In [2]:
# Read the Country, Match, League and Team tables into DataFrames.
# NOTE: `with sqlite3.connect(...)` only wraps a transaction (commit/rollback
# on exit); it does not close the connection. Close it explicitly so the
# database handle is released even if one of the reads fails.
con = sqlite3.connect('../data/database.sqlite')
try:
    countries = pd.read_sql_query("SELECT * from Country", con)
    matches = pd.read_sql_query("SELECT * from Match", con)
    leagues = pd.read_sql_query("SELECT * from League", con)
    teams = pd.read_sql_query("SELECT * from Team", con)
finally:
    con.close()

In [3]:
# Use the `id` column as the row index; set_index removes the column from the
# frame by default, so `drop` does not need to be spelled out.
matches = matches.set_index('id')

In [4]:
# Trim the frame to its first ten columns and take a copy, so that later
# column assignments act on an independent frame rather than on a slice.
first_ten_columns = slice(None, 10)
matches = matches.iloc[:, first_ten_columns].copy()

In [5]:
# Preview the first five rows of the trimmed frame.
matches.head(n=5)

Unnamed: 0_level_0,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1,1,2008/2009,1,2008-08-17 00:00:00,492473,9987,9993,1,1
2,1,1,2008/2009,1,2008-08-16 00:00:00,492474,10000,9994,0,0
3,1,1,2008/2009,1,2008-08-16 00:00:00,492475,9984,8635,0,3
4,1,1,2008/2009,1,2008-08-17 00:00:00,492476,9991,9998,5,0
5,1,1,2008/2009,1,2008-08-16 00:00:00,492477,7947,9985,1,3


## Define a field for the type of outcome (home win, away win, or draw)

In [6]:
# Boolean masks, in priority order, paired one-to-one with the labels below:
# home side scored more -> 'hw', away side scored more -> 'aw'.
home_goals = matches['home_team_goal']
away_goals = matches['away_team_goal']
conditions = [home_goals > away_goals, home_goals < away_goals]
choices = ['hw', 'aw']

In [7]:
# Label every match: the first condition that holds picks the label, and rows
# matching neither condition receive the default label.
matches['outcome'] = np.select(condlist=conditions, choicelist=choices, default='draw')

In [69]:
# Share of each outcome label across all matches in the frame; the per-league
# test further down scales these shares into expected counts.
population = matches['outcome'].value_counts(normalize=True)
population

hw      0.458717
aw      0.287386
draw    0.253897
Name: outcome, dtype: float64

## Can we reject the null hypothesis (a league's outcome distribution matches the overall distribution) for any of the leagues?

In [65]:
from scipy.stats import chisquare

In [83]:
# Chi-square goodness-of-fit test per league: do the league's outcome counts
# differ from what the overall outcome shares in `population` would predict?
# NOTE(review): this rebinds `leagues` (until now the League table) to an
# array of league ids, ordered by number of matches.
leagues = matches.league_id.value_counts().index.values
for league_id in leagues:
    df = matches[matches['league_id'] == league_id]
    # value_counts() sorts by frequency, so the category order within a league
    # can differ from the order in `population` (e.g. more draws than away
    # wins). Reindex onto population's labels so observed and expected counts
    # line up category by category; a label absent from the league becomes 0
    # instead of silently shortening the array.
    sample = df.outcome.value_counts(normalize=False).reindex(population.index, fill_value=0)
    obs = np.array(sample)
    # Expected counts: overall shares scaled to this league's number of matches.
    exp = np.array(population) * sample.sum()
    # Look the country up through the matches' own country_id rather than
    # assuming that a league id is also a valid country id.
    c = countries[countries['id'] == df['country_id'].iloc[0]]
    print(c,sample.sum())
    print(chisquare(f_obs = obs,f_exp = exp))

      id   name
9  21518  Spain 3040
Power_divergenceResult(statistic=12.424892849039185, pvalue=0.0020043280240081272)
     id    name
2  4769  France 3040
Power_divergenceResult(statistic=4.4081315135519237, pvalue=0.11035357423958711)
     id     name
1  1729  England 3040
Power_divergenceResult(statistic=0.22630932676639975, pvalue=0.89301253309508621)
      id   name
4  10257  Italy 3017
Power_divergenceResult(statistic=4.8032902233189549, pvalue=0.090568834818630733)
      id         name
5  13274  Netherlands 2448
Power_divergenceResult(statistic=4.7817556311793599, pvalue=0.091549285205286338)
     id     name
3  7809  Germany 2448
Power_divergenceResult(statistic=3.5242717574016922, pvalue=0.17167778901600084)
      id      name
7  17642  Portugal 2052
Power_divergenceResult(statistic=2.221800993732157, pvalue=0.32926232784224591)
      id    name
6  15722  Poland 1920
Power_divergenceResult(statistic=4.3180425077125486, pvalue=0.11543805031108348)
      id      name
8  19694 