In [1]:
#import all important libraries 
import numpy as np 
import pandas as pd 
from datetime import datetime
from scipy import stats

In [2]:
#import main dataset containing all match data
data = pd.read_csv(r"C:\Users\joram\Documents\all_match_results.csv")

In [3]:
#look at dataset 
data.head()

Unnamed: 0,Date,HomeTeam,Result,AwayTeam
0,13-Aug-2021,Brentford,2:0,Arsenal
1,14-Aug-2021,Burnley,1:2,Brighton and Hove Albion
2,14-Aug-2021,Chelsea,3:0,Crystal Palace
3,14-Aug-2021,Everton,3:1,Southampton
4,14-Aug-2021,Leicester City,1:0,Wolverhampton Wanderers


In [4]:
#transform dates into datetime objects
for i in range(len(data['Date'])):
    data["Date"][i] = datetime.strptime(data["Date"][i], "%d-%b-%Y").date()

In [5]:
#look at new dataset 
data.head()

Unnamed: 0,Date,HomeTeam,Result,AwayTeam
0,2021-08-13,Brentford,2:0,Arsenal
1,2021-08-14,Burnley,1:2,Brighton and Hove Albion
2,2021-08-14,Chelsea,3:0,Crystal Palace
3,2021-08-14,Everton,3:1,Southampton
4,2021-08-14,Leicester City,1:0,Wolverhampton Wanderers


In [6]:
#split results for comparisons
for i in range(len(data['Result'])):
    data["Result"][i] = data["Result"][i].split(':')

#look at new dataset 
data.head()

Unnamed: 0,Date,HomeTeam,Result,AwayTeam
0,2021-08-13,Brentford,"[2, 0]",Arsenal
1,2021-08-14,Burnley,"[1, 2]",Brighton and Hove Albion
2,2021-08-14,Chelsea,"[3, 0]",Crystal Palace
3,2021-08-14,Everton,"[3, 1]",Southampton
4,2021-08-14,Leicester City,"[1, 0]",Wolverhampton Wanderers


In [7]:
#add an outcome comlume that indicates who won the game:
#1 stands for home team won
#2 stands for away team won
#0 stands for a draw
outcome = []
for i in range(len(data['Result'])):
    if int(data["Result"][i][0]) > int(data["Result"][i][1]):
        outcome.append(1)
    elif int(data["Result"][i][0]) < int(data["Result"][i][1]):
        outcome.append(2)
    else:
        outcome.append(0)
        
data["outcome"] = outcome

In [8]:
#creating the coach data. Each row shows the team and when they hired the new coach 
#lines that are commented are not taken into consideratoins because one of the coaches did not play enough games

coachData = [
    #['Burnley', datetime(2022, 4, 15)],
    ['Leeds United', datetime(2022, 2, 27)],
    ['Everton', datetime(2022, 1, 30)],
    ['Watford', datetime(2022, 1, 24)],
    #['Everton', datetime(2022, 1, 16)],
    ['Manchester United', datetime(2021, 12, 3)],
    #['Man Utd', datetime(2021, 11, 21)],
    ['Aston Villa', datetime(2021, 11, 7)],
    ['Newcastle United', datetime(2021, 11, 7)],
    ['Norwich City', datetime(2021, 11, 6)],
    ['Tottenham Hotspur', datetime(2021, 11, 1)],
    #['Newcastle', datetime(2021, 10, 20)],
    ['Watford', datetime(2021, 10, 3)]
]

In [9]:
def getClubData(club, date, data):
    """
    This function takes a club, a date and dataset as input and 
    outputs the list of results before and after the new coach was appointed.
    """
    
    #list to store the results before the coach change
    resultsbef = []
    #list to store the results after the coach change
    resultsaft = []
    
    #go over all outcomes
    for i in range(len(data['outcome'])):
        #check if game was with old or new coach
        if data['Date'][i] <= date:
            #check if the team participated as home team
            if data['HomeTeam'][i] == club:
                #check who one the game and add the number of points the team won to the list 
                if data["outcome"][i] == 1:
                    resultsbef.append(3)
                elif data["outcome"][i] == 0:
                    resultsbef.append(1)
                else:
                    resultsbef.append(0)
                    
            #check if the team participated as away team
            elif data['AwayTeam'][i] == club:
                #check who one the game and add the number of points the team won to the list 
                if data["outcome"][i] == 1:
                    resultsbef.append(0)
                elif data["outcome"][i] == 0:
                    resultsbef.append(1)
                else:
                    resultsbef.append(3)
        else:
            #check if the team participated as home team
            if data['HomeTeam'][i] == club:
                #check who one the game and add the number of points the team won to the list 
                if data["outcome"][i] == 1:
                    resultsaft.append(3)
                elif data["outcome"][i] == 0:
                    resultsaft.append(1)
                else:
                    resultsaft.append(0)
            #check if the team participated as away team
            elif data['AwayTeam'][i] == club:
                #check who one the game and add the number of points the team won to the list 
                if data["outcome"][i] == 1:
                    resultsaft.append(0)
                elif data["outcome"][i] == 0:
                    resultsaft.append(1)
                else:
                    resultsaft.append(3)
    #return the results 
    return resultsbef, resultsaft

In [10]:
#this list stores the average points of the sample for differnt teams and numner of games before and after a coach change
averagePointsSample = []

#go through all coaching changes
for i in coachData:
    
    #store the old results in a and the new results in b using our function created above
    a, b = getClubData(i[0], i[1].date(), data)
    
    #for each sample add a new row to the list
    averagePointsSample.append([])
    
    #append the average points per game before the coach change
    averagePointsSample[-1].append(sum(a)/len(a))
    
    #append the average points per game for 1 game before the coach change if possible and -1 otherwise
    if len([a[-1]]) == 1:
        averagePointsSample[-1].append(a[-1])
    else:
        averagePointsSample[-1].append(-1)
        
    #append the average points per game for 3 game before the coach change if possible and -1 otherwise
    if len(a[-3:]) == 3:
        averagePointsSample[-1].append(sum(a[-3:])/3)
    else:
        averagePointsSample[-1].append(-1)
    
    #append the average points per game for 5 game before the coach change if possible and -1 otherwise
    if len(a[-5:]) == 5:
        averagePointsSample[-1].append(sum(a[-5:])/5)
    else:
        averagePointsSample[-1].append(-1)
        
    #append the average points per game for 10 game before the coach change if possible and -1 otherwise
    if len(a[-10:]) == 10:
        averagePointsSample[-1].append(sum(a[-10:])/10)
    else:
        averagePointsSample[-1].append(-1)

    #append the average points per game after the coach change
    averagePointsSample[-1].append((sum(b)/len(b)))
    
    #append the average points per game for 1 game after the coach change if possible and -1 otherwise
    if len(b[:1]) == 1:
        averagePointsSample[-1].append(sum(b[:1]))
    else:
        averagePointsSample[-1].append(-1)
        
    #append the average points per game for 3 game after the coach change if possible and -1 otherwise
    if len(b[:3]) == 3:
        averagePointsSample[-1].append(sum(b[:3])/3)
    else:
        averagePointsSample[-1].append(-1)
    
    #append the average points per game for 5 game after the coach change if possible and -1 otherwise
    if len(b[:5]) == 5:
        averagePointsSample[-1].append(sum(b[:5])/5)
    else:
        averagePointsSample[-1].append(-1)
        
    #append the average points per game for 10 game after the coach change if possible and -1 otherwise
    if len(b[:10]) == 10:
        averagePointsSample[-1].append(sum(b[:10])/10)
    else:
        averagePointsSample[-1].append(-1)

        
#print the new data we created      
for i in averagePointsSample:
    print(i)


[0.8846153846153846, 0, 0.0, 0.2, 0.7, 1.25, 0, 1.0, 1.4, 1.1]
[0.95, 0, 0.0, 0.2, 0.5, 1.1111111111111112, 0, 1.0, 0.6, 0.9]
[0.7, 0, 0.3333333333333333, 0.2, 0.4, 0.5, 1, 0.3333333333333333, 0.8, 0.8]
[1.5, 3, 1.3333333333333333, 1.4, 1.1, 1.5416666666666667, 3, 2.3333333333333335, 2.0, 1.9]
[0.9090909090909091, 0, 0.0, 0.0, 1.0, 1.2962962962962963, 3, 2.0, 1.8, 1.6]
[0.45454545454545453, 1, 0.6666666666666666, 0.4, 0.5, 1.6296296296296295, 1, 0.6666666666666666, 1.0, 1.0]
[0.45454545454545453, 3, 1.0, 1.0, 0.5, 0.6296296296296297, 3, 1.6666666666666667, 1.0, 0.8]
[1.5, 0, 1.0, 1.2, 1.5, 2.0, 1, 2.3333333333333335, 2.2, 2.1]
[1.0, 0, 1.3333333333333333, 0.8, -1, 0.5161290322580645, 0, 1.0, 1.2, 0.6]


In [11]:
#creata list to store all clubs
clubs = []

#add all clubs that are not in the sample (no coaching changes)
for i in data['HomeTeam']:
    if i not in clubs:
        if i not in [u[0] for u in coachData]:
            clubs.append(i)
        
#variable to store all results of the clubs wihtout coaching changes
clubseason = []

#get all results using our function from above with a date after the season
for i in clubs:
    tt1, tt2 = getClubData(i, datetime(2023, 4, 15).date() , data)
    clubseason.append(tt1)
    

#get a list to store control teams
control = []

#go over all teams in the sample
for i in averagePointsSample:
    #store their point average after 1, 3, 5, and 10 games
    one = i[1]
    three = i[2]
    five = i[3]
    ten = i[4]
    
    #set the error to infinity
    err = float("inf")
    #initialize the storage
    store = [[0,0]]
    
    #go over all clubs seasons that could be in the control
    for c in range(len(clubseason)):
        #go over each game of the season
        for u in range(len(clubseason[c])):
            #calculate the point average of the last 1,3,5, and 10 games if possible
            if len(clubseason[c][u-10:u]) == 10:
                onec = sum(clubseason[c][u-1:u])
                threec = sum(clubseason[c][u-3:u])/3
                fivec = sum(clubseason[c][u-5:u])/5
                tenc = sum(clubseason[c][u-10:u])/10
                
                #caclualte the new error by adding the difference between team and control for each game span
                errn = 0
                if one != -1:
                    errn += abs(one - onec)
                if three != -1:
                    errn += abs(three - threec)
                if five != -1:
                    errn += abs(five - fivec)
                if ten != -1:
                    errn +=  abs(ten - tenc)
                    
                       
                #if enough games are left in the season to complete the comparison and if the new error is smaller 
                #then the current error update the error adn store the clubseason and game in the storage 
                if errn < err and u < 29:
                       err = errn
                       store = [[c, u]]
                #add to the storage if the new error is equal to the curren error 
                elif errn == err:
                       store.append([c,u])
                       
    #add the error and store for each sample to the control list
    control.append([err, store])
    
                       
#print contols                      
for i in control:
    print(i)
    print(i)

[0.2, [[0, 24]]]
[0.2, [[0, 24]]]
[0.39999999999999997, [[0, 24]]]
[0.39999999999999997, [[0, 24]]]
[0.0, [[0, 26], [0, 27]]]
[0.0, [[0, 26], [0, 27]]]
[0.09999999999999987, [[3, 13]]]
[0.09999999999999987, [[3, 13]]]
[0.09999999999999998, [[4, 28]]]
[0.09999999999999998, [[4, 28]]]
[0.6, [[1, 19]]]
[0.6, [[1, 19]]]
[0.3999999999999999, [[0, 28]]]
[0.3999999999999999, [[0, 28]]]
[0.19999999999999996, [[8, 15], [9, 28]]]
[0.19999999999999996, [[8, 15], [9, 28]]]
[0.0, [[0, 14], [5, 18]]]
[0.0, [[0, 14], [5, 18]]]


In [12]:
#this list stores the average points of the controls for differn numner of games
averagePointsControl = []

#go over all controls
for u in control:
    #append a list for each control
    averagePointsControl.append([])
    
    #go over all contros (needed if one sample has saveral controls)
    for i in u[1:][0]:
        #append the average points 1,3,5, and 10 games before the hypothetical intervention
        averagePointsControl[-1].append(sum(clubseason[i[0]][i[1]-1:i[1]]))
        averagePointsControl[-1].append( sum(clubseason[i[0]][i[1]-3:i[1]])/3)
        averagePointsControl[-1].append(sum(clubseason[i[0]][i[1]-5:i[1]])/5)
        averagePointsControl[-1].append(sum(clubseason[i[0]][i[1]-10:i[1]])/10)
        
        #append the average points 1,3,5, and 10 games after the hypothetical intervention or -1 if this is not possible
        if len(clubseason[i[0]][i[1]:i[1]+1]) == 1:
            averagePointsControl[-1].append(sum(clubseason[i[0]][i[1]:i[1]+1]))
        else:
            averagePointsControl[-1].append(-1)
        if len(clubseason[i[0]][i[1]:i[1]+3]) == 3:
            averagePointsControl[-1].append(sum(clubseason[i[0]][i[1]:i[1]+3])/3)
        else:
            averagePointsControl[-1].append(-1)
        if len(clubseason[i[0]][i[1]:i[1]+5]) == 5:
            averagePointsControl[-1].append(sum(clubseason[i[0]][i[1]:i[1]+5])/5)
        else:
            averagePointsControl[-1].append(-1)
        if len(clubseason[i[0]][i[1]:i[1]+10]) == 10:
            averagePointsControl[-1].append(sum(clubseason[i[0]][i[1]:i[1]+10])/10)
        else:
            averagePointsControl[-1].append(-1)
        
        
#print the new data
for i in averagePointsControl:
    print(i)

[0, 0.0, 0.0, 0.7, 1, 0.3333333333333333, 1.4, 1.7]
[0, 0.0, 0.0, 0.7, 1, 0.3333333333333333, 1.4, 1.7]
[0, 0.3333333333333333, 0.2, 0.4, 0, 2.0, 1.8, 1.9, 0, 0.3333333333333333, 0.2, 0.4, 3, 2.0, 2.4, 2.2]
[3, 1.3333333333333333, 1.4, 1.2, 1, 1.3333333333333333, 1.4, 0.9]
[0, 0.0, 0.0, 0.9, 0, 1.3333333333333333, 1.4, 1.8]
[1, 0.6666666666666666, 0.6, 0.9, 1, 1.3333333333333333, 1.6, 1.1]
[3, 1.0, 0.8, 0.7, 3, 2.0, 2.4, 1.9]
[0, 1.0, 1.2, 1.7, 3, 3.0, 2.4, 2.5, 0, 1.0, 1.4, 1.5, 0, 0.3333333333333333, 0.8, 0.5]
[0, 1.3333333333333333, 0.8, 1.1, 1, 1.3333333333333333, 1.4, 0.7, 0, 1.3333333333333333, 0.8, 1.2, 3, 1.3333333333333333, 1.0, 1.3]


In [13]:
#print the average points of the sample before the intervention for differnt periods
print(sum([u[1] for u in averagePointsSample])/len([u[1] for u in averagePointsSample]))
print(sum([u[2] for u in averagePointsSample])/len([u[2] for u in averagePointsSample]))
print(sum([u[3] for u in averagePointsSample])/len([u[3] for u in averagePointsSample]))
print(sum([u[4] for u in averagePointsSample])/len([u[4] for u in averagePointsSample]))
print()

#print the average points of the sample after the intervention for differnt periods
print(sum([u[6] for u in averagePointsSample])/len([u[6] for u in averagePointsSample]))
print(sum([u[7] for u in averagePointsSample])/len([u[7] for u in averagePointsSample]))
print(sum([u[8] for u in averagePointsSample])/len([u[8] for u in averagePointsSample]))
print(sum([u[9] for u in averagePointsSample])/len([u[9] for u in averagePointsSample]))
print()

#print the average points of the control before the intervention for differnt periods
print(sum([u[0] for u in averagePointsControl])/len([u[0] for u in averagePointsControl]))
print(sum([u[1] for u in averagePointsControl])/len([u[1] for u in averagePointsControl]))
print(sum([u[2] for u in averagePointsControl])/len([u[2] for u in averagePointsControl]))
print(sum([u[3] for u in averagePointsControl])/len([u[3] for u in averagePointsControl]))
print()

#print the average points of the control after the intervention for differnt periods
print(sum([u[4] for u in averagePointsControl])/len([u[4] for u in averagePointsControl]))
print(sum([u[5] for u in averagePointsControl])/len([u[5] for u in averagePointsControl]))
print(sum([u[6] for u in averagePointsControl])/len([u[6] for u in averagePointsControl]))
print(sum([u[7] for u in averagePointsControl])/len([u[7] for u in averagePointsControl]))
print()

#print the average points of the whole season of the sample before and after coach changes
print(sum([u[0] for u in averagePointsSample])/len([u[0] for u in averagePointsSample]))
print(sum([u[5] for u in averagePointsSample])/len([u[5] for u in averagePointsSample]))

0.7777777777777778
0.6296296296296295
0.6
0.5777777777777778

1.3333333333333333
1.3703703703703705
1.3333333333333333
1.2

0.7777777777777778
0.6296296296296295
0.5555555555555556
0.9222222222222223

1.2222222222222223
1.4444444444444444
1.688888888888889
1.5777777777777777

0.928088578088578
1.1638291517323776


In [14]:
#print the p-values comparing the samples to the controls after the coach change 
print(stats.ttest_ind([u[6] for u in averagePointsSample],[u[4] for u in averagePointsControl]))
print(stats.ttest_ind([u[7] for u in averagePointsSample],[u[5] for u in averagePointsControl]))
print(stats.ttest_ind([u[8] for u in averagePointsSample],[u[6] for u in averagePointsControl]))
print(stats.ttest_ind([u[9] for u in averagePointsSample],[u[7] for u in averagePointsControl]))
print()

#print the p-values comparing the samples after the coach change to the average points before the coach change
print(stats.ttest_ind([u[0] for u in averagePointsSample],[u[5] for u in averagePointsSample]))
print(stats.ttest_ind([u[0] for u in averagePointsSample],[u[6] for u in averagePointsSample]))
print(stats.ttest_ind([u[0] for u in averagePointsSample],[u[7] for u in averagePointsSample]))
print(stats.ttest_ind([u[0] for u in averagePointsSample],[u[8] for u in averagePointsSample]))
print(stats.ttest_ind([u[0] for u in averagePointsSample],[u[9] for u in averagePointsSample]))
print()

#print the p-values comparing the samples before and after the coach change 
print(stats.ttest_ind([u[1] for u in averagePointsSample],[u[6] for u in averagePointsSample]))
print(stats.ttest_ind([u[2] for u in averagePointsSample],[u[7] for u in averagePointsSample]))
print(stats.ttest_ind([u[3] for u in averagePointsSample],[u[8] for u in averagePointsSample]))
print(stats.ttest_ind([u[4] for u in averagePointsSample],[u[9] for u in averagePointsSample]))

Ttest_indResult(statistic=0.19425717247145252, pvalue=0.8484195024033094)
Ttest_indResult(statistic=-0.2000000000000003, pvalue=0.84400045171129)
Ttest_indResult(statistic=-1.5220847812943823, pvalue=0.14750514669360323)
Ttest_indResult(statistic=-1.4517458762932818, pvalue=0.16589857843075567)

Ttest_indResult(statistic=-1.08565961394892, pvalue=0.29371667567815746)
Ttest_indResult(statistic=-0.8830740551517956, pvalue=0.39027187841303757)
Ttest_indResult(statistic=-1.6026318905225143, pvalue=0.1285716155962935)
Ttest_indResult(statistic=-1.801686455590233, pvalue=0.09046728632668163)
Ttest_indResult(statistic=-1.2435172008675182, pvalue=0.23158990193870294)

Ttest_indResult(statistic=-0.8980265101338744, pvalue=0.3824864493488148)
Ttest_indResult(statistic=-2.3990405756162683, pvalue=0.02897455800029147)
Ttest_indResult(statistic=-2.913971185543096, pvalue=0.010142486960485534)
Ttest_indResult(statistic=-2.131497459481348, pvalue=0.048900760200165164)
