In [1]:
import pandas as pd

In [20]:
# Load the datasets

teams = pd.read_csv('data/teams.csv').convert_dtypes()
results = pd.read_csv('data/results.csv').convert_dtypes()
fixtures = pd.read_csv('data/fixtures.csv').convert_dtypes()
players = pd.read_csv('data/players.csv').convert_dtypes()
startingXI = pd.read_csv('data/startingXI.csv').convert_dtypes()
odds = pd.read_csv('data/odds.csv').convert_dtypes()

datasets = [teams, results, fixtures, players, startingXI, odds]

In [22]:
# inspect values and if any values are missing 
for dataset in datasets:
  print(dataset.head())
  print(dataset.info())
  print('-------------')

    TeamName  TeamID
0  Arlington       1
1    Anaheim       2
2    Atlanta       3
3  Baltimore       4
4     Boston       5
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   TeamName  28 non-null     string
 1   TeamID    28 non-null     Int64 
dtypes: Int64(1), string(1)
memory usage: 608.0 bytes
None
-------------
   SeasonID  Gameweek  MatchID  HomeTeamID  HomeScore  HomeShots  AwayTeamID  \
0         1         1        1           7          1         17           1   
1         1         1        2           6          1          8           8   
2         1         1        3           5          5         21           9   
3         1         1        4           4          2         25          10   
4         1         1        5           3          3         13          11   

   AwayScore  AwayShots  
0          1         12  
1          3  

In [23]:
results['SeasonID'].value_counts()

SeasonID
1    756
2    756
Name: count, dtype: Int64

Input data summary:
* No null or missing values, correct data types 
* Each table and column names are self explanatory
* Dataset provided for season 1 & season 2 

# Exploring the First Season

This league uses the same rules for determining the order of teams as the English Premier League.


### 1. Which team won the league in the first season?


In [42]:
# Merge results with teams 
results_with_teams = results.merge(teams, left_on='HomeTeamID', right_on='TeamID')

# Calculate match points: win 3 , draw 1 , loss 0
results_with_teams['HomePoints'] = results_with_teams.apply(lambda x: 3 if x['HomeScore'] > x['AwayScore'] else (1 if x['HomeScore'] == x['AwayScore'] else 0), axis=1)
results_with_teams['AwayPoints'] = results_with_teams.apply(lambda x: 3 if x['HomeScore'] < x['AwayScore'] else (1 if x['HomeScore'] == x['AwayScore'] else 0), axis=1)

# pick only season 1 
results_with_teams_s1 = results_with_teams[results_with_teams['SeasonID'] == 1]

# add up both home and away points, get the team name as well  
home_points = results_with_teams_s1.groupby('HomeTeamID').agg(HomePoints = pd.NamedAgg(column='HomePoints', aggfunc='sum'),
                                                              TeamName = pd.NamedAgg(column='TeamName', aggfunc='first')).reset_index()
away_points = results_with_teams_s1.groupby('AwayTeamID')['AwayPoints'].sum().reset_index()

# add up  the home and away points 
total_points = home_points.merge(away_points, left_on='HomeTeamID', right_on='AwayTeamID')
total_points['TotalPoints'] = total_points['HomePoints'] + total_points['AwayPoints']

In [48]:
ranking_s1 = total_points.sort_values(by='TotalPoints', ascending=False)[['HomeTeamID', 'TeamName', 'TotalPoints']].reset_index(drop=True)

In [49]:
# Season 1 teams ranked by points
ranking_s1

Unnamed: 0,HomeTeamID,TeamName,TotalPoints
0,15,Miami,138
1,8,Cincinnati,125
2,4,Baltimore,117
3,19,New York S,113
4,5,Boston,106
5,27,Seattle,105
6,6,Chicago B,105
7,21,Oakland,96
8,7,Chicago H,95
9,24,St. Louis,94


Results: The winning team is Miami with 138 points in the first season. With Cinccinnati and Baltimore securing the 2nd and 3rd position consecutively. 

### 2. At what point in the season did that team secure their league title?
3. What result was the biggest upset?