In [1]:
# Imports
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import math

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

from datetime import datetime, timedelta

# import sqlite3
# from airflow import DAG
# from airflow.utils.dates import days_ago
# from airflow.operators.python_operator import PythonOperator

import nbformat

import warnings
from typing import Optional, Tuple, Callable, Dict, Any, List, Union
import fire

warnings.filterwarnings('ignore')


from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ElasticNet, Lasso, Ridge, SGDRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

from sklearn.discriminant_analysis import StandardScaler

from fastapi import FastAPI

import pickle
import os

import streamlit as st
# Create a requirements.txt file with the necessary packages
# !pip freeze > airflow/dags/requirements.txt


In [2]:
from app.load_data import load_data_from_source
from app.transform_data import transform_data_into_features_and_targets
from app.baseline_model import train_baseline

from app.train_models import train_model, save_best_model, training_process

from app.model_prediction import predict_res

In [3]:
# ---- VARIABLES ----
explanable_cols = ['Player','Nation','Pos','Squad','Age','Born','Starts','Min','Gls','Total_Att','Blocks_Blocks','Blocks_Sh','Blocks_Pass','Clr','Err','Touches_Touches','Touches_DefPen','Dribbles_Succ','Dribbles_Att','Dribbles_Mis','AerialDuels_Won','AerialDuels_Lost']
file_loc       = 'airflow/dags/assets/matches-checkpoint.csv'
spanish_squads = ['Sevilla', 'Sporting Huelva', 'Athletic Club', 'Levante Planas',
                  'UDG Tenerife', 'Villarreal', 'Madrid CFF', 'Barcelona',
                  'Atlético Madrid', 'Real Madrid', 'Alhama', 'Alavés',
                  'Real Sociedad', 'Levante', 'Real Betis', 'Valencia']

## Load Data

In [4]:
data = load_data_from_source()
display(data.head())

<class 'pandas.core.frame.DataFrame'>
Index: 87 entries, 0 to 86
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Wk         87 non-null     int64  
 1   Day        87 non-null     object 
 2   Date       87 non-null     object 
 3   Time       87 non-null     object 
 4   Home       87 non-null     object 
 5   xGHome     87 non-null     float64
 6   Score      87 non-null     object 
 7   xGAway     87 non-null     float64
 8   Away       87 non-null     object 
 9   Home_id    87 non-null     object 
 10  Away_id    87 non-null     object 
 11  Match_id   87 non-null     object 
 12  League_id  87 non-null     int64  
 13  xPHome     87 non-null     float64
 14  xPAway     87 non-null     float64
 15  ScoreHome  87 non-null     int64  
 16  ScoreAway  87 non-null     int64  
dtypes: float64(4), int64(4), object(9)
memory usage: 12.2+ KB
(87, 17) None


Unnamed: 0,Wk,Day,Date,Time,Home,xGHome,Score,xGAway,Away,Home_id,Away_id,Match_id,League_id,xPHome,xPAway,ScoreHome,ScoreAway
0,2,Sat,2022-09-17,12:00,Barcelona,1.9,2–0,0.4,UDG Tenerife,15f49df1,4c088abe,4df3a732,230,2.4,0.4,2,0
1,2,Sat,2022-09-17,12:00,Alavés,1.1,1–2,1.2,Madrid CFF,aa11fb42,89818574,87c755cd,230,1.29,1.43,1,2
2,2,Sat,2022-09-17,16:00,Real Sociedad,0.7,2–0,0.3,Villarreal,c21f25d3,7a7bef84,abfde9d9,230,1.68,0.87,2,0
3,2,Sat,2022-09-17,16:00,Real Madrid,1.6,2–0,0.8,Valencia,54582b93,f96ff499,d0329f46,230,1.94,0.8,2,0
4,2,Sat,2022-09-17,18:00,Sevilla,1.1,1–3,1.4,Atlético Madrid,215d9026,b56c2667,f4452586,230,1.15,1.58,1,3


## Transform Data

Transform the loaded data into _features_ and _target_


In [5]:
features_home, target_home = transform_data_into_features_and_targets(df=data, score='Home')
features_away, target_away = transform_data_into_features_and_targets(df=data, score='Away')

# Remember that features_home and features_away are the same, so we can use either one
# However, target_home and target_away are different, so we need to concatenate them in order to have the full target

np.unique([data.Home.unique(), data.Away.unique()])

array(['Alavés', 'Alhama', 'Athletic Club', 'Atlético Madrid',
       'Barcelona', 'Levante', 'Levante Planas', 'Madrid CFF',
       'Real Betis', 'Real Madrid', 'Real Sociedad', 'Sevilla',
       'Sporting Huelva', 'UDG Tenerife', 'Valencia', 'Villarreal'],
      dtype=object)

# Modelling

## Baseline Model
Establish a baseline performance against which to compare future better models

In [6]:
train_baseline(features_home, target_home)
train_baseline(features_away, target_away)

Baseline error: 1.1481481481481484
Baseline error: 1.5555555555555556


## Other Models

In [7]:

rankings_home = {}
training_process(rankings_home, features_home, target_home)
# train_model(rankings_home,'LinearRegression', features_home, target_home)
# train_model(rankings_home,'AdaBoostRegressor', features_home, target_home)
# train_model(rankings_home,'RandomForestRegressor', features_home, target_home)
# train_model(rankings_home,'XGBRegressor', features_home, target_home)
# # train_model(rankings_home,'DecisionTreeRegressor', features_home, target_home)
# # train_model(rankings_home,'KNeighborsRegressor', features_home, target_home)
# # train_model(rankings_home,'GradientBoostingRegressor', features_home, target_home)
# # train_model(rankings_home,'Ridge', features_home, target_home)
# # train_model(rankings_home,'Lasso', features_home, target_home)
# # train_model(rankings_home,'ElasticNet', features_home, target_home)
# # train_model(rankings_home,'SGDRegressor', features_home, target_home)
# # train_model(rankings_home,'KernelRidge', features_home, target_home)

# print(sorted(rankings_home.items(), key=lambda x: x[1]['mae']))

# Save models and their info
save_best_model('home',rankings_home)

Unnamed: 0,model,params,mae,time
0,LinearRegression(n_jobs=1),"{'positive': False, 'n_jobs': 1, 'fit_intercep...",1.088379,0 days 00:00:05.154085
1,"KernelRidge(degree=1, gamma=0.1, kernel='poly')","{'kernel': 'poly', 'gamma': 0.1, 'degree': 1, ...",1.119095,0 days 00:00:00.266998
2,"Ridge(alpha=10, solver='cholesky')","{'solver': 'cholesky', 'fit_intercept': True, ...",1.127626,0 days 00:00:00.261246
3,"SGDRegressor(alpha=0.1, learning_rate='adaptiv...","{'shuffle': True, 'penalty': 'l2', 'loss': 'ep...",1.20564,0 days 00:00:00.256997
4,"XGBRegressor(base_score=None, booster=None, ca...","{'n_estimators': 200, 'max_depth': 4}",1.262268,0 days 00:00:02.541548
5,([DecisionTreeRegressor(criterion='friedman_ms...,"{'n_estimators': 200, 'max_depth': 2}",1.262769,0 days 00:00:04.589863
6,"(DecisionTreeRegressor(max_depth=3, random_sta...","{'n_estimators': 100, 'learning_rate': 0.01}",1.303815,0 days 00:00:10.596427
7,"(DecisionTreeRegressor(max_depth=2, max_featur...","{'n_estimators': 150, 'max_depth': 2}",1.35688,0 days 00:00:08.584825
8,ElasticNet(alpha=1),"{'l1_ratio': 0.5, 'fit_intercept': True, 'alph...",1.358025,0 days 00:00:00.244288
9,DecisionTreeRegressor(max_depth=2),{'max_depth': 2},1.359954,0 days 00:00:00.259384


In [8]:
rankings_away = {}
training_process(rankings_away, features_away, target_away)
# train_model(rankings_away, 'LinearRegression', features_away, target_away)
# train_model(rankings_away, 'AdaBoostRegressor', features_away, target_away)
# train_model(rankings_away, 'RandomForestRegressor', features_away, target_away)
# train_model(rankings_away, 'XGBRegressor', features_away, target_away)
# train_model(rankings_away, 'DecisionTreeRegressor', features_away, target_away)
# train_model(rankings_away, 'KNeighborsRegressor', features_away, target_away)
# train_model(rankings_away, 'GradientBoostingRegressor', features_away, target_away)
# train_model(rankings_away, 'Ridge', features_away, target_away)
# train_model(rankings_away, 'Lasso', features_away, target_away)
# train_model(rankings_away, 'ElasticNet', features_away, target_away)
# train_model(rankings_away, 'SGDRegressor', features_away, target_away)
# train_model(rankings_away, 'KernelRidge', features_away, target_away)

print(sorted(rankings_away.items(), key=lambda x: x[1]['mae']))

# Save models and their info
save_best_model('away',rankings_away)



[('KNeighborsRegressor', {'model': KNeighborsRegressor(n_neighbors=4), 'params': {'n_neighbors': 4}, 'mae': 0.75, 'time': datetime.timedelta(microseconds=213178)}), ('KernelRidge', {'model': KernelRidge(gamma=0.1, kernel='rbf'), 'params': {'kernel': 'rbf', 'gamma': 0.1, 'degree': 3, 'alpha': 1}, 'mae': 0.8088054038390681, 'time': datetime.timedelta(microseconds=292240)}), ('AdaBoostRegressor', {'model': AdaBoostRegressor(learning_rate=0.01), 'params': {'n_estimators': 50, 'learning_rate': 0.01}, 'mae': 0.9097736385562363, 'time': datetime.timedelta(seconds=8, microseconds=658844)}), ('ElasticNet', {'model': ElasticNet(alpha=1, l1_ratio=0.9), 'params': {'l1_ratio': 0.9, 'fit_intercept': True, 'alpha': 1}, 'mae': 0.9219185482824981, 'time': datetime.timedelta(microseconds=218785)}), ('Lasso', {'model': Lasso(alpha=1), 'params': {'fit_intercept': True, 'alpha': 1}, 'mae': 0.9353056410838394, 'time': datetime.timedelta(microseconds=185569)}), ('Ridge', {'model': Ridge(alpha=1, solver='sag'

Unnamed: 0,model,params,mae,time
0,KNeighborsRegressor(n_neighbors=4),{'n_neighbors': 4},0.75,0 days 00:00:00.213178
1,"KernelRidge(gamma=0.1, kernel='rbf')","{'kernel': 'rbf', 'gamma': 0.1, 'degree': 3, '...",0.808805,0 days 00:00:00.292240
2,"(DecisionTreeRegressor(max_depth=3, random_sta...","{'n_estimators': 50, 'learning_rate': 0.01}",0.909774,0 days 00:00:08.658844
3,"ElasticNet(alpha=1, l1_ratio=0.9)","{'l1_ratio': 0.9, 'fit_intercept': True, 'alph...",0.921919,0 days 00:00:00.218785
4,Lasso(alpha=1),"{'fit_intercept': True, 'alpha': 1}",0.935306,0 days 00:00:00.185569
5,"Ridge(alpha=1, solver='sag')","{'solver': 'sag', 'fit_intercept': True, 'alph...",0.937556,0 days 00:00:00.224072
6,"LinearRegression(n_jobs=1, positive=True)","{'positive': True, 'n_jobs': 1, 'fit_intercept...",0.939705,0 days 00:00:00.226967
7,"(DecisionTreeRegressor(max_depth=2, max_featur...","{'n_estimators': 100, 'max_depth': 2}",0.956498,0 days 00:00:10.234840
8,"SGDRegressor(alpha=0.01, learning_rate='adapti...","{'shuffle': False, 'penalty': 'l1', 'loss': 'e...",0.976012,0 days 00:00:00.312049
9,DecisionTreeRegressor(max_depth=3),{'max_depth': 3},0.994152,0 days 00:00:00.200815


# Load Models from Pickle file

Load the models from the pickle to a REST API. I'm going to use _fastapi_

## Predictions

In [9]:
# ---- PREDICTIONS ----

cols: list[str] = ['Wk', 'Numeric_Day', 'Numeric_Home', 'Numeric_Away', 'Numeric_Time',
       'xGHome_xGAway_1', 'xGHome_xGAway_2', 'xGHome_xGAway_3']
# display(features_away[cols].sample(1))

# Pick random number between 0 and len(data)
random_index = np.random.randint(0, len(data))

In [10]:
# display(features_away[cols][random_index:random_index+1], features_home[cols][random_index:random_index+1])
# Predict home score for match at random index
predict_res(features_home[cols][random_index:random_index+1])

'predicting ... 8 : 1'

In [11]:
predict_res(features_away[cols][random_index:random_index+1])

# The prediction is the same for both because the features are the same.

'predicting ... 8 : 1'

## Use Streamlit to transform the data, load the model and do a prediction

In [12]:
print(..)

SyntaxError: invalid syntax (2448822055.py, line 1)

### ETL Process and Data Integration

__Apache Airflow__ supports a few databases: 
- SQLite _Lightweight filebased database suitable for small-scale deployments and testing_
- PostgreSQL _Relational database widely used in production environments_
- MySQL _Popular relational database widely used_
- Microsoft SQL Server _Commercial relational database widely used in enterprises_
- Oracle _Commercial relational database widely used in enterprises_
- Amazon RedShift _Cloud-based data warehouse optimized for analytics workloads_
- Google BigQuery _Cloud-based data warehouse optimized for analytics workloads_
- Apache Casssandra _Distributed No-SQL database optimized for high scalability and availability_
- Apache Hive _Data warehouse infrastructure for data summarization, querying and analytics_

I'm using SQLite because it's a small scale dataset

In [None]:
# Load task
def load_data(matches):
    # Connect to database
    conn = sqlite3.connect('assets/spanish_matches.db')

    # Create cursor
    c = conn.cursor()

    # Create table
    c.execute("""CREATE TABLE IF NOT EXISTS matches (
        Wk INTERGER,
        Day TEXT,
        Date DATE,
        Time TIME,
        Home TEXT,
        xGHome FLOAT,
        Score TEXT,
        xGAway FLOAT,
        Away TEXT,
        xPHome FLOAT,
        xPAway FLOAT,
        ScoreHome INTERGER,
        ScoreAway INTERGER,
        GoalDifference INTERGER,
        Result TEXT,
        ExpectedGoalDifference FLOAT,
        Points INTERGER,
        ExpectedPoints INTERGER,
        WinPercentage FLOAT,
        TotalGoals INTERGER,
        xGRatio FLOAT
    )""")

    # Insert DataFrame records one by one.
    for i, row in matches.iterrows():
        c.execute("""INSERT INTO matches VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""", (
            row['Wk'],
            row['Day'],
            row['Date'],
            row['Time'],
            row['Home'],
            row['xGHome'],
            row['Score'],
            row['xGAway'],
            row['Away'],
            row['xPHome'],
            row['xPAway'],
            row['ScoreHome'],
            row['ScoreAway'],
            row['GoalDifference'],
            row['Result'],
            row['ExpectedGoalDifference'],
            row['Points'],
            row['ExpectedPoints'],
            row['WinPercentage'],
            row['TotalGoals'],
            row['xGRatio']
        ))

    # Commit changes
    conn.commit()

    # Close cursor and connection
    c.close()
    conn.close()


In [None]:
# Assign tasks
extract_task = PythonOperator(
    task_id='extract_data',
    python_callable=extract_data,
    dag=dag
)

transform_task = PythonOperator(
    task_id='transform_data',
    python_callable=transform_data,
    op_kwargs={'matches': '{{ ti.xcom_pull(task_ids="extract_data") }}'},
    dag=dag
)

load_task = PythonOperator(
    task_id='load_data',
    python_callable=load_data,
    op_kwargs={'matches': '{{ ti.xcom_pull(task_ids="transform_data") }}'},
    dag=dag
)

In [None]:
!~/airflow_env/bin/airflow scheduler -D
!~/airflow_env/bin/airflow webserver -D

!~/airflow_env/bin/airflow dags list

!~/airflow_env/bin/airflow cheat-sheet

## Validation and Quality Assurance

To validate the quality of the data, I'm connecting to the database to check for null values in each column of the matches table.  
Specifically, I'll check the data type, the range anf completeness of the data

In [None]:
# Validate data in the database and ensure the proper quality
def validate_data():
    # Connect to database
    conn = sqlite3.connect('assets/spanish_matches.db')

    # Create cursor
    c = conn.cursor()

    # Data type validation
    c.execute("""SELECT COUNT(*) FROM matches where CAST(Wk AS INTEGER) IS NULL""")
    null_count = c.fetchone()[0]
    if null_count == 0:
        print('Data type validation passed.')
    else:
        print(f'Data type validation failed with {null_count} null values.')

    # Data range validation
    c.execute("""SELECT COUNT(*) FROM matches where Wk < 1 OR Wk > 10""")
    range_count = c.fetchone()[0]
    if range_count == 0:
        print('Data range validation passed.')
    else:
        print(f'Data range validation failed with {range_count} values out of range.')

    # Data completeness validation
    c.execute("""SELECT COUNT(*) FROM matches where Wk IS NULL""")
    completeness_count = c.fetchone()[0]
    if completeness_count == 0:
        print('Data completeness validation passed.')
    else:
        print(f'Data completeness validation failed with {completeness_count} null values.')

    c.close()
    conn.close()

validate_task = PythonOperator(
    task_id='validate_data',
    python_callable=validate_data,
    op_kwargs={'matches': '{{ ti.xcom_pull(task_ids="load_data") }}'},
    dag=dag
)

# Define task dependencies
extract_task >> transform_task >> load_task >> validate_task

## Reporting and Analysis

Generate meaningful insights and reports.
- Trend analysis
- Team Performance analysis
- Team comparisons

In [None]:
# Trend analysis
def trend_analysis():
    # Connect to database
    conn = sqlite3.connect('assets/spanish_matches.db')

    # Create cursor
    c = conn.cursor()

    # Data type validation
    c.execute("""SELECT * FROM matches""")
    matches = pd.DataFrame(c.fetchall())
    
    # Define plot function
    def plot_data():
        sns.lineplot(x='Date', y='TotlaGoals', data=matches)
        plt.title('Total Goals Scored')
        plt.xlabel('Date')
        plt.ylabel('Total Goals')
        plt.show()
    # Look at the correlation between the expected goals and the actual goals
    def calculate_correlation():
        corr_home = matches['xGHome'].corr(matches['ScoreHome'])
        print(f'Correlation between expected Goals for the Home and actual goals Home: {corr_home}')
        corr_away = matches['xGAway'].corr(matches['ScoreAway'])
        print(f'Correlation between expected Goals for the Away and actual goals Away: {corr_away}')
    
    plot_data()
    calculate_correlation()

trend_analysis_task = PythonOperator(
    task_id='trend_analysis',
    python_callable=trend_analysis,
    op_kwargs={'matches': '{{ ti.xcom_pull(task_ids="validate_data") }}'},
    dag=dag
)   

# Define task dependencies
extract_task >> transform_task >> load_task >> validate_task >> trend_analysis_task

In [None]:
# Exectute the DAG workflow and view the results in the Airflow UI from scripts/DataPipelining.py
!~/airflow_env/bin/airflow trigger_dag 

In [None]:
# Convert notebook to python script
!jupyter nbconvert --to script DataPipelining.ipynb --output-dir='airflow/dags/'