In [1]:
import os

## DOWNLOAD DATA

In [2]:
data_ingestion_content = """
import pandas as pd
import requests

def data_ingestion():
    ''''
    Component to download data from a github repo.
    '''
    df_allseasons = pd.read_csv('https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/cleaned_merged_seasons.csv', index_col = 'Unnamed: 0')

    # Get yearly historic data from endpoint for available seasons and identify the keys in each disctionary using 2016 as an example.

    Y2016= requests.get('https://www.fantasynutmeg.com/api/history/season/2016-17').json()
    Y2017= requests.get('https://www.fantasynutmeg.com/api/history/season/2017-18').json()
    Y2018= requests.get('https://www.fantasynutmeg.com/api/history/season/2018-19').json()
    Y2019= requests.get('https://www.fantasynutmeg.com/api/history/season/2019-20').json()  
    Y2020= requests.get('https://www.fantasynutmeg.com/api/history/season/2020-21').json()
    Y2021= requests.get('https://www.fantasynutmeg.com/api/history/season/2021-22').json()
    Y2022= requests.get('https://www.fantasynutmeg.com/api/history/season/2022-23').json()

    # Convert history data dictionary to a pandas dataframe.

    hist16_df = pd.DataFrame(Y2016['history'])
    hist17_df = pd.DataFrame(Y2017['history'])
    hist18_df = pd.DataFrame(Y2018['history'])
    hist19_df = pd.DataFrame(Y2019['history'])
    hist20_df = pd.DataFrame(Y2020['history'])
    hist21_df = pd.DataFrame(Y2021['history'])

    # Engineer feature to highlight each season year.

    hist16_df['year'] = hist16_df.apply(lambda x: "2016-17", axis=1)
    hist17_df['year'] = hist17_df.apply(lambda x: "2017-18", axis=1)
    hist18_df['year'] = hist18_df.apply(lambda x: "2018-19", axis=1)
    hist19_df['year'] = hist19_df.apply(lambda x: "2019-20", axis=1)
    hist20_df['year'] = hist20_df.apply(lambda x: "2020-21", axis=1)
    hist21_df['year'] = hist21_df.apply(lambda x: "2021-22", axis=1)

    # Concatenate all history data across years.

    hist_df = [hist16_df, hist17_df, hist18_df, hist19_df, hist20_df, hist21_df]
    hist = pd.concat(hist_df, axis = 0, ignore_index=True)


    #get current season data from FPL API endpoints and identify the keys
    fpl_base_url = 'https://fantasy.premierleague.com/api/'
    current_season = requests.get(fpl_base_url+'bootstrap-static/').json()

    #create dataframes for the current season dictionary keys for data exploration
    #- Contains summary of Gameweek data
    events_df = pd.DataFrame(current_season['events']) #
    phases_df = pd.DataFrame(current_season['phases']) #Shows calendar months for game weeks
    teams_df = pd.DataFrame(current_season['teams'])
    players_df = pd.DataFrame(current_season['elements'])
    element_stats_df = pd.DataFrame(current_season['element_stats'])
    element_types_df = pd.DataFrame(current_season['element_types'])

    for x in players_df.index :
        player_id = players_df.id[x]
        url = f'https://fantasy.premierleague.com/api/element-summary/{player_id}/'
        r = requests.get(url)
        json = r.json()
        json_history_df = pd.DataFrame(json['history'])

       
        if x == 0 :
            df_currentseason = json_history_df
        else : 
            df_currentseason = df_currentseason.append(json_history_df)


    #get current season fixtures from FPL API endpoint and create Dataframe
    current_season_fixtures = requests.get(fpl_base_url+'fixtures/').json()
    fixtures_df = pd.DataFrame(current_season_fixtures)

    #Map the team names and the player positions into the players_df_clean dataframe
    teams_now=dict(zip(teams_df.id, teams_df.short_name))
    positions=dict(zip(element_types_df.id, element_types_df.singular_name_short))
    players_df['club_name'] = players_df['team'].map(teams_now)
    players_df['position'] = players_df['element_type'].map(positions)



    
    df_allseasons.to_csv('all_seasons_hist.csv')
    hist.to_csv('hist_data.csv')
    players_df.to_csv('players_df.csv')
    df_currentseason.to_csv('currentseason.csv')
    fixtures_df.to_csv('fixtures.csv')

if __name__ == "__main__":
    data_ingestion()
    """

In [3]:
data_download_path = "./data_download"
if not os.path.exists(data_download_path):
    os.makedirs(data_download_path)

with open(f"{data_download_path}/data_download.py", 'w') as data_download_file:
    data_download_file.write(data_ingestion_content)

In [4]:
# Defining the content of the requirement.txt file
download_req_content = """
pandas
requests
"""

In [5]:
# Writing the content to the data preprocessing requirements.txt file

download_req_path = data_download_path

with open(f"{download_req_path}/requirements.txt", 'w') as download_req:
    download_req.write(download_req_content)

In [6]:
# Defining the content of the data preprocessing Dockerfile

data_dockerfile_content = f"""
FROM python:3.10.4-slim-buster

WORKDIR /app

COPY {data_download_path}/requirements.txt /app
RUN pip install -r requirements.txt

COPY {data_download_path}/data_download.py /app
"""

In [7]:
# Writing the contents to the data downloader Dockerfile

data_dockerfile_path = data_download_path

with open(f"{data_dockerfile_path}/Dockerfile", 'w') as data_dockerfile:
    data_dockerfile.write(data_dockerfile_content)

In [8]:
# A fuction to Build the docker image for a component

def docker_image_build(image_name, image_tag):
    # Build the docker image of the component
    os.system(f"docker build -t {image_name}:{image_tag} -f {dockerfile_dir}/Dockerfile .")
    # Tag the built docker image
    os.system(f"docker tag {image_name}:{image_tag} {username}/{image_name}:{image_tag}")
    # Push the image to the docker container registry
    os.system(f"docker push {username}/{image_name}:{image_tag}")
    docker_image = f"{username}/{image_name}:{image_tag}"
    return docker_image

In [9]:
# Defining the parameters for the docker image
dockerfile_dir = data_dockerfile_path
username = input('Enter your docker hub username: ')
password = input('Enter your docker hub account password: ')
image_name = 'data-download-fpl'
image_tag = 'latest'

# Logging into docker account
os.system(f"docker login -u {username} -p {password}")

# Building the data download image and pushing it to the container registry
data_download_image = docker_image_build(image_name, image_tag)

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



Login Succeeded
Sending build context to Docker daemon  44.83MB
Step 1/5 : FROM python:3.10.4-slim-buster
 ---> e00cda196d23
Step 2/5 : WORKDIR /app
 ---> Using cache
 ---> 875dc2c67660
Step 3/5 : COPY ./data_download/requirements.txt /app
 ---> Using cache
 ---> 2eaf33bec289
Step 4/5 : RUN pip install -r requirements.txt
 ---> Using cache
 ---> 342f2dccdbf0
Step 5/5 : COPY ./data_download/data_download.py /app
 ---> Using cache
 ---> 3ae22c1717a8
Successfully built 3ae22c1717a8
Successfully tagged data-download-fpl:latest
The push refers to repository [docker.io/pelvic/data-download-fpl]
85492addfff0: Preparing
ab5918050c4c: Preparing
bc3f4dc46e81: Preparing
e189e6d19dae: Preparing
c41bbba2c89c: Preparing
361093c2629f: Preparing
69b6043419ca: Preparing
4bdae028fbe3: Preparing
10e6bc6fdee2: Preparing
361093c2629f: Waiting
69b6043419ca: Waiting
4bdae028fbe3: Waiting
10e6bc6fdee2: Waiting
bc3f4dc46e81: Layer already exists
c41bbba2c89c: Layer already exists
e189e6d19dae: Layer already ex

## FEATURE ENGINEERING

In [10]:
# Creating the content of the feature engineering script

feature_eng_content = """
import pandas as pd
import numpy as np
import argparse

def feat_eng(args):
    df_allseasons = pd.read_csv(args.allseasons)
    hist = pd.read_csv(args.hist)
    players_df = pd.read_csv(args.players_df)
    df_currentseason = pd.read_csv(args.currentseason)
    fixtures_df = pd.read_csv(args.fixtures_df)

    # FOR THE TRAIN AND VALIDATION SET
    # Engineer feature to highlight the form of the players.
    hist['form'] = hist['total_points']/38 

    # Engineer feature to highlight the players name and the season they played in.
    hist['name_season'] = hist['first_name'] + ' ' + hist['second_name'] + '_' + hist['year']

    # Engineer feature to highlight the players name and the season they played in.
    df_allseasons['name_season'] = df_allseasons['name'] + '_' + df_allseasons['season_x']

    # Engineer a feature to highlight the club of the player.
    teams=dict(zip(hist.name_season, hist.team_name))

    df_allseasons['club_name'] = df_allseasons['name_season'].map(teams)

    # Engineer a feature to highlight the form of the player.
    teams=dict(zip(hist.name_season, hist.form))

    df_allseasons['form'] = df_allseasons['name_season'].map(teams)

    # Engineer feature to highlight the game dates from kickoff_time.
    df_allseasons['game_date'] = df_allseasons['kickoff_time'].str.replace('T', ' ')
    df_allseasons['game_date'] = df_allseasons['game_date'].str.replace(':00Z', '')

    # Convert game_date feature to appropriate dtype.
    df_allseasons['game_date'] = pd.to_datetime(df_allseasons['game_date'])

    # Engineer game season weather feature.
    seasons = [1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 1]
    month_to_season = dict(zip(range(1,13), seasons))
    df_allseasons['game_weather'] = df_allseasons.game_date.dt.month.map(month_to_season)

    # Engineer feature to highlights games that started before 13:00 (early starts) and those that started after 13:00 (late starts)
    df_allseasons['start_label'] = np.where((df_allseasons['game_date'].dt.hour) < 13, 0, 1)
    # Engineer feature to highlight the game year only.
    df_allseasons['year'] = df_allseasons.game_date.dt.year

    # Drop feature.
    df_allseasons.drop('team_x', axis = 1, inplace=True)
    # Drop all missing observations.
    df_allseasons.dropna(inplace=True)

    # Change dypes.
    df_allseasons['team_h_score'] = df_allseasons['team_h_score'].astype(int)
    df_allseasons['team_a_score'] = df_allseasons['team_a_score'].astype(int)

    # Drop features.
    df_allseasons.drop(['opponent_team', 'kickoff_time'], axis = 1, inplace=True)

    # Drop features.
    df_allseasons.drop(['season_x', 'name', 'name_season', 'fixture', 'game_date', 'round', 'element'], axis=1, inplace=True)
    # Drop all players with zero playtime.
    zero_minutes = df_allseasons[df_allseasons.minutes == 0].index
    df_allseasons.drop(zero_minutes, axis = 0, inplace=True)
    df_allseasons.set_index('year', inplace=True)

    # FEATURING ENGINEERING FOR THE TEST SET
    #create the player name feature
    players_df['name'] = players_df['first_name'] + ' ' + players_df['second_name']
    #Create season_x feature to align with the train data
    df_currentseason['season_x'] = df_currentseason.apply(lambda x: "2022-23", axis=1)
    #Map the team names, player names and form into the all current season data player dataframe
    teams_map=dict(zip(players_df.id, players_df.name))
    club_map=dict(zip(players_df.id, players_df.club_name))
    opp_teams_map=dict(zip(players_df.team, players_df.club_name))
    form_map=dict(zip(players_df.id, players_df.form))
    position_map=dict(zip(players_df.id, players_df.position))
    df_currentseason['name'] = df_currentseason['element'].map(teams_map)
    df_currentseason['club_name'] = df_currentseason['element'].map(club_map)
    df_currentseason['opp_team_name'] = df_currentseason['opponent_team'].map(opp_teams_map)
    df_currentseason['form'] = df_currentseason['element'].map(form_map)
    df_currentseason['position'] = df_currentseason['element'].map(form_map)

    df_currentseason.drop(['Unnamed: 0'], axis=1, inplace=True)
    play_zero_minutes = df_currentseason[df_currentseason.minutes == 0].index
    df_currentseason.drop(play_zero_minutes, axis = 0, inplace=True)
    df_currentseason.rename(columns= { 'round': 'GW' }, inplace=True)
    df_currentseason['game_date'] = df_currentseason['kickoff_time'].str.replace('T', ' ')
    df_currentseason['game_date'] = df_currentseason['game_date'].str.replace(':00Z', '')
    df_currentseason['game_date'] = pd.to_datetime(df_currentseason['game_date'])
    df_currentseason['game_weather'] = df_currentseason.game_date.dt.month.map(month_to_season) 
    df_currentseason['start_label'] = np.where((df_currentseason['game_date'].dt.hour) < 13, 0, 1)
    # Engineer feature tp highlight the game year only.
    df_currentseason['year'] = df_currentseason.game_date.dt.year
    df_currentseason.drop(['game_date', 'season_x'], axis=1, inplace=True)
    df_currentseason.drop(['opponent_team', 'fixture', 'kickoff_time'], axis=1, inplace=True)
    df_currentseason.form = df_currentseason.form.astype(float)
    df_currentseason.set_index('year', inplace=True)
    df_currentseason.drop(['element', 'name'], axis = 1, inplace=True)



    df_allseasons.to_csv('all_seasons_clean_hist.csv')
    df_currentseason.to_csv('df_test.csv')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--allseasons')
    parser.add_argument('--hist')
    parser.add_argument('--currentseason')
    parser.add_argument('--players_df')
    parser.add_argument('--fixtures_df')
    args = parser.parse_args()
    feat_eng(args)
"""


In [11]:
feature_eng_path = "./feature_engineering"
if not os.path.exists(feature_eng_path):
    os.makedirs(feature_eng_path)

with open(f"{feature_eng_path}/feat_eng.py", 'w') as feature_eng_file:
    feature_eng_file.write(feature_eng_content)

In [12]:
feat_req_content = """
pandas
numpy
"""

In [13]:
# Writing the content to the feataure engineering requirements.txt file

feature_req_path = feature_eng_path

with open(f"{feature_req_path}/requirements.txt", 'w') as feat_req:
    feat_req.write(feat_req_content)

In [14]:
# Defining the content of the data preprocessing Dockerfile

feat_dockerfile_content = f"""
FROM python:3.10.4-slim-buster

WORKDIR /app

COPY {feature_eng_path}/requirements.txt /app
RUN pip install -r requirements.txt

COPY {feature_eng_path}/feat_eng.py /app
"""

In [15]:
# Writing the contents to the feature engineering Dockerfile

feat_dockerfile_path = feature_eng_path

with open(f"{feat_dockerfile_path}/Dockerfile", 'w') as feat_dockerfile:
    feat_dockerfile.write(feat_dockerfile_content)

In [16]:
# Building the feature engineering image and pushing it to the container registry

dockerfile_dir = feat_dockerfile_path

image_name = 'feature_engineering_fpl'
image_tag = 'latest'

feature_eng_image = docker_image_build(image_name, image_tag)

Sending build context to Docker daemon  44.83MB
Step 1/5 : FROM python:3.10.4-slim-buster
 ---> e00cda196d23
Step 2/5 : WORKDIR /app
 ---> Using cache
 ---> 875dc2c67660
Step 3/5 : COPY ./feature_engineering/requirements.txt /app
 ---> Using cache
 ---> 888e1977dd2f
Step 4/5 : RUN pip install -r requirements.txt
 ---> Using cache
 ---> 96428eaf41ee
Step 5/5 : COPY ./feature_engineering/feat_eng.py /app
 ---> Using cache
 ---> 21726b347be5
Successfully built 21726b347be5
Successfully tagged feature_engineering_fpl:latest
The push refers to repository [docker.io/pelvic/feature_engineering_fpl]
98dccb570335: Preparing
81ca3c379615: Preparing
5f627872c29a: Preparing
e189e6d19dae: Preparing
c41bbba2c89c: Preparing
361093c2629f: Preparing
69b6043419ca: Preparing
4bdae028fbe3: Preparing
10e6bc6fdee2: Preparing
361093c2629f: Waiting
69b6043419ca: Waiting
4bdae028fbe3: Waiting
10e6bc6fdee2: Waiting
c41bbba2c89c: Layer already exists
e189e6d19dae: Layer already exists
98dccb570335: Layer already

## ENCODING CATEGORICAL VARIABLES

In [38]:

# Creating the content of the encoding script

encoding_content = """
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
import argparse
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import os
import pickle
import numpy as np

def data_encoding(args):
    df_allseasons_final = pd.read_csv(args.train)
    df_allseasons_final.set_index('year', inplace=True)
    # Sort index (just in case).
    df_allseasons_final.sort_index(inplace=True)
    # Assign features and target variable.
    features = df_allseasons_final.drop(['total_points'], axis = 1)
    # target = df_allseasons_final['total_points']

    # Convert dataframe to a dictionary.
    features_dict = features.to_dict(orient='records')

    dv_final = DictVectorizer(sparse=False) 

    # sparse = False makes the output is not a sparse matrix.

    features_encoded = dv_final.fit_transform(features_dict)
    vocab_final = dv_final.vocabulary_
    features_transformed = pd.DataFrame(features_encoded, columns=dv_final.feature_names_)
    # Normalizing the train data.
    min_max_scaler_final = MinMaxScaler()

    # Fit scalar and transform train data.
    features_norm = min_max_scaler_final.fit_transform(features_transformed)

    data_path = './data_encoding/preprocessed_data'
    
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    np.save(f'{data_path}/features.npy', features_norm)

    if not os.path.exists('./model'):
        os.makedirs('./model')

    with open('./model/dv', 'wb') as f_out2:
        pickle.dump(dv_final, f_out2)

    with open('./model/min_max_scaler', 'wb') as f_out3:
        pickle.dump(min_max_scaler_final, f_out3)

    df_allseasons_final.to_csv('target.csv')

        

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--train')
    args = parser.parse_args()
    data_encoding(args) 
"""

In [39]:
# Writing the contents to the data_encoding.py

data_encoding_path = "./data_encoding"
if not os.path.exists(data_encoding_path):
    os.makedirs(data_encoding_path)

with open(f"{data_encoding_path}/data_encoding.py", 'w') as data_encode_file:
    data_encode_file.write(encoding_content)

In [40]:
# Defining the content of the requirement.txt file

encoding_req_content = """
sklearn
pandas
"""

In [41]:
# Writing the content to the data encoding requirements.txt file

encoding_req_path = data_encoding_path

with open(f"{encoding_req_path}/requirements.txt", 'w') as encoding_req:
    encoding_req.write(encoding_req_content)

In [42]:
# Defining the content of the data encoding Dockerfile

encoding_dockerfile_content = f"""
FROM python:3.10.4-slim-buster

WORKDIR /app

COPY {data_encoding_path}/requirements.txt /app
RUN pip install -r requirements.txt

COPY {data_encoding_path}/data_encoding.py /app
"""

In [43]:
# Writing the contents to the model training Dockerfile

encoding_dockerfile_path = data_encoding_path

with open(f"{encoding_dockerfile_path}/Dockerfile", 'w') as encoding_dockerfile:
    encoding_dockerfile.write(encoding_dockerfile_content)

In [44]:
# Building the model training image and pushing it to the container registry

dockerfile_dir = encoding_dockerfile_path

image_name = 'data-encoding-fpl'
image_tag = 'latest'

data_encoding_image = docker_image_build(image_name, image_tag)

Sending build context to Docker daemon  44.84MB
Step 1/5 : FROM python:3.10.4-slim-buster
 ---> e00cda196d23
Step 2/5 : WORKDIR /app
 ---> Using cache
 ---> 875dc2c67660
Step 3/5 : COPY ./data_encoding/requirements.txt /app
 ---> Using cache
 ---> 29a16b63819d
Step 4/5 : RUN pip install -r requirements.txt
 ---> Using cache
 ---> 3921c117b536
Step 5/5 : COPY ./data_encoding/data_encoding.py /app
 ---> 6ae6f56103cf
Successfully built 6ae6f56103cf
Successfully tagged data-encoding-fpl:latest
The push refers to repository [docker.io/pelvic/data-encoding-fpl]
153ca746b344: Preparing
e1a01a7715d4: Preparing
07c546a44f44: Preparing
e189e6d19dae: Preparing
c41bbba2c89c: Preparing
361093c2629f: Preparing
69b6043419ca: Preparing
4bdae028fbe3: Preparing
361093c2629f: Waiting
69b6043419ca: Waiting
10e6bc6fdee2: Preparing
4bdae028fbe3: Waiting
10e6bc6fdee2: Waiting
c41bbba2c89c: Layer already exists
e189e6d19dae: Layer already exists
07c546a44f44: Layer already exists
e1a01a7715d4: Layer already e

## MODEL TRAINING AND TESTING

In [52]:

# Creating the content of the model training script

model_train_content = """
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn import linear_model
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import r2_score, mean_squared_error
import argparse
import pandas as pd
import numpy as np
import pickle

def model_training(args):
    df_test = pd.read_csv(args.test)
    df_test.set_index('year', inplace=True)
    print(df_test.head())
    df_allseasons_final = pd.read_csv(args.target)
    print(df_allseasons_final.head())
    target = df_allseasons_final['total_points']
    features_norm = np.load(args.features_path, allow_pickle=True)
    df_test_dict = df_test.to_dict(orient='records')
    rf = RandomForestRegressor(random_state=2)
    final_model = rf.fit(features_norm, target)

    with open(args.dv_path, 'rb') as f_in1:
        dv = pickle.load(f_in1)
    with open(args.scaler_path, 'rb') as f_in2:
        scaler = pickle.load(f_in2)

    test_encoded = dv.transform(df_test_dict)
    vocab = dv.vocabulary_
    test_transformed = pd.DataFrame(test_encoded, columns=dv.feature_names_)
    test_norm = scaler.transform(test_transformed)
    print(test_norm)
    predicted = final_model.predict(test_norm)
    print(predicted[0:11])
    # df_predicted = pd.Series(predicted)
    RSME_score = mean_squared_error(y_true=df_test['total_points'], y_pred=predicted, squared=False) #squared=False will RMSE instead of MSE
    R2_score = r2_score(df_test['total_points'], predicted)

    print('RMSE:', RSME_score)
    print('R-Squared:', R2_score)
    print()

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--features_path')
    parser.add_argument('--target')
    parser.add_argument('--test')
    parser.add_argument('--dv_path')
    parser.add_argument('--scaler_path')
    args = parser.parse_args()
    model_training(args) 
"""

In [53]:
# Writing the contents to the model_train.py

model_train_path = "./model_training"
if not os.path.exists(model_train_path):
    os.makedirs(model_train_path)

with open(f"{model_train_path}/model_train.py", 'w') as model_train_file:
    model_train_file.write(model_train_content)

In [54]:
# Defining the content of the requirement.txt file

train_req_content = """
pandas
sklearn
"""

In [55]:
# Writing the content to the model training requirements.txt file

train_req_path = model_train_path

with open(f"{train_req_path}/requirements.txt", 'w') as train_req:
    train_req.write(train_req_content)

In [56]:
# Defining the content of the model training Dockerfile

train_dockerfile_content = f"""
FROM python:3.10.4-slim-buster

WORKDIR /app

COPY {model_train_path}/requirements.txt /app
RUN pip install -r requirements.txt

COPY {model_train_path}/model_train.py /app
"""

In [57]:
# Writing the contents to the model training Dockerfile

train_dockerfile_path = model_train_path

with open(f"{train_dockerfile_path}/Dockerfile", 'w') as train_dockerfile:
    train_dockerfile.write(train_dockerfile_content)

In [58]:
# Building the model training image and pushing it to the container registry

dockerfile_dir = train_dockerfile_path

image_name = 'model_train-fpl'
image_tag = 'latest'

model_train_image = docker_image_build(image_name, image_tag)

Sending build context to Docker daemon  44.84MB
Step 1/5 : FROM python:3.10.4-slim-buster
 ---> e00cda196d23
Step 2/5 : WORKDIR /app
 ---> Using cache
 ---> 875dc2c67660
Step 3/5 : COPY ./model_training/requirements.txt /app
 ---> Using cache
 ---> 1294e7db94df
Step 4/5 : RUN pip install -r requirements.txt
 ---> Using cache
 ---> 2264788c19c6
Step 5/5 : COPY ./model_training/model_train.py /app
 ---> 95aef0398f53
Successfully built 95aef0398f53
Successfully tagged model_train-fpl:latest
The push refers to repository [docker.io/pelvic/model_train-fpl]
62c2965047f4: Preparing
409ad426b17f: Preparing
4663a8c45f5b: Preparing
e189e6d19dae: Preparing
c41bbba2c89c: Preparing
361093c2629f: Preparing
69b6043419ca: Preparing
4bdae028fbe3: Preparing
10e6bc6fdee2: Preparing
69b6043419ca: Waiting
4bdae028fbe3: Waiting
10e6bc6fdee2: Waiting
361093c2629f: Waiting
4663a8c45f5b: Layer already exists
c41bbba2c89c: Layer already exists
e189e6d19dae: Layer already exists
409ad426b17f: Layer already exist

In [32]:
os.system('pip install kfp')



0

In [122]:
# Compile the components together

import kfp
from kfp import dsl

def data_download_op():
    return dsl.ContainerOp(
        name='Download Data',
        image=f'{data_download_image}',
        arguments=[],
        command=["python", "data_download.py"],
        file_outputs={
            'allseasons': '/app/all_seasons_hist.csv',
            'hist': '/app/hist_data.csv',
            'players': '/app/players_df.csv',
            'currentseason': '/app/currentseason.csv',
            'fixtures': '/app/fixtures.csv',
        }
    )

def feature_eng_op(allseasons, hist,  players_df, currentseason, fixtures_df):
    return dsl.ContainerOp(
        name='Feature Engineering',
        image=f'{feature_eng_image}',
        arguments=[
            '--allseasons', allseasons,
            '--hist', hist,
            '--players_df', players_df,
            '--currentseason', currentseason,
            '--fixtures_df', fixtures_df
        ],
        command=["python", "feat_eng.py"],
        file_outputs={
            'train': '/app/all_seasons_clean_hist.csv',
            'test': '/app/df_test.csv',
        }
    )

def data_encoding_op(train):
    return dsl.ContainerOp(
        name='Data Encoding',
        image=f'{data_encoding_image}',
        arguments=[
            '--train', train
        ],
        command=["python", "data_encoding.py"],
        file_outputs={
            'feature_path': '/app/data_encoding/preprocessed_data/features.npy',
            'target': '/app/target.csv',
            'dv':'/app/model/dv',
            'scaler' : '/app/model/min_max_scaler',
        }
    )

def model_train_op(features_path, target, dv_path, scaler_path, test):
    return dsl.ContainerOp(
        name='Train Model',
        image=f'{model_train_image}',
        arguments=[
            '--features_path', features_path,
            '--target',target,
            '--dv_path', dv_path,
            '--scaler_path', scaler_path,
            '--test', test
        ],
        command=["python", "model_train.py"],
    )
    


In [123]:
@dsl.pipeline(
   name='REGRESSION ML workflow pipeline',
   description='A pipeline for a regressor job for predicting the total points in FPL'
)
def FPL_pipeline():
    _data_download_op = data_download_op()

    _feature_eng_op = feature_eng_op(
        dsl.InputArgumentPath(_data_download_op.outputs['allseasons']),
        dsl.InputArgumentPath(_data_download_op.outputs['hist']),
        dsl.InputArgumentPath(_data_download_op.outputs['players']),
        dsl.InputArgumentPath(_data_download_op.outputs['currentseason']),
        dsl.InputArgumentPath(_data_download_op.outputs['fixtures']),
    ).after(_data_download_op)

    _data_encoding_op = data_encoding_op(
        dsl.InputArgumentPath(_feature_eng_op.outputs['train']),
    ).after(_feature_eng_op)
   
    _model_train_op = model_train_op(
        dsl.InputArgumentPath(_data_encoding_op.outputs['feature_path']),
        dsl.InputArgumentPath(_data_encoding_op.outputs['target']),
        dsl.InputArgumentPath(_data_encoding_op.outputs['dv']),
        dsl.InputArgumentPath(_data_encoding_op.outputs['scaler']),
        dsl.InputArgumentPath(_feature_eng_op.outputs['test']),
    ).after(_data_encoding_op)

if __name__ == '__main__':
    kfp.compiler.Compiler().compile(FPL_pipeline, 'FPL_pipeline.yaml')