# data.py

In [None]:
def get_data(nrows=100, url='s3://wagon-public-datasets/taxi-fare-train.csv'):
    """Read the taxi-fare training CSV into a DataFrame.

    Args:
        nrows: Maximum number of rows to read. Defaults to 100 (the value
            that used to be hard-coded); pass ``None`` to read every row.
        url: Path or URL handed to ``pd.read_csv``. Defaults to the public
            training file on S3.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    return pd.read_csv(url, nrows=nrows)

def get_data_from_gcp():
    """Download the training CSV from the GCS bucket and load it.

    The blob at ``BUCKET_TRAIN_DATA_PATH`` in bucket ``BUCKET_NAME`` is saved
    to ``data/data_from_gcp.csv`` (relative to the current working directory)
    and then parsed with ``pd.read_csv``.

    Returns:
        The DataFrame read from the downloaded file.
    """
    import os  # local import: this module's import block is not visible here

    data_file = 'data/data_from_gcp.csv'

    # The download writes straight to ``data_file``; create the parent
    # directory first so a fresh working directory does not make it fail.
    os.makedirs(os.path.dirname(data_file), exist_ok=True)

    # ``Client().bucket(...)`` returns a bucket handle, not a client.
    bucket = storage.Client().bucket(BUCKET_NAME)
    blob = bucket.blob(BUCKET_TRAIN_DATA_PATH)
    blob.download_to_filename(data_file)

    return pd.read_csv(data_file)

# setup.py

In [None]:
from setuptools import find_packages
from setuptools import setup

# Pinned dependencies, grouped by the role they play in the package.
_TRAINING_DEPS = [
    'numpy==1.18.4',
    'pandas==0.24.2',
    'scikit-learn==0.20.4',
    'joblib==0.14.1',
]
_TRACKING_DEPS = [
    'memoized-property==1.0.3',
    'mlflow==1.8.0',
]
_STORAGE_DEPS = [
    's3fs==0.4.2',
    'gcsfs==0.6.0',
    'google-cloud-storage==1.26.0',
]
_LOGS_DEPS = [
    'termcolor==1.1.0',
]

# Flat list in the order training, tracking, storage, logs.
REQUIRED_PACKAGES = [
    *_TRAINING_DEPS,
    *_TRACKING_DEPS,
    *_STORAGE_DEPS,
    *_LOGS_DEPS,
]

# Keyword arguments forwarded unchanged to setuptools.setup().
_PACKAGE_METADATA = {
    'name': 'TaxiFareModel450',
    'version': '1.0',
    'description': 'Taxi Fare Prediction Pipeline 450',
    # Dependencies come from the pinned REQUIRED_PACKAGES list.
    'install_requires': REQUIRED_PACKAGES,
    'packages': find_packages(),
    'include_package_data': True,
}

setup(**_PACKAGE_METADATA)

# params.py

In [None]:
# Name of the storage bucket.
BUCKET_NAME = 'le-wagon-data-gmanchon-batch-414'

# Path of the training dataset inside the bucket.
# Alternative kept for reference (test dataset): 'data_for_prediction.csv'
BUCKET_TRAIN_DATA_PATH = 'data/train_1k.csv'

# Base path segment under which the joblib model is stored.
MODEL_BASE_PATH = 'taxifare_450'

# gcp.py

In [None]:
from Taxifare.params import BUCKET_NAME, MODEL_BASE_PATH

from google.cloud import storage

def upload_model_to_gcp(local_model_filename, gcp_model_path):
    """Upload a local model file to the ``BUCKET_NAME`` bucket.

    The blob name is
    ``models_450/<MODEL_BASE_PATH>/<gcp_model_path>/model.joblib``.

    Args:
        local_model_filename: Path of the local file to upload.
        gcp_model_path: Path segment placed between ``MODEL_BASE_PATH`` and
            ``model.joblib`` in the blob name.
    """
    destination = f'models_450/{MODEL_BASE_PATH}/{gcp_model_path}/model.joblib'

    # ``Client().bucket(...)`` returns a bucket handle for BUCKET_NAME.
    bucket = storage.Client().bucket(BUCKET_NAME)
    bucket.blob(destination).upload_from_filename(local_model_filename)

# Makefile

In [None]:
# ---- storage ---------------------------------------------------------------
# Bucket used in the --job-dir of the training job.
BUCKET_NAME := le-wagon-data-gmanchon-batch-414

# Folder inside the bucket used in the --job-dir of the training job.
BUCKET_TRAINING_FOLDER := trainings

# ---- training params -------------------------------------------------------
REGION := europe-west1

# ---- app environment -------------------------------------------------------
PYTHON_VERSION := 3.7
FRAMEWORK := scikit-learn
RUNTIME_VERSION := 1.15

# ---- package params --------------------------------------------------------
# The training module is ${PACKAGE_NAME}.${FILENAME}.
PACKAGE_NAME := Taxifare
FILENAME := trainer

# ---- prediction ------------------------------------------------------------
# Script name (without .py) used by the `predict` target; disabled by default.
# PRED_FILENAME=predict

# ---- job -------------------------------------------------------------------
# Deferred (`=`) on purpose: the timestamp is taken when the name is expanded.
JOB_NAME=taxi_fare_training_pipeline_450_$(shell date +'%Y%m%d_%H%M%S')

# Submit the ${PACKAGE_NAME}.${FILENAME} module as an AI Platform training job
# named ${JOB_NAME}, with gs://${BUCKET_NAME}/${BUCKET_TRAINING_FOLDER} as the
# job directory, and stream its logs.
# Declared phony because the target name is a command, not a file to build.
.PHONY: gcp_submit_training
gcp_submit_training:
	gcloud ai-platform jobs submit training ${JOB_NAME} \
		--job-dir gs://${BUCKET_NAME}/${BUCKET_TRAINING_FOLDER} \
		--package-path ${PACKAGE_NAME} \
		--module-name ${PACKAGE_NAME}.${FILENAME} \
		--python-version=${PYTHON_VERSION} \
		--runtime-version=${RUNTIME_VERSION} \
		--config config.yaml \
		--region ${REGION} \
		--stream-logs

# Run the prediction script ${PRED_FILENAME}.py.
# PRED_FILENAME has no active default in this file, so fail with a clear
# message instead of running `python .py` when it is unset.
.PHONY: predict
predict:
	@test -n "${PRED_FILENAME}" || { echo "PRED_FILENAME is not set; run: make predict PRED_FILENAME=<script name without .py>" >&2; exit 1; }
	python ${PRED_FILENAME}.py