In [3]:
import pandas as pd
import numpy as np
import os

In [None]:
# scripts/preprocess.py
import pandas as pd

def load_data(path="data/athletes.csv"):
    """Read the athletes CSV into a DataFrame and print a short preview.

    Args:
        path: Location of the CSV file. Defaults to ``"data/athletes.csv"``,
            the location that used to be hard-coded, so existing calls
            without arguments behave as before.

    Returns:
        pandas.DataFrame: The parsed contents of the file.
    """
    df = pd.read_csv(path)
    # Preview of the first rows, printed as a side effect of loading.
    print(df.head())
    return df


In [None]:
from datetime import datetime

data = load_data()

# Cleaning as before
# Keep only rows that have a value in every survey / lift column listed here.
# ('train' must be present at this point even though the column is dropped just below.)
data = data.dropna(subset=['region','age','weight','height','howlong','gender','eat',
                           'train','background','experience','schedule','deadlift',
                           'candj','snatch','backsq'])
# Remove identifier and benchmark columns; errors='ignore' tolerates columns that are absent.
data = data.drop(columns=['affiliate','team','name','athlete_id','fran','helen','grace',
                          'filthy50','fgonebad','run400','run5k','pullups','train'], errors='ignore')

# Plausibility filters on demographics and body measurements.
data = data[data['weight'] < 1500]
data = data[data['gender'] != '--']
data = data[data['age'] >= 18]
data = data[(data['height'] < 96) & (data['height'] > 48)]

# Deadlift must be positive and within a gender-specific cap: 636 for 'Female', 1105 otherwise.
# The earlier expression OR-ed the 1105 cap with the female-only 636 cap, so the
# 636 limit never excluded any row; each cap is now applied to its own group.
is_female = data['gender'] == 'Female'
data = data[(data['deadlift'] > 0) & ((is_female & (data['deadlift'] <= 636)) |
                                      (~is_female & (data['deadlift'] <= 1105)))]
data = data[(data['candj'] > 0) & (data['candj'] <= 395)]
data = data[(data['snatch'] > 0) & (data['snatch'] <= 496)]
data = data[(data['backsq'] > 0) & (data['backsq'] <= 1069)]

# Cells holding exactly this survey placeholder become NaN so the dropna below removes them.
decline_dict = {'Decline to answer|': np.nan}
data = data.replace(decline_dict)
data = data.dropna(subset=['background','experience','schedule','howlong','eat'])

# Add timestamp
# A single "now" value is stamped on every row.
data["timestamp"] = datetime.now()

# Save cleaned file for Feast source
# NOTE(review): this overwrites the same file load_data() reads. The saved file no
# longer contains the 'train' column required by the first dropna above, so re-running
# this cell against its own output fails there -- keep a copy of the raw export.
data.to_csv("data/athletes.csv", index=False)


In [2]:
# features/feature_repo/example_feature_view.py

from feast import FeatureView, Field
from feast.types import Float32, String
from feast.infra.offline_stores.file_source import FileSource
from datetime import timedelta

# Offline source shared by both feature-view versions.
# `timestamp_field` replaces the deprecated `event_timestamp_column` keyword and names
# the column used as the event time of each row.
# NOTE(review): Feast documents FileSource for Parquet files, while this path points at
# a CSV -- confirm the offline store can actually read it.
athletes_source = FileSource(
    path="../data/athletes.csv",
    timestamp_field="timestamp",
)

# Version 1: height and weight only. No entities are declared for this view.
athletes_view_v1 = FeatureView(
    name="athlete_features_v1",
    ttl=timedelta(days=365),
    entities=[],
    schema=[
        Field(name="height", dtype=Float32),
        Field(name="weight", dtype=Float32),
    ],
    source=athletes_source,
)

# Version 2: the v1 features plus age, read from the same source.
athletes_view_v2 = FeatureView(
    name="athlete_features_v2",
    ttl=timedelta(days=365),
    entities=[],
    schema=[
        Field(name="height", dtype=Float32),
        Field(name="weight", dtype=Float32),
        Field(name="age", dtype=Float32),
    ],
    source=athletes_source,
)


  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [3]:
import os
# Report the kernel's current working directory.
os.getcwd()

'd:\\UCHICAGO\\UChicago Courses\\Machine Learning Operations\\Week3\\mlops_assignment2\\venv\\notebooks'

In [7]:
# Move the kernel into the Feast repository directory so that `!feast apply` below runs there.
# NOTE(review): machine-specific absolute path -- this cell only works with the author's drive layout.
os.chdir("d:\\UCHICAGO\\UChicago Courses\\Machine Learning Operations\\Week3\\mlops_assignment2\\venv\\features\\feature_repo\\feature_repo")

In [8]:
# Apply the Feast repository located in the current working directory.
# NOTE(review): the recorded output lists only the template `driver_*` objects and no
# `athlete_features_*` view -- confirm the athlete feature-view file is inside this repo directory.
!feast apply

Applying changes for project feature_repo
Created project feature_repo
Created entity driver
Created feature view driver_hourly_stats_fresh
Created feature view driver_hourly_stats
Created on demand feature view transformed_conv_rate_fresh
Created on demand feature view transformed_conv_rate
Created feature service driver_activity_v1
Created feature service driver_activity_v3
Created feature service driver_activity_v2

Created sqlite table feature_repo_driver_hourly_stats_fresh
Created sqlite table feature_repo_driver_hourly_stats



None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
  path="data\driver_stats.parquet",
  driver = Entity(name="driver", join_keys=["driver_id"])


In [None]:
from feast import FeatureStore

store = FeatureStore(repo_path="features/feature_repo/feature_repo/")
version = "v1"  # or "v2"

feature_view = f"athlete_features_{version}"

# Entity dataframe: the label (deadlift) plus the event time of each row.
# Feast looks for a datetime column named `event_timestamp`, so the CSV's text
# `timestamp` column is parsed as dates and renamed.
entity_df = pd.read_csv("data/athletes.csv", parse_dates=["timestamp"])[["timestamp", "deadlift"]]  # must include label
entity_df = entity_df.rename(columns={"timestamp": "event_timestamp"})

# get_historical_features takes "<view>:<feature>" reference strings, not the Field
# objects stored on the feature view, so build the references from the field names.
feature_refs = [
    f"{feature_view}:{field.name}"
    for field in store.get_feature_view(feature_view).features
]

data = store.get_historical_features(
    entity_df=entity_df,
    features=feature_refs
).to_df()

In [18]:
# Write the current `data` frame to df.csv in the working directory, without the index column.
data.to_csv("df.csv", index=False)

In [14]:
# Inspect which columns `data` currently holds.
data.columns

Index(['region', 'gender', 'age', 'height', 'weight', 'candj', 'snatch',
       'deadlift', 'backsq', 'eat', 'background', 'experience', 'schedule',
       'howlong'],
      dtype='object')

###  `train.py`: the training script, which is run from the terminal

In [None]:
# scripts/train.py  (NOTE(review): the cells below invoke mlflow_scripts/train.py -- confirm which path is current)

import argparse
import mlflow
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from codecarbon import EmissionsTracker


def load_and_preprocess_data(path="data/athletes.csv"):
    """Load the athletes CSV and keep only rows that are complete in the key columns.

    No outlier or range filtering happens here; this function only reads the
    file and drops rows with missing values in the columns listed below.

    Args:
        path: Location of the CSV file. Defaults to ``"data/athletes.csv"``,
            the location that used to be hard-coded.

    Returns:
        pandas.DataFrame: A new frame with every row that has a value in all
        of the key columns. Other columns are left untouched.
    """
    # Columns that must be non-null for a row to be usable.
    key_columns = [
        'region', 'age', 'weight', 'height', 'howlong', 'gender', 'eat',
        'background', 'experience', 'schedule', 'deadlift', 'candj',
        'snatch', 'backsq'
    ]

    data = pd.read_csv(path)

    # dropna returns a new DataFrame, so the caller never shares state with the raw read.
    return data.dropna(subset=key_columns)


def train_model(df, version, alpha, features):
    """Fit a preprocessing + Ridge pipeline on ``df`` and record the run in MLflow.

    Numeric feature columns are standardised, object/category columns are
    one-hot encoded, and the fitted pipeline, its parameters, the MSE and the
    measured carbon emissions are logged to the ``athlete_experiment``
    experiment under ``file:./mlruns``.

    Args:
        df: DataFrame containing a ``deadlift`` column (the target) and every
            column named in ``features``.
        version: Label logged as the ``feature_version`` run parameter.
        alpha: Regularisation strength passed to ``Ridge``.
        features: Names of the columns used as model inputs.

    Raises:
        Exception: Any error raised while fitting or logging is printed and
            then re-raised, after the emissions tracker has been stopped.

    Note:
        The logged ``mse`` is computed on the same rows the model was fitted
        on, so it is an in-sample figure.
    """
    mlflow.set_tracking_uri("file:./mlruns")
    mlflow.set_experiment("athlete_experiment")

    # Define target and features
    y = df["deadlift"]
    X = df[features]

    # Determine which columns are numeric or categorical.
    # "number" covers every numeric width (e.g. float32 as well as float64); columns that
    # match neither selector are not passed to the model by the ColumnTransformer below.
    numeric_features = X.select_dtypes(include="number").columns.tolist()
    categorical_features = X.select_dtypes(include=["object", "category"]).columns.tolist()

    # Create preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ]
    )

    # Full pipeline: preprocessing + Ridge regression
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', Ridge(alpha=alpha))
    ])

    tracker = EmissionsTracker()
    tracker.start()
    # Tracks whether stop() still has to be called, so the tracker is stopped exactly once.
    tracker_running = True

    try:
        with mlflow.start_run():
            pipeline.fit(X, y)
            preds = pipeline.predict(X)
            mse = mean_squared_error(y, preds)

            mlflow.log_param("feature_version", version)
            mlflow.log_param("alpha", alpha)
            mlflow.log_param("features", features)
            mlflow.log_metric("mse", mse)

            mlflow.sklearn.log_model(pipeline, "pipeline_model")

            emissions = tracker.stop()
            tracker_running = False
            # stop() may return None when no measurement is available; skip the metric then.
            if emissions is not None:
                mlflow.log_metric("carbon_emissions_kg", emissions)

    except Exception as e:
        print(f"An error occurred: {e}")
        # Re-raise so a failed training run is not reported as a success to the caller.
        raise
    finally:
        if tracker_running:
            tracker.stop()


if __name__ == "__main__":
    # Command-line entry point: every option is mandatory.
    #   --version   label recorded with the run
    #   --alpha     Ridge regularisation strength (float)
    #   --features  one or more column names used as model inputs
    cli = argparse.ArgumentParser()
    for flag, options in (
        ("--version", {"type": str}),
        ("--alpha", {"type": float}),
        ("--features", {"nargs": "+"}),
    ):
        cli.add_argument(flag, required=True, **options)
    opts = cli.parse_args()

    # Load the data once, then train a single model with the requested settings.
    train_model(load_and_preprocess_data(), opts.version, opts.alpha, opts.features)


In [17]:
# Report the kernel's current working directory before launching the training script.
os.getcwd()

'd:\\UCHICAGO\\UChicago Courses\\Machine Learning Operations\\Week3\\mlops_assignment2\\venv'

In [4]:
# Switch to the directory from which the relative path mlflow_scripts/train.py is invoked below.
# NOTE(review): machine-specific absolute path -- this cell only works with the author's drive layout.
os.chdir("d:\\UCHICAGO\\UChicago Courses\\Machine Learning Operations\\Week3\\mlops_assignment2\\venv\\")

In [5]:
# Train with the features height and weight, version label v1, alpha = 0.5.
!python mlflow_scripts/train.py --version v1 --alpha 0.5 --features height weight

2025/07/15 19:46:37 INFO mlflow.tracking.fluent: Experiment with name 'athlete_experiment2' does not exist. Creating a new experiment.
[codecarbon INFO @ 19:46:37] [setup] RAM Tracking...
[codecarbon INFO @ 19:46:37] [setup] CPU Tracking...
 Windows OS detected: Please install Intel Power Gadget to measure CPU

[codecarbon INFO @ 19:46:39] CPU Model on constant consumption mode: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
[codecarbon INFO @ 19:46:39] [setup] GPU Tracking...
[codecarbon INFO @ 19:46:39] No GPU found.
[codecarbon INFO @ 19:46:39] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: global constant
                GPU Tracking Method: Unspecified
            
[codecarbon INFO @ 19:46:39] >>> Tracker's metadata:
[codecarbon INFO @ 19:46:39]   Platform system: Windows-11-10.0.26100-SP0
[codecarbon INFO @ 19:46:39]   Python version: 3.12.3
[codecarbon INFO @ 19:46:39]   CodeCarbon

In [20]:
# Same command as the previous training cell (height, weight, v1, alpha = 0.5), run again.
!python mlflow_scripts/train.py --version v1 --alpha 0.5 --features height weight

[codecarbon INFO @ 21:43:42] [setup] RAM Tracking...
[codecarbon INFO @ 21:43:42] [setup] CPU Tracking...
 Windows OS detected: Please install Intel Power Gadget to measure CPU

[codecarbon INFO @ 21:43:44] CPU Model on constant consumption mode: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
[codecarbon INFO @ 21:43:44] [setup] GPU Tracking...
[codecarbon INFO @ 21:43:44] No GPU found.
[codecarbon INFO @ 21:43:44] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: global constant
                GPU Tracking Method: Unspecified
            
[codecarbon INFO @ 21:43:44] >>> Tracker's metadata:
[codecarbon INFO @ 21:43:44]   Platform system: Windows-11-10.0.26100-SP0
[codecarbon INFO @ 21:43:44]   Python version: 3.12.3
[codecarbon INFO @ 21:43:44]   CodeCarbon version: 3.0.2
[codecarbon INFO @ 21:43:44]   Available RAM : 31.709 GB
[codecarbon INFO @ 21:43:44]   CPU count: 8 thread(s) in 8 phy

In [14]:
# Train with the features height and weight, version label v1, alpha = 1.0.
!python mlflow_scripts/train.py --version v1 --alpha 1.0 --features height weight

[codecarbon INFO @ 21:33:02] [setup] RAM Tracking...
[codecarbon INFO @ 21:33:02] [setup] CPU Tracking...
 Windows OS detected: Please install Intel Power Gadget to measure CPU

[codecarbon INFO @ 21:33:04] CPU Model on constant consumption mode: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
[codecarbon INFO @ 21:33:04] [setup] GPU Tracking...
[codecarbon INFO @ 21:33:04] No GPU found.
[codecarbon INFO @ 21:33:04] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: global constant
                GPU Tracking Method: Unspecified
            
[codecarbon INFO @ 21:33:04] >>> Tracker's metadata:
[codecarbon INFO @ 21:33:04]   Platform system: Windows-11-10.0.26100-SP0
[codecarbon INFO @ 21:33:04]   Python version: 3.12.3
[codecarbon INFO @ 21:33:04]   CodeCarbon version: 3.0.2
[codecarbon INFO @ 21:33:04]   Available RAM : 31.709 GB
[codecarbon INFO @ 21:33:04]   CPU count: 8 thread(s) in 8 phy

In [15]:
# Train with height, weight and age, alpha = 0.5. These three columns are the
# athlete_features_v2 schema, so the run is labelled v2 (it was mislabelled v1).
!python mlflow_scripts/train.py --version v2 --alpha 0.5 --features height weight age

[codecarbon INFO @ 21:33:22] [setup] RAM Tracking...
[codecarbon INFO @ 21:33:22] [setup] CPU Tracking...
 Windows OS detected: Please install Intel Power Gadget to measure CPU

[codecarbon INFO @ 21:33:23] CPU Model on constant consumption mode: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
[codecarbon INFO @ 21:33:23] [setup] GPU Tracking...
[codecarbon INFO @ 21:33:23] No GPU found.
[codecarbon INFO @ 21:33:23] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: global constant
                GPU Tracking Method: Unspecified
            
[codecarbon INFO @ 21:33:23] >>> Tracker's metadata:
[codecarbon INFO @ 21:33:23]   Platform system: Windows-11-10.0.26100-SP0
[codecarbon INFO @ 21:33:23]   Python version: 3.12.3
[codecarbon INFO @ 21:33:23]   CodeCarbon version: 3.0.2
[codecarbon INFO @ 21:33:23]   Available RAM : 31.709 GB
[codecarbon INFO @ 21:33:23]   CPU count: 8 thread(s) in 8 phy

In [16]:
# Train with height, weight and age, alpha = 1.0. These three columns are the
# athlete_features_v2 schema, so the run is labelled v2 (it was mislabelled v1).
!python mlflow_scripts/train.py --version v2 --alpha 1.0 --features height weight age

[codecarbon INFO @ 21:36:20] [setup] RAM Tracking...
[codecarbon INFO @ 21:36:20] [setup] CPU Tracking...
 Windows OS detected: Please install Intel Power Gadget to measure CPU

[codecarbon INFO @ 21:36:21] CPU Model on constant consumption mode: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
[codecarbon INFO @ 21:36:21] [setup] GPU Tracking...
[codecarbon INFO @ 21:36:21] No GPU found.
[codecarbon INFO @ 21:36:21] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: global constant
                GPU Tracking Method: Unspecified
            
[codecarbon INFO @ 21:36:21] >>> Tracker's metadata:
[codecarbon INFO @ 21:36:21]   Platform system: Windows-11-10.0.26100-SP0
[codecarbon INFO @ 21:36:21]   Python version: 3.12.3
[codecarbon INFO @ 21:36:21]   CodeCarbon version: 3.0.2
[codecarbon INFO @ 21:36:21]   Available RAM : 31.709 GB
[codecarbon INFO @ 21:36:21]   CPU count: 8 thread(s) in 8 phy

In [1]:
#!mlflow ui

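###  Adding age to height and weight reduces the MSE, and alpha = 0.5 is the best setting in terms of both carbon emissions and MSE

Note: the MSE is computed on the same rows the model was fitted on (see `train_model`), so it is an in-sample figure.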