In [1]:
import os
import warnings
import sys
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import plotly
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
import random
from random import choices
from string import ascii_lowercase, digits
import datetime
from pathlib import Path
from functools import partial
from itertools import starmap
from dotenv import load_dotenv
import requests

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

import mlflow
from mlflow import MlflowClient
import mlflow.sklearn
from mlflow.models import infer_signature
from mlflow.models import Model
from mlflow.data.pandas_dataset import PandasDataset

# set mlflow traking uri
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

In [2]:
this_dir = pathlib.Path()
parent_dir = this_dir.resolve().parent
data_dir = this_dir / "data"

# create results directory, if not already existing
cwd = os.getcwd()
results_dir = os.path.join(cwd, r'baseline_results')
if not os.path.exists(results_dir):
    os.makedirs(results_dir)

# Implement a simple recommender algorithm as linear regression

USING THE EXISTING FUNCTIONS, modify the existing train function to train a linear regression RECOMMENDER MODEL, then train/test the RECOMMENDER MODEL using the same data.

Resource: https://medium.com/intro-to-artificial-intelligence/recommendation-engine-algorithm-collaborative-filtering-d1c837eaadfb

# Eval and plotting functions

In [3]:
# MODEL EVALUATION FUNCTIONS #

def eval_metrics(actual, pred):
        rmse = np.sqrt(mean_squared_error(actual, pred))
        mae = mean_absolute_error(actual, pred)
        r2 = r2_score(actual, pred)
        return rmse, mae, r2

# Data Prep

In [4]:
# Prepare data
def data_prep(filename='training_data_cumulative.csv'):
# Read and log the input data 
    data_filepath = data_dir / filename
    data = pd.read_csv(data_filepath)
    data_artifact = mlflow.data.from_pandas(data)  # log when run is started
    
    
    ## DATA PREPROCESSING STEPS ##
    
    # create target column
    data["hardware_unencoded"] = list(zip(data['# of cores'], data['memory (gb)']))
    data['hardware_as_strings'] = data['hardware_unencoded'].apply(lambda x: ', '.join(map(str, x)))
    
    # encode targets
    label_encoder = LabelEncoder()
    data['hardware'] = label_encoder.fit_transform(data['hardware_as_strings'])

    
    # remove noisy features 
    feature_cols = data[["area", "wind_speed", "wind_direction", "canopy_moisture", "surface_moisture"]]
    df = pd.concat([feature_cols, data['hardware']], axis=1)
    
    print(df.columns)
    # Replace infty values with NaN
    df.replace(['inf', np.inf, -np.inf], np.nan, inplace=True)

    # Impute NaN values with mean
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    sample_data = df.sample(frac=0.25).dropna()
    imp_mean.fit(df)
    df = pd.DataFrame(imp_mean.fit_transform(df), columns = df.columns)
    
    # Encode categorical features
    for col_name in feature_cols.columns.tolist():
        df[col_name] = df[col_name].astype('category').cat.codes
    
    return df, data_artifact
    
    
def data_split(df, seed=None, num_samples=None, autologging=True):
    # Determine number of samples to take from data
    if num_samples is not None:
        train = df.sample(n=num_samples, random_state=seed)
        df = df[~df.index.isin(train.index)]
    else:
        train = df
    
    # Split the data into training and testing (.8, .2) split
    test_size = round(df.shape[0]*0.2)
    train_temp, test = train_test_split(df, test_size=test_size, random_state=seed, shuffle=True)
    
    return train, test

# Training

In [5]:
## DEFINE MODEL TRAINING FUNCTIONS##
# TODO: Separate preprocessing and training/evaluation

def train_recommender(train, test, data_artifact, experiment_id, seed=None, num_epochs=None, autologging=True):
    mlflow.autolog()  # enable autologging
    mlflow.sklearn.autolog()


    # The target column
    train_x = train.drop(["hardware"], axis=1)
    test_x = test.drop(["hardware"], axis=1)
    train_y = train[["hardware"]]
    test_y = test[["hardware"]]

    
    ## storage settings ##
    model_type = 'LinearRegression'
    now = datetime.datetime.now().strftime("%Y_%m_%d_%I%M%S%p")
    run_name = model_type + '_' + now
    
    
    ## MLFLOW RUN ##
    # useful for multiple runs
    with mlflow.start_run(experiment_id=experiment_id, run_name=run_name, nested=True, log_system_metrics=True):
        
#         e_start = 1.0
#         e_decay = 0.99
        
#         for epoch in range(0, num_epochs):
#             # Decay epsilon
#             epsilon = max(e_start * e_decay, 0)
        
        # Log data
        mlflow.log_input(data_artifact, "input")
        train_dataset = mlflow.data.from_pandas(train, targets="hardware", source="data.csv")
        mlflow.log_input(train_dataset, context="training")


        # Execute recommender system
        model = LinearRegression()
        model.fit(train_x, train_y)


        # Evaluate Metrics
        pred_y = model.predict(test_x)
        (rmse, mae, r2) = eval_metrics(test_y, pred_y)

        
        
        # log evalmetrics
        mlflow.log_metric("eval_rmse", rmse)
        mlflow.log_metric("eval_r2", r2)
        mlflow.log_metric("eval_mae", mae)

        # Print metrics
        # print("LinearRegression model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
        # print("  RMSE: %s" % rmse)
        # print("  MAE: %s" % mae)
        # print("  R2: %s" % r2)



        # Create model artifact directory
        # cwd = os.getcwd()
        artifacts_dir = os.path.join(results_dir, run_name)
        if not os.path.exists(artifacts_dir):
            os.makedirs(artifacts_dir)
            
            
        # SET AND LOG PARAMETERS # 
        # (some model parameters will be autologged mlflow but not optuna)
        mlflow.log_param("model_seed", seed)


        ## IF AUTOLOGGING IS NOT ENABLED ##
        if autologging==False:
            
            # Log and download artifacts locally --- file upload not yet supported in NDP JupyterHub
            active_run = mlflow.active_run()
            mlflow.artifacts.download_artifacts(run_id = active_run.info.run_id, dst_path=artifacts_dir)
            mlflow.sklearn.log_model(sk_model=lr, input_example=test_x, artifact_path=artifacts_dir)
        
        


    mlflow.end_run()   # END CURRENT RUN BEFORE STARTING NEW RUN

In [6]:
def get_best_model(experiment_id=None):
    
    # Opt. for now: get best run
    best_run = mlflow.search_runs(
        experiment_id, order_by=["metrics.eval_rmse"], max_results=1
    )
    print(best_run.info)
    
    return None

# Run experiments

In [7]:
## EXPERIMENT FUNCTION ##
def run_experiment(num_runs=10, train_func=train_recommender, num_samples=None, num_epochs=None, experiment_name=None):
    
    
    # Generate experiment name
    random_suffix = "".join(choices(ascii_lowercase, k=2)+choices(digits, k=3))
    experiment_name = 'Recommender_'+random_suffix
        
    # create experiment (if not existing)
    try:
        mlflow.create_experiment(experiment_name)
    except:
        pass
    
    mlflow.set_experiment(experiment_name)

    
    # Get experiment ID
    experiment = mlflow.get_experiment_by_name(experiment_name)
    experiment_id = experiment.experiment_id
    
    # prep data
    df, data_artifact = data_prep()
    
    # Run experiments on train and test data
    
    for i in range(num_runs):
        # Generate random seed
        #warnings.filterwarnings("ignore")
        np_max_int = np.iinfo(np.int32).max
        seed = np.random.randint(np_max_int)
        np.random.seed(seed)
        
        # Generate train and test data
        train, test = data_split(df, num_samples=num_samples, seed=seed)
        
        # train recommender
        train_recommender(train, test, data_artifact, experiment_id, seed=seed)
        
    
    
    return experiment_id

In [8]:
# experiment = run_experiment(num_runs=100)

In [9]:
# get_best_model(experiment)

In [10]:
experiment_fewer_data = run_experiment(num_runs=10, num_samples=25)

2024/10/15 09:17:28 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.


Index(['area', 'wind_speed', 'wind_direction', 'canopy_moisture',
       'surface_moisture', 'hardware'],
      dtype='object')


2024/10/15 09:17:29 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2024/10/15 09:17:29 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
1       (2, 16)
2       (2, 16)
3       (2, 16)
4       (2, 16)
         ...   
1369    (4, 16)
1370    (4, 16)
1371    (4, 16)
1372    (4, 16)
1373    (4, 16)
Name: hardware_unencoded, Length: 1374, dtype: object. Error: Data (2, 16) is not one of the supported DataType
  return _dataset_source_registry.resolve(
2024/10/15 09:17:30 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/15 09:17:30 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!
2024/10/15 09:17:30 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.
2024/10/15 09:17:30 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
  return _dataset_source_registry.resolve(


2024/10/15 09:17:31 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/15 09:17:31 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!
2024/10/15 09:17:31 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.
2024/10/15 09:17:31 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
  return _dataset_source_registry.resolve(
2024/10/15 09:17:32 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/15 09:17:32 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!
2024/10/15 09:17:32 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.
2024/10/15 09:17:32 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
  return _dataset_source_registry.resolve(


2024/10/15 09:17:33 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/15 09:17:33 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!
2024/10/15 09:17:33 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.
2024/10/15 09:17:33 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
  return _dataset_source_registry.resolve(
2024/10/15 09:17:34 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/15 09:17:34 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!
2024/10/15 09:17:34 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.
2024/10/15 09:17:34 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
  return _dataset_source_registry.resolve(


2024/10/15 09:17:35 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/15 09:17:35 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!
2024/10/15 09:17:35 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.
2024/10/15 09:17:35 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
  return _dataset_source_registry.resolve(
2024/10/15 09:17:36 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/15 09:17:36 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!
2024/10/15 09:17:36 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.
2024/10/15 09:17:36 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
  return _dataset_source_registry.resolve(


2024/10/15 09:17:37 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/15 09:17:37 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!
2024/10/15 09:17:37 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.
2024/10/15 09:17:37 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
  return _dataset_source_registry.resolve(
2024/10/15 09:17:37 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/15 09:17:37 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!
2024/10/15 09:17:37 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.
2024/10/15 09:17:37 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
  return _dataset_source_registry.resolve(


2024/10/15 09:17:39 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/15 09:17:39 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [11]:
best_model = get_best_model(experiment_fewer_data)

<bound method DataFrame.info of                              run_id       experiment_id    status  \
0  4b7e8db267cb464597a66fc3f27e7a20  364283543281763039  FINISHED   

                                        artifact_uri  \
0  mlflow-artifacts:/364283543281763039/4b7e8db26...   

                        start_time                         end_time  \
0 2024-10-15 16:17:33.420000+00:00 2024-10-15 16:17:34.281000+00:00   

   metrics.training_root_mean_squared_error  metrics.training_score  \
0                                  0.800455                0.102119   

   metrics.eval_r2  metrics.eval_mae  ...  params.positive  params.copy_X  \
0        -0.115064          0.691251  ...            False           True   

   params.n_jobs                      tags.mlflow.log-model.history  \
0           None  [{"run_id": "4b7e8db267cb464597a66fc3f27e7a20"...   

  tags.mlflow.source.type tags.mlflow.user tags.estimator_name  \
0                   LOCAL             Hena    LinearRegression   


In [12]:
df_runs = mlflow.search_runs(experiment_fewer_data, max_results=100)

In [13]:
eval_rmse = df_runs["metrics.eval_rmse"]

In [14]:
df_runs.describe()

Unnamed: 0,metrics.training_root_mean_squared_error,metrics.training_score,metrics.eval_r2,metrics.eval_mae,metrics.training_mean_absolute_error,metrics.eval_rmse,metrics.training_mean_squared_error,metrics.training_r2_score
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.710856,0.195847,-0.283223,0.757559,0.606901,0.901578,0.509015,0.195847
std,0.064099,0.083286,0.137305,0.052419,0.071897,0.06164,0.091665,0.083286
min,0.609278,0.102119,-0.489607,0.691251,0.497414,0.818368,0.37122,0.102119
25%,0.681405,0.139218,-0.368572,0.718434,0.580799,0.863182,0.464362,0.139218
50%,0.717272,0.175143,-0.257067,0.746906,0.602252,0.890191,0.514483,0.175143
75%,0.725784,0.211961,-0.245934,0.786022,0.64454,0.940263,0.526777,0.211961
max,0.813835,0.365591,-0.077177,0.846002,0.714879,1.005064,0.662328,0.365591
