In [34]:
import mlflow
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import ShuffleSplit, GridSearchCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

In [2]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")

https://archive.ics.uci.edu/dataset/449/optical+interconnection+network

In [3]:
data_path = "data/network_data.csv"
data_path

'data/network_data.csv'

In [4]:
df = pd.read_csv(data_path, sep=";")
df

Unnamed: 0,Node Number,Thread Number,Spatial Distribution,Temporal Distribution,T/R,Processor Utilization,Channel Waiting Time,Input Waiting Time,Network Response Time,Channel Utilization,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,64,4,UN,Client-Server,01,0839546,1974686,308491814,700514102,0352431,,,,,
1,64,4,UN,Client-Server,02,0827412,9556437,291037663,864599227,0506302,,,,,
2,64,4,UN,Client-Server,03,0802605,27027618,264928002,839372851,0638516,,,,,
3,64,4,UN,Client-Server,04,0723403,61848511,235776888,1256053108,0767051,,,,,
4,64,4,UN,Client-Server,05,072121,121085884,189680044,1343875577,0807812,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,16,10,PS,Asynchronous,06,079234,532737732,363094043,1214057768,0961042,,,,,
636,16,10,PS,Asynchronous,07,0707199,75167511,224638088,1653736882,0981944,,,,,
637,16,10,PS,Asynchronous,08,061823,905326843,206922631,1421304936,09625,,,,,
638,16,10,PS,Asynchronous,09,0564482,1048269897,154713952,2371043062,0979375,,,,,


In [5]:
df.columns

Index(['Node Number', 'Thread Number', 'Spatial Distribution',
       'Temporal Distribution', 'T/R', 'Processor Utilization ',
       'Channel Waiting Time', 'Input Waiting Time', 'Network Response Time',
       'Channel Utilization', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12',
       'Unnamed: 13', 'Unnamed: 14'],
      dtype='object')

In [6]:
df.dtypes

Node Number                 int64
Thread Number               int64
Spatial Distribution       object
Temporal Distribution      object
T/R                        object
Processor Utilization      object
Channel Waiting Time       object
Input Waiting Time         object
Network Response Time      object
Channel Utilization        object
Unnamed: 10               float64
Unnamed: 11               float64
Unnamed: 12               float64
Unnamed: 13               float64
Unnamed: 14               float64
dtype: object

In [7]:
del(df["Spatial Distribution"])
del(df["Temporal Distribution"])
for index in range(10, 15): 
    del(df[f"Unnamed: {index}"])

In [8]:
for column in df.columns:
    df[column] = df[column].apply(lambda value: str(value).replace(",", "."))

In [9]:
df

Unnamed: 0,Node Number,Thread Number,T/R,Processor Utilization,Channel Waiting Time,Input Waiting Time,Network Response Time,Channel Utilization
0,64,4,0.1,0.839546,1.974686,308.491814,700.514102,0.352431
1,64,4,0.2,0.827412,9.556437,291.037663,864.599227,0.506302
2,64,4,0.3,0.802605,27.027618,264.928002,839.372851,0.638516
3,64,4,0.4,0.723403,61.848511,235.776888,1256.053108,0.767051
4,64,4,0.5,0.72121,121.085884,189.680044,1343.875577,0.807812
...,...,...,...,...,...,...,...,...
635,16,10,0.6,0.79234,532.737732,363.094043,1214.057768,0.961042
636,16,10,0.7,0.707199,751.67511,224.638088,1653.736882,0.981944
637,16,10,0.8,0.61823,905.326843,206.922631,1421.304936,0.9625
638,16,10,0.9,0.564482,1048.269897,154.713952,2371.043062,0.979375


In [10]:
targets = df["Channel Utilization"]
targets

0      0.352431
1      0.506302
2      0.638516
3      0.767051
4      0.807812
         ...   
635    0.961042
636    0.981944
637      0.9625
638    0.979375
639    0.980903
Name: Channel Utilization, Length: 640, dtype: object

In [11]:
del(df["Channel Utilization"])

In [12]:
df

Unnamed: 0,Node Number,Thread Number,T/R,Processor Utilization,Channel Waiting Time,Input Waiting Time,Network Response Time
0,64,4,0.1,0.839546,1.974686,308.491814,700.514102
1,64,4,0.2,0.827412,9.556437,291.037663,864.599227
2,64,4,0.3,0.802605,27.027618,264.928002,839.372851
3,64,4,0.4,0.723403,61.848511,235.776888,1256.053108
4,64,4,0.5,0.72121,121.085884,189.680044,1343.875577
...,...,...,...,...,...,...,...
635,16,10,0.6,0.79234,532.737732,363.094043,1214.057768
636,16,10,0.7,0.707199,751.67511,224.638088,1653.736882
637,16,10,0.8,0.61823,905.326843,206.922631,1421.304936
638,16,10,0.9,0.564482,1048.269897,154.713952,2371.043062


In [13]:
scaler = StandardScaler()
scaler

In [14]:
X_standardized = scaler.fit_transform(df)
X_standardized

array([[ 1.        , -1.34164079, -1.5666989 , ..., -0.98377711,
        -0.10600057, -0.66884868],
       [ 1.        , -1.34164079, -1.21854359, ..., -0.96391276,
        -0.18073814, -0.53230077],
       [ 1.        , -1.34164079, -0.87038828, ..., -0.91813791,
        -0.29253805, -0.55329359],
       ...,
       [-1.        ,  1.34164079,  0.87038828, ...,  1.3830243 ,
        -0.54091336, -0.06902294],
       [-1.        ,  1.34164079,  1.21854359, ...,  1.75753807,
        -0.76446761,  0.72132753],
       [-1.        ,  1.34164079,  1.5666989 , ...,  2.11485012,
        -0.8429935 ,  0.90600713]])

In [15]:
(len(X_standardized), len(X_standardized[0])), len(targets)

((640, 7), 640)

In [16]:
type(X_standardized), type(targets)

(numpy.ndarray, pandas.core.series.Series)

In [17]:
targets = np.asarray(targets, dtype=float)

---

In [18]:
experiment_name = 'RegressionWithMLflow'
try:
    exp_id = mlflow.create_experiment(name=experiment_name)
except Exception as e:
    exp_id = mlflow.get_experiment_by_name(experiment_name).experiment_id

In [19]:
mlflow.sklearn.autolog(max_tuning_runs=None) # enable autologging

In [39]:
parameters_svc = {"kernel": ["linear", "rbf"], "C": [0.1, 1]}
svc_model = SVR()
svc_model

In [21]:
indexes = ShuffleSplit(n_splits = 4, test_size = 0.2, random_state = 2023)
indexes

ShuffleSplit(n_splits=4, random_state=2023, test_size=0.2, train_size=None)

In [22]:
mse = make_scorer(mean_squared_error)
scores = {'mse': mse}
mse

make_scorer(mean_squared_error)

In [23]:
clf = GridSearchCV(svc_model, parameters_svc, scoring = scores, cv = indexes, return_train_score = True, refit = "mse", verbose = 1)
clf

In [27]:
with mlflow.start_run(experiment_id=exp_id):
    clf.fit(X_standardized, targets)

Fitting 4 folds for each of 4 candidates, totalling 16 fits


In [28]:
outcomes = pd.DataFrame(clf.cv_results_)
outcomes = outcomes[['params', 'mean_train_mse', 'mean_test_mse', 'std_test_mse']]

In [29]:
outcomes

Unnamed: 0,params,mean_train_mse,mean_test_mse,std_test_mse
0,"{'C': 0.1, 'kernel': 'linear'}",48311.241049,60033.889293,53197.563958
1,"{'C': 0.1, 'kernel': 'rbf'}",48311.187954,60035.663902,53198.914133
2,"{'C': 1, 'kernel': 'linear'}",48311.469312,60034.031243,53197.654425
3,"{'C': 1, 'kernel': 'rbf'}",48297.25577,60025.544113,53189.720907


---

**GridSearch for Several Models**

In [33]:
experiment_name = 'GridSearchRegressionMLflow'
try:
    exp_id = mlflow.create_experiment(name=experiment_name)
except Exception as e:
    exp_id = mlflow.get_experiment_by_name(experiment_name).experiment_id

In [40]:
rf_model = RandomForestRegressor()
parameters_rf = {'n_estimators': [5, 10, 15], 
                     'max_features': [2, 4, 6]}
rf_model

In [41]:
models = {svc_model: parameters_svc, rf_model: parameters_rf}
models

{SVR(): {'kernel': ['linear', 'rbf'], 'C': [0.1, 1]},
 RandomForestRegressor(): {'n_estimators': [5, 10, 15],
  'max_features': [2, 4, 6]}}

In [43]:
for model, parameters in models.items():
    run_name = str(model)
    print(run_name)
    with mlflow.start_run(experiment_id=exp_id, run_name=run_name):
        clf = GridSearchCV(model, parameters, scoring = scores, cv = indexes, 
                           return_train_score = True, refit = "mse", verbose = 1)
        clf.fit(X_standardized, targets)
        outcomes = pd.DataFrame(clf.cv_results_)
        outcomes = outcomes[['params', 'mean_train_mse', 'mean_test_mse', 'std_test_mse']]
        print(outcomes)

SVR()
Fitting 4 folds for each of 4 candidates, totalling 16 fits
                           params  mean_train_mse  mean_test_mse  std_test_mse
0  {'C': 0.1, 'kernel': 'linear'}    48311.241049   60033.889293  53197.563958
1     {'C': 0.1, 'kernel': 'rbf'}    48311.187954   60035.663902  53198.914133
2    {'C': 1, 'kernel': 'linear'}    48311.469312   60034.031243  53197.654425
3       {'C': 1, 'kernel': 'rbf'}    48297.255770   60025.544113  53189.720907
RandomForestRegressor()
Fitting 4 folds for each of 9 candidates, totalling 36 fits
                                    params  mean_train_mse  mean_test_mse  \
0   {'max_features': 2, 'n_estimators': 5}     2129.921811   19108.166702   
1  {'max_features': 2, 'n_estimators': 10}      745.366433    5618.135655   
2  {'max_features': 2, 'n_estimators': 15}     1307.430345    5793.386145   
3   {'max_features': 4, 'n_estimators': 5}     1220.732996    5657.279544   
4  {'max_features': 4, 'n_estimators': 10}      732.878392    2677.676