In [1]:
import azureml.core
from azureml.core import Workspace

# Connect to the Azure ML workspace described by the saved config file
ws = Workspace.from_config()

# Report the SDK version in use and the name of the workspace we connected to
sdk_version, workspace_name = azureml.core.VERSION, ws.name
print('Ready to use Azure ML {} to work with {}'.format(sdk_version, workspace_name))

Ready to use Azure ML 1.26.0 to work with mlw_sdk


In [5]:
from azureml.core import Dataset
print("Datasets:")
# Snapshot the registered dataset names first, then look each one up by name
# so its name, version and ID can be printed on one tab-indented line.
registered_names = list(ws.datasets.keys())
for dataset_name in registered_names:
    dataset = Dataset.get_by_name(ws, dataset_name)
    print("\t", dataset.name, 'version', dataset.version, 'ID', dataset.id)

Datasets:
	 feature_selected_train_FD001.csv version 1 ID a5e50b39-f65a-46b3-bcb5-664e2f7d7250
	 feature_selected_test_FD001.csv version 1 ID 0bc4c13e-638b-48ad-a070-29bfc88faee0
	 train_FD001.csv version 1 ID 3581f274-050c-49bb-99b7-a233646c6a1d
	 test_FD001.csv version 2 ID 95ec0879-e965-4b91-9e37-4b2aeec852ef
	 RUL_FD001.csv version 2 ID 549f56bd-c249-4602-bf96-ed07ee716af9


In [11]:
from azureml.core import Experiment
from azureml.core import Model
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression

# Open the 'linear_reg_exp' experiment in the workspace and start an interactive run
experiment = Experiment(workspace=ws, name='linear_reg_exp')
run = experiment.start_logging()
print("Starting experiment:", experiment.name)

# Load the registered FD001 datasets by ID: the feature-selected train and test
# sets and the RUL file (IDs match the dataset listing printed earlier).
print("Loading Data...")


def _load_frame(dataset_id):
    """Fetch the registered dataset with the given ID as a pandas DataFrame."""
    return Dataset.get_by_id(ws, dataset_id).to_pandas_dataframe()


train = _load_frame('a5e50b39-f65a-46b3-bcb5-664e2f7d7250')
test = _load_frame('0bc4c13e-638b-48ad-a070-29bfc88faee0')
y_test = _load_frame('549f56bd-c249-4602-bf96-ed07ee716af9')

# Column groups are identified by position in the training frame:
# the first two columns are index columns, the third is a setting column,
# and everything else is treated as a sensor column.
index_names = train.columns[[0, 1]]
setting_names = train.columns[[2]]
# TODO: find something better than union() for combining the label sets
non_sensor_names = index_names.union(setting_names)
sensor_names = train.drop(non_sensor_names, axis=1).columns
scale_columns = sensor_names
keep_columns = scale_columns.union(index_names[[1]])

def scale_data(train, test, columns):
    """Min-max scale the selected columns of ``train`` and ``test``.

    The scaler is fitted on ``train[columns]`` only and then applied to both
    frames, so ``test`` is scaled with the ranges learned from ``train``.
    Neither input frame is modified; scaled copies are returned.

    Args:
        train: DataFrame used to fit the scaler.
        test: DataFrame transformed with the already-fitted scaler.
        columns: Labels of the columns to scale (list or pandas Index).

    Returns:
        Tuple ``(scaled_train, scaled_test)`` of new DataFrames in which only
        ``columns`` have been replaced by their scaled values.
    """
    sc = MinMaxScaler()
    scaled_train = train.copy()
    scaled_test = test.copy()
    # Assigning a DataFrame to df[columns] aligns on the row index. The scaled
    # array must therefore carry the frame's own index (and labels); a default
    # RangeIndex would produce NaN/misaligned rows for any frame whose index
    # is not exactly 0..n-1.
    scaled_train[columns] = pd.DataFrame(
        sc.fit_transform(scaled_train[columns]),
        index=scaled_train.index,
        columns=columns,
    )
    scaled_test[columns] = pd.DataFrame(
        sc.transform(scaled_test[columns]),
        index=scaled_test.index,
        columns=columns,
    )
    return scaled_train, scaled_test

def add_remaining_useful_life(df):
    """Return a copy of ``df`` with an added ``RUL`` column.

    For every row, ``RUL`` is the largest ``time_cycles`` value observed for
    that row's ``unit_no`` minus the row's own ``time_cycles``. The input
    frame is not modified.
    """
    # Largest cycle per unit, as a one-column frame indexed by unit_no
    last_cycle = df.groupby(by="unit_no")["time_cycles"].max().to_frame(name='max_cycle')

    # Attach each unit's maximum to its rows, derive RUL, then drop the helper column
    with_rul = df.merge(last_cycle, left_on='unit_no', right_index=True)
    with_rul["RUL"] = with_rul["max_cycle"] - with_rul["time_cycles"]
    return with_rul.drop("max_cycle", axis=1)

def evaluate(y_true, y_hat, label):
    """Print and return the RMSE and R2 score of predictions.

    Args:
        y_true: Ground-truth target values.
        y_hat: Predicted target values.
        label: Name of the data split, used as the prefix of the printed line.

    Returns:
        Tuple ``(rmse, r2)``.
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_hat))
    r2 = r2_score(y_true, y_hat)
    summary = '{} set RMSE:{}, R2:{}'.format(label, rmse, r2)
    print(summary)
    return rmse, r2
    
# Build the training features/target and collapse the test set to one row
# per unit via groupby(...).last(), then scale the sensor columns.
x_train = add_remaining_useful_life(train)
y_train = x_train.pop('RUL')
x_test = test.groupby(by="unit_no").last().reset_index()
x_train_scaled, x_test_scaled = scale_data(x_train, x_test, scale_columns)

# NOTE(review): the model is fitted on every column of x_train_scaled, which
# still contains unit_no and time_cycles; keep_columns is defined above but
# never applied here -- confirm this is intended.
reg = LinearRegression()
reg.fit(x_train_scaled, y_train)
y_hat_train = reg.predict(x_train_scaled)
eva = evaluate(y_train, y_hat_train, 'Train')

# Log plain Python floats. The np.float alias was deprecated in NumPy 1.20 and
# removed in 1.24; it was only ever an alias for the builtin float.
run.log('RMSE', float(eva[0]))
run.log('R2', float(eva[1]))

# Save the trained model locally and upload it to the run's outputs folder
model_file = 'linear_reg.pkl'
joblib.dump(value=reg, filename=model_file)
run.upload_file(name = 'outputs/' + model_file, path_or_stream = './' + model_file)

# Complete the run
run.complete()

# Register the model, attaching the logged metrics as properties
# (metrics are fetched once instead of once per property)
metrics = run.get_metrics()
run.register_model(model_path='outputs/linear_reg.pkl', model_name='linear_reg',
                   properties={'RMSE': metrics['RMSE'], 'R2': metrics['R2']})

print('Model trained and registered.')

Starting experiment: linear_reg_exp
Loading Data...
Train set RMSE:39.57671741181994, R2:0.669857201860868
Model trained and registered.
