# Load Data

In [76]:
import time
import json
import pandas as pd
import os
time_1 = time.time()
with open("../test_resourses/training-db/train_data.json") as file_train:
    train_data = json.load(file_train)
    train_data = pd.DataFrame.from_dict(train_data)
    
time_load_train = time.time() - time_1
print("The time to load the train data is equal to: {} seconds".format(time_load_train))

time_2 = time.time()
with open("../test_resourses/training-db/predict_data.json") as file_predict:
    predict_data = json.load(file_predict)
    predict_data = pd.DataFrame.from_dict(predict_data)
time_load_predict = time.time() - time_2
print("The time to load the predict data is equal to: {} seconds".format(time_load_predict))
print("The time to load all the data is equal to: {} seconds".format(time_load_train + time_load_predict))   

The time to load the train data is equal to: 0.6189999580383301 seconds
The time to load the predict data is equal to: 0.37000370025634766 seconds
The time to load all the data is equal to: 0.9890036582946777 seconds


# Train Model

In [63]:
from sklearn import ensemble
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
import pickle

def train(dataset): 
    X = dataset[dataset.columns[3:]]
    Y = dataset[dataset.columns[2]]
    
    #Clustering the data
    X_cluster = X[['setting1', 'setting2', 'setting3']]

    #creates the clusters
    kmeans = KMeans(n_clusters=3).fit(X_cluster)
    X['settings_clusters'] = kmeans.predict(X_cluster)
    
    features = dataset.columns[3:]
    for feature in features:
        #Creating min, max and delta variables
        X['max_' + feature] = dataset.groupby('engine_id')[feature].cummax()
        X['min_' + feature] = dataset.groupby('engine_id')[feature].cummin()

        X['delta_' + feature] = dataset.groupby('engine_id')[feature].diff()
        X['delta_' + feature].fillna(0, inplace=True)
    
    ###Used parameters
    params = {'n_estimators': 400, 'max_depth': 5, 'learning_rate': 0.25}
    
    model = ensemble.GradientBoostingRegressor(**params)
    model.fit(X, Y)
    ###Save the Model
    model_repo = os.getcwd()
    if model_repo:
        file_path = os.path.join(model_repo, "model.pkl")
        pickle.dump(model, open(file_path, 'wb'))
    else:
        pickle.dump(model, open('model.pkl', 'wb'))
    
    print("R2: {}".format(model.score(X,Y)))
    
time_start = time.time()
train(train_data)
print("The total time elapsed to train the model is: {} seconds".format(time.time() - time_start))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['settings_clusters'] = kmeans.predict(X_cluster)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['max_' + feature] = dataset.groupby('engine_id')[feature].cummax()


R2: 0.9986819401295985
The total time elapsed to train the model is: 170.2638931274414 seconds


# Predict RUL

In [77]:
import copy
def feature_engineer(dataset):
    X = dataset[dataset.columns[2:]]
    engine = dataset[dataset.columns[0]].to_list()
    cycle = dataset[dataset.columns[1]].to_list()
    
    #Clustering the data
    X_cluster = X[['setting1', 'setting2', 'setting3']]

    #creates the clusters
    kmeans = KMeans(n_clusters=3).fit(X_cluster)
    X['settings_clusters'] = kmeans.predict(X_cluster)
    
    features = dataset.columns[2:]
    for feature in features:
        #Creating min, max and delta variables
        X['max_' + feature] = dataset.groupby('engine_id')[feature].cummax()
        X['min_' + feature] = dataset.groupby('engine_id')[feature].cummin()

        X['delta_' + feature] = dataset.groupby('engine_id')[feature].diff()
        X['delta_' + feature].fillna(0, inplace=True)

    X.insert(0, 'engine_id', engine)
    X.insert(1, 'cycle', cycle)
    return X

# make prediction
def predict(dataset):
    model_repo = os.getcwd()
    file_path = os.path.join(model_repo, "model.pkl")
    model = pickle.load(open(file_path, 'rb'))
    val_set2 = dataset.copy()
    features = dataset[dataset.columns[2:]]
    result = model.predict(features)
    val_set2['RUL'] = result.tolist()
    dic = val_set2.to_dict(orient='records')
    print("Some examples of the RUL values are: {}".format(result[:5]))

time_start = time.time()
predict_data = feature_engineer(predict_data)
predict(predict_data)
print("The total time elapsed is {} seconds".format(time.time() - time_start))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['settings_clusters'] = kmeans.predict(X_cluster)


Some examples of the RUL values are: [228.21881147 199.40067616 195.5864229  193.72508742 189.15052219]
The total time elapsed is 2.7740068435668945 seconds
