# Imports

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor

import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.models as M
import tensorflow.keras.layers as L
import tensorflow.keras.backend as K
import tensorflow.keras.optimizers as O

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau

print('Версия TensoFlow =', tf.__version__)
print('Версия Keras =', keras.__version__)

Версия TensoFlow = 2.10.0
Версия Keras = 2.10.0


# Загрузка данных

In [2]:
df = pd.read_csv('..\data\ebw_data.csv')

In [3]:
df.head(3)

Unnamed: 0,IW,IF,VW,FP,Depth,Width
0,47,139,4.5,80,1.6,2.54
1,47,139,4.5,80,1.62,2.5
2,47,139,4.5,80,1.68,2.6


In [4]:
y_depth = df['Depth']
y_width = df['Width']

In [5]:
X = df.drop(['Depth', 'Width'], axis=1)

In [6]:
X.head(3)

Unnamed: 0,IW,IF,VW,FP
0,47,139,4.5,80
1,47,139,4.5,80
2,47,139,4.5,80


# EDA

In [7]:
#TODO: 

# Modeling

In [8]:
X_train, X_test, y_d_train, y_d_test = train_test_split(X, y_depth, test_size=0.2, random_state=42)

In [9]:
X_train, X_test, y_w_train, y_w_test = train_test_split(X, y_width, test_size=0.2, random_state=42)

In [10]:
def grid_optimizer(model, X, y, param_grids):
    gs = GridSearchCV(model, 
                      param_grid=param_grids, 
                      cv=5, 
                      scoring='neg_mean_squared_error')
    gs.fit(X, y)
    print(f"Best MSE-score: {gs.best_score_}")
    print(f"Best params: {gs.best_params_}")
    return gs.best_estimator_

In [11]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
scaler.scale_, scaler.mean_, scaler.var_


(array([ 1.72448198,  5.27664937,  1.88255506, 22.33003242]),
 array([ 45.71929825, 141.26315789,   8.88596491,  79.03508772]),
 array([  2.9738381 ,  27.84302862,   3.54401354, 498.6303478 ]))

## 1. RidgeRegression

### a) Model for depth prediction

In [13]:
r_d = Ridge(random_state=42)

In [14]:
param_grid = {
    'alpha': [0.2, 0.5, 0.8, 1],
    'solver': ['auto', 'lbfgs', 'saga'],
}

In [15]:
best_r_d = grid_optimizer(r_d, X_train, y_d_train, param_grid)

Best MSE-score: -0.008533380130294022
Best params: {'alpha': 0.2, 'solver': 'auto'}


In [16]:
y_d_pred = best_r_d.predict(X_test)
print(f'RidgeRegression MSE-score: {mean_squared_error(y_d_test, y_d_pred)}')

RidgeRegression MSE-score: 0.021683206270686435


### b) Model for width prediction

In [17]:
r_w = Ridge(random_state=42)

In [18]:
param_grid = {
    'alpha': [0.2, 0.5, 0.8, 1],
    'solver': ['auto', 'lbfgs', 'saga'],
}

In [19]:
best_r_w = grid_optimizer(r_w, X_train, y_w_train, param_grid)

Best MSE-score: -0.005691052336710652
Best params: {'alpha': 1, 'solver': 'saga'}


In [20]:
y_w_pred = best_r_w.predict(X_test)
print(f'RidgeRegression MSE-score: {mean_squared_error(y_w_test, y_w_pred)}')

RidgeRegression MSE-score: 0.0025514481824656337


## 2. RandomForestRegression

### a) Model for depth prediction

In [21]:
rfr_d = RandomForestRegressor(random_state=42, n_jobs=-1)

In [22]:
param_grid = {
    'n_estimators': [20, 100, 250, 1000],
    'max_depth': [5, 8, 10, 15],
    'max_features': ['sqrt', 4],
}

In [23]:
best_rfr_d = grid_optimizer(rfr_d, X_train, y_d_train, param_grid)

Best MSE-score: -0.005329916784222797
Best params: {'max_depth': 5, 'max_features': 4, 'n_estimators': 100}


In [24]:
y_d_pred = best_rfr_d.predict(X_test)
print(f'RandomForestRegression MSE-score: {mean_squared_error(y_d_test, y_d_pred)}')

RandomForestRegression MSE-score: 0.00801286334778967


### b) Model for width prediction

In [25]:
rfr_w = RandomForestRegressor(random_state=42, n_jobs=-1)

In [26]:
param_grid = {
    'n_estimators': [20, 100, 250, 1000],
    'max_depth': [5, 8, 10, 15],
    'max_features': ['sqrt', 4],
}

In [27]:
best_rfr_w = grid_optimizer(rfr_w, X_train, y_w_train, param_grid)

Best MSE-score: -0.004090362164723459
Best params: {'max_depth': 5, 'max_features': 4, 'n_estimators': 100}


In [28]:
y_w_pred = best_rfr_w.predict(X_test)
print(f'RandomForestRegression MSE-score: {mean_squared_error(y_w_test, y_w_pred)}')

RandomForestRegression MSE-score: 0.002249071408546027


## 3. Neural Network

In [29]:
y_train = pd.concat([y_d_train, y_w_train], axis=1)
y_test = pd.concat([y_d_test, y_w_test], axis=1)

In [30]:
y_d_train.iloc[22], y_w_train.iloc[22]

(1.58, 2.52)

In [31]:
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [32]:
y_train[22]

array([1.58, 2.52])

In [33]:
K.clear_session()

In [34]:
nn_model = Sequential()
nn_model.add(Dense(6, activation='elu', input_shape=(4,)))
nn_model.add(Dense(10, activation='elu'))
nn_model.add(Dense(10, activation='elu'))
nn_model.add(Dense(6, activation='elu'))
nn_model.add(Dense(4, activation='elu'))
nn_model.add(Dense(2))
nn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 6)                 30        
                                                                 
 dense_1 (Dense)             (None, 10)                70        
                                                                 
 dense_2 (Dense)             (None, 10)                110       
                                                                 
 dense_3 (Dense)             (None, 6)                 66        
                                                                 
 dense_4 (Dense)             (None, 4)                 28        
                                                                 
 dense_5 (Dense)             (None, 2)                 10        
                                                                 
Total params: 314
Trainable params: 314
Non-trainable pa

In [35]:
BATCH_SIZE = 4
EPOCHS = 100

nn_model.compile(
  loss='mse', 
  optimizer=O.Adam(learning_rate=0.01),
  metrics=['mean_squared_error']
)


In [36]:
checkpoint = ModelCheckpoint('..\model\best_model.hdf5' , monitor = ['val_mean_squared_error'] , verbose = 1  , mode = 'min')
earlystop = EarlyStopping(monitor='val_mean_squared_error', patience=10, restore_best_weights=True)
callback_reduce_lr = ReduceLROnPlateau(monitor='val_mean_squared_error', factor=0.5, min_lr=1e-6, patience=3, verbose=1)
callbacks_list = [checkpoint, earlystop, callback_reduce_lr]

In [37]:
history = nn_model.fit(
  X_train, y_train,  # нормализованные данные
  batch_size=BATCH_SIZE,
  epochs=EPOCHS,
  validation_data=(X_test, y_test),
  shuffle=False,
  callbacks=callbacks_list
)


Epoch 1/100
 1/15 [=>............................] - ETA: 10s - loss: 2.0993 - mean_squared_error: 2.0993
Epoch 1: saving model to ..\modeest_model.hdf5


OSError: [Errno 22] Unable to create file (unable to open file: name = '..\modelest_model.hdf5', errno = 22, error message = 'Invalid argument', flags = 13, o_flags = 302)

# Сравнение моделей

In [None]:
# True value

In [None]:
y_test

In [None]:
rfr_pred = np.array((y_d_pred, y_w_pred)).reshape(-1, 2)
rfr_pred.round(decimals=2)

In [None]:
nn_pred = nn_model.predict(X_test)
nn_pred.round(decimals=2)