In [19]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.impute import KNNImputer

In [20]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [21]:
df = pd.read_csv("../Data_Preprocessing/df_nnz_Vmax_clean.csv")

In [22]:
df

Unnamed: 0,X,IR,pot2,ph,temp,dstr,cryst,lgCmin,lgCmax,lgCconst,...,EState_VSA6_log,EState_VSA4_log,SMR_VSA7_log,Complexity1_log,TPSA_log,TPSA1_log,TPSA2_log,MaxEStateIndex.1_log,MaxEStateIndex.2_log,Vmaxlog
0,2.757,1.071,-0.05,4.00,37.0,3,7.0,0.176091,0.176091,2.000000,...,0.500000,2.552693,2.497497,5.420535,3.618993,3.951244,3.701302,1.790549,1.791759,0.138713
1,2.757,1.071,-0.05,4.00,37.0,3,7.0,0.176091,0.176091,2.000000,...,0.500000,2.552693,2.497497,5.420535,3.618993,3.951244,3.701302,1.790549,1.791759,-0.577903
2,2.757,1.071,-0.05,4.00,37.0,3,7.0,-0.301030,1.397940,2.000000,...,0.500000,2.552693,2.497497,0.000000,3.618993,3.701302,3.951244,1.791759,1.790549,0.922362
3,2.757,1.071,-0.05,4.00,37.0,3,7.0,-0.301030,1.397940,2.000000,...,0.500000,2.552693,2.497497,0.000000,3.618993,3.701302,3.951244,1.791759,1.790549,-0.358328
4,2.757,1.071,-0.05,4.00,37.0,3,7.0,-0.698970,2.000000,1.176091,...,0.500000,2.552693,2.497497,5.420535,3.618993,3.951244,3.701302,1.790549,1.791759,0.011655
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1054,2.280,1.350,1.20,7.00,41.4,3,6.8,-1.221849,-0.619789,0.000000,...,0.500000,0.000000,0.000000,5.420535,4.883786,3.951244,3.823847,1.790549,0.000000,-5.821023
1055,2.280,1.350,1.20,3.00,33.0,1,6.8,-2.000000,-0.795880,0.000000,...,0.960420,3.402360,4.785979,5.420535,5.928924,3.951244,3.701302,1.790549,0.000000,-3.598599
1056,2.280,1.350,1.20,7.00,33.0,1,6.8,-1.221849,-0.619789,0.000000,...,0.960420,3.402360,4.785979,5.420535,5.928924,3.951244,3.701302,1.790549,0.000000,-3.752027
1057,1.937,0.853,0.78,4.00,42.4,1,6.8,-2.000000,-0.221849,0.000000,...,0.500000,0.000000,0.000000,5.420535,3.401197,3.951244,3.823847,1.790549,0.000000,-4.389340


In [23]:
features = df.drop('Vmaxlog', axis=1)
labels = df['Vmaxlog']

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.4, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [24]:
for data in [y_train, y_val, y_test]:
  print(round(len(data) / len(labels), 2))

0.6
0.2
0.2


In [25]:
# Save the data
X_train.to_csv('../Data_ML/train_features.csv', index=False)
X_val.to_csv('../Data_ML/val_features.csv', index=False)
X_test.to_csv('../Data_ML/test_features.csv', index=False)

y_train.to_csv('../Data_ML/train_labels.csv', index=False)
y_val.to_csv('../Data_ML/val_labels.csv', index=False)
y_test.to_csv('../Data_ML/test_labels.csv', index=False)

Cross Validation

In [26]:
train_features = pd.read_csv('../Data_ML/train_features.csv')
train_labels = pd.read_csv('../Data_ML/train_labels.csv')

val_features = pd.read_csv('../Data_ML/val_features.csv')
val_labels = pd.read_csv('../Data_ML/val_labels.csv')

test_features = pd.read_csv('../Data_ML/test_features.csv')
test_labels = pd.read_csv('../Data_ML/test_labels.csv')

In [27]:
rf = RandomForestRegressor()

scores = cross_val_score(rf, train_features, train_labels.values.ravel(), cv=5)

In [28]:
scores

array([0.26667703, 0.18105263, 0.34879502, 0.41394938, 0.40041581])

Hyperparameter Tuning

In [29]:
rf = RandomForestRegressor()

hyperparams = {
    'n_estimators': [5, 25, 50, 100],
    'max_depth': [2, 12, 24, None]
}

cross_val = GridSearchCV(rf, hyperparams, cv=5)
cross_val.fit(train_features, train_labels.values.ravel())

In [30]:
def results(results):
  print('Optimal Hyperparams: {}\n'.format(results.best_params_))
  means = results.cv_results_['mean_test_score']
  stds = results.cv_results_['std_test_score']

  for mean, std, params in zip(means, stds, results.cv_results_['params']):
    print('Mean {} Standard Deviation {} Hyperparameters {}'.format(round(mean,3), round(std * 2, 3), params))

In [31]:
results(cross_val)

Optimal Hyperparams: {'max_depth': 24, 'n_estimators': 100}

Mean 0.109 Standard Deviation 0.123 Hyperparameters {'max_depth': 2, 'n_estimators': 5}
Mean 0.121 Standard Deviation 0.073 Hyperparameters {'max_depth': 2, 'n_estimators': 25}
Mean 0.129 Standard Deviation 0.085 Hyperparameters {'max_depth': 2, 'n_estimators': 50}
Mean 0.121 Standard Deviation 0.086 Hyperparameters {'max_depth': 2, 'n_estimators': 100}
Mean 0.081 Standard Deviation 0.295 Hyperparameters {'max_depth': 12, 'n_estimators': 5}
Mean 0.276 Standard Deviation 0.248 Hyperparameters {'max_depth': 12, 'n_estimators': 25}
Mean 0.289 Standard Deviation 0.191 Hyperparameters {'max_depth': 12, 'n_estimators': 50}
Mean 0.312 Standard Deviation 0.172 Hyperparameters {'max_depth': 12, 'n_estimators': 100}
Mean 0.178 Standard Deviation 0.271 Hyperparameters {'max_depth': 24, 'n_estimators': 5}
Mean 0.294 Standard Deviation 0.223 Hyperparameters {'max_depth': 24, 'n_estimators': 25}
Mean 0.289 Standard Deviation 0.161 Hyperpar

In [32]:
rf1 = RandomForestRegressor(n_estimators=100, max_depth=24)
rf1.fit(train_features, train_labels.values.ravel())

rf2 = RandomForestRegressor(n_estimators=100, max_depth=None)
rf2.fit(train_features, train_labels.values.ravel())

rf3 = RandomForestRegressor(n_estimators=50, max_depth=None)
rf3.fit(train_features, train_labels.values.ravel())

Model Evaluation

In [33]:
for mdl in [rf1, rf2, rf3]:
    y_pred = mdl.predict(val_features)
    mse = mean_squared_error(val_labels, y_pred)
    r2 = r2_score(val_labels, y_pred)
    mae = mean_absolute_error(val_labels, y_pred)
    rmse = np.sqrt(mse)  # RMSE is the square root of MSE
    
    print('Max Depth: {} || Estimators: {} || MSE: {:.4f} || R-squared: {:.4f} || MAE: {:.4f} || RMSE: {:.4f}'.format(
        mdl.max_depth, mdl.n_estimators, mse, r2, mae, rmse))

Max Depth: 24 || Estimators: 100 || MSE: 1.0962 || R-squared: 0.5657 || MAE: 0.6593 || RMSE: 1.0470
Max Depth: None || Estimators: 100 || MSE: 1.1105 || R-squared: 0.5600 || MAE: 0.6592 || RMSE: 1.0538
Max Depth: None || Estimators: 50 || MSE: 1.1559 || R-squared: 0.5421 || MAE: 0.6670 || RMSE: 1.0751


In [34]:
y_pred = mdl.predict(val_features)
mse = mean_squared_error(test_labels, y_pred)
r2 = r2_score(test_labels, y_pred)
mae = mean_absolute_error(test_labels, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of MSE
    
print('Max Depth: {} || Estimators: {} || MSE: {:.4f} || R-squared: {:.4f} || MAE: {:.4f} || RMSE: {:.4f}'.format(
        rf1.max_depth, rf1.n_estimators, mse, r2, mae, rmse))

Max Depth: 24 || Estimators: 100 || MSE: 3.2967 || R-squared: -0.5292 || MAE: 1.1653 || RMSE: 1.8157


In [35]:
df.describe()

Unnamed: 0,X,IR,pot2,ph,temp,dstr,cryst,lgCmin,lgCmax,lgCconst,...,EState_VSA6_log,EState_VSA4_log,SMR_VSA7_log,Complexity1_log,TPSA_log,TPSA1_log,TPSA2_log,MaxEStateIndex.1_log,MaxEStateIndex.2_log,Vmaxlog
count,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,...,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0
mean,2.527929,1.206676,0.292785,4.537616,33.278942,2.713881,6.036449,-1.250548,0.568012,0.485323,...,0.597158,0.951568,0.963771,2.958695,2.406137,3.927422,3.836257,1.793631,1.51495,-3.744783
std,0.268633,0.179893,0.580628,1.218244,9.4321,0.642174,1.604034,1.413556,1.204909,1.215957,...,0.176733,1.238118,1.380037,2.688157,2.25732,0.475094,0.218885,0.252684,0.668308,1.400119
min,1.83,0.7,-1.358,2.0,15.0,1.0,0.0,-4.0,-4.0,-2.30103,...,0.5,0.0,0.0,0.0,0.0,3.701302,3.701302,0.0,0.0,-7.468521
25%,2.28,1.083,-0.0776,4.0,25.0,3.0,6.0,-2.30103,-0.221849,-0.143004,...,0.5,0.0,0.0,0.0,0.0,3.701302,3.701302,1.790549,1.790549,-4.49485
50%,2.579,1.224,0.0692,4.0,31.2,3.0,7.0,-1.30103,0.20412,0.0,...,0.5,0.0,0.0,5.420535,3.011113,3.951244,3.701302,1.790549,1.790549,-3.973875
75%,2.75,1.32,0.7,4.5,40.0,3.0,7.0,-0.30103,1.39794,1.176091,...,0.5,2.562298,2.547795,5.420535,4.674883,3.951244,3.951244,1.791759,1.791759,-3.384576
max,3.17,1.71,1.69,10.0,90.0,3.0,7.0,2.944483,3.60206,3.875061,...,0.991476,4.433535,5.175934,6.190315,7.051345,6.783325,6.783325,2.496743,2.52104,4.777318


Определила верхний и нижний порог для выбросов (например, с использованием межквартильного размаха)

In [36]:
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

Подсчет выбросов для каждого столбца

In [37]:
outliers = ((df < lower_bound) | (df > upper_bound)).sum()
outliers = outliers.sort_values(ascending=False)