In [None]:
from collections import namedtuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Read the fillrandom benchmark results into a DataFrame.
# NOTE(review): absolute Colab-style path; the file must exist at this location.
csv_path = '/content/rocksdb_performance_data_fillrandom.csv'
data = pd.read_csv(csv_path)


In [None]:

# Target columns: the two performance metrics modelled in this notebook.
target_ops, target_micro = 'ops_per_sec', 'micros_per_op'

# Predictor columns (tuning parameters). The order here becomes the column order
# of the feature matrix and of the importance tables, so keep it stable.
features = [
    'max_background_compactions', 'max_background_flushes',
    'write_buffer_size', 'max_write_buffer_number', 'min_write_buffer_number_to_merge',
    'max_bytes_for_level_multiplier', 'block_size',
    'level0_file_num_compaction_trigger', 'level0_slowdown_writes_trigger',
    'level0_stop_writes_trigger',
    'target_file_size_multiplier', 'target_file_size_base',
]



In [None]:
# Split the data into training and testing sets.
X = data[features]
y_ops = data[target_ops]
y_micro = data[target_micro]

# A single call splits the features and both targets with the same shuffle, so
# X_train / X_test are guaranteed to line up row-for-row with both target
# vectors. (Two separate calls only stay aligned while their seed and test_size
# happen to match, and the second call silently overwrites X_train / X_test.)
(
    X_train, X_test,
    y_train_ops, y_test_ops,
    y_train_micro, y_test_micro,
) = train_test_split(X, y_ops, y_micro, test_size=0.2, random_state=42)



In [None]:
# Fit one random forest per target on the shared training features, then report
# the mean squared error of each model on the held-out test split.

# ops_per_sec model
rf_ops = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train_ops)
y_pred_ops = rf_ops.predict(X_test)
mse_ops = mean_squared_error(y_test_ops, y_pred_ops)
print(f'Random Forest MSE for ops_per_sec: {mse_ops}')

# micros_per_op model
rf_micro = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train_micro)
y_pred_micro = rf_micro.predict(X_test)
mse_micro = mean_squared_error(y_test_micro, y_pred_micro)
print(f'Random Forest MSE for micros_per_op: {mse_micro}')



In [None]:
# Importance scores exposed by each fitted forest, one value per feature column.
importance_ops = rf_ops.feature_importances_
importance_micro = rf_micro.feature_importances_

# Pair every feature name with its score and rank from most to least important.
feature_importance_ops = (
    pd.DataFrame({'Feature': features, 'Importance': importance_ops})
    .sort_values(by='Importance', ascending=False)
)
feature_importance_micro = (
    pd.DataFrame({'Feature': features, 'Importance': importance_micro})
    .sort_values(by='Importance', ascending=False)
)

# Show the ops_per_sec ranking first, then the micros_per_op ranking.
for ranking in (feature_importance_ops, feature_importance_micro):
    print(ranking)



In [None]:
# Visualization
# Horizontal bar chart of the ranked feature importances for ops_per_sec.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance_ops, x='Importance', y='Feature', ax=ax)
ax.set_title('Feature Importance for ops_per_sec')
plt.show()




In [None]:
# Horizontal bar chart of the ranked feature importances for micros_per_op.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance_micro, x='Importance', y='Feature', ax=ax)
ax.set_title('Feature Importance for micros_per_op')
plt.show()



In [None]:
# Scatter grid: one row per target metric, one column per feature, to eyeball
# how each parameter relates to each target.
sns.pairplot(
    data=data,
    x_vars=features,
    y_vars=[target_ops, target_micro],
)
plt.show()


ANOVA


In [None]:
# Perform a one-way ANOVA for each parameter to see if it has an impact on
# ops_per_sec: every distinct value of the parameter forms one group of
# ops_per_sec observations, and the groups are compared with stats.f_oneway.

# Placeholder exposing the same .statistic / .pvalue attributes that the printing
# loop below (and any later consumer of anova_results) reads from SciPy's result.
AnovaResult = namedtuple('AnovaResult', ['statistic', 'pvalue'])

anova_results = {}
for feature in features:
    groups = [values.to_numpy() for _, values in data.groupby(feature)[target_ops]]
    if len(groups) < 2:
        # f_oneway raises TypeError when given fewer than two groups, which
        # happens when the parameter never varies in the data. Record NaN
        # instead of aborting the analysis of the remaining parameters.
        anova_results[feature] = AnovaResult(statistic=np.nan, pvalue=np.nan)
        continue
    anova_results[feature] = stats.f_oneway(*groups)

# Print ANOVA results
for feature, result in anova_results.items():
    print(f'ANOVA results for {feature}: F-statistic = {result.statistic}, p-value = {result.pvalue}')




In [None]:
# Visualization of ANOVA results (p-values)
# Collect one record per feature, then order the table by ascending p-value.
anova_records = [
    {
        'Feature': name,
        'F-statistic': anova_results[name].statistic,
        'p-value': anova_results[name].pvalue,
    }
    for name in features
]
anova_df = pd.DataFrame(
    anova_records, columns=['Feature', 'F-statistic', 'p-value']
).sort_values(by='p-value')

# Bar chart of p-values with a dashed red reference line at 0.05.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=anova_df, x='p-value', y='Feature', ax=ax)
ax.axvline(x=0.05, color='r', linestyle='--')
ax.set_title('ANOVA p-values for each feature (impact on ops_per_sec)')
ax.set_xlabel('p-value')
ax.set_ylabel('Feature')
plt.show()