# In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import sqlite3

# Load the `stellarators` table from the SQLite database into a DataFrame.
conn = sqlite3.connect('../../data/nfp2/nfp2.db')  # Adjust the path to your database file
try:
    # Query the database and load the result into a pandas DataFrame
    query = "SELECT * FROM stellarators"  # Adjust your query as needed
    data_df = pd.read_sql_query(query, conn)
finally:
    # Always release the database handle, even if the query fails;
    # the data lives in `data_df` from here on.
    conn.close()

# Keep only rows flagged with convergence == 1 that have a quasisymmetry value.
data_df_clean = data_df[data_df['convergence'] == 1]
data_df_clean = data_df_clean.dropna(subset=['quasisymmetry'])
# The target below is log(quasisymmetry): non-positive values would turn into
# -inf / NaN targets, so drop them before taking the logarithm.
data_df_clean = data_df_clean[data_df_clean['quasisymmetry'] > 0]


# Input features: the eight rbc_* / zbs_* columns.
X = data_df_clean[['rbc_1_0', 'rbc_m1_1', 'rbc_0_1', 'rbc_1_1','zbs_1_0', 'zbs_m1_1', 'zbs_0_1', 'zbs_1_1']]
# Regression target: natural log of the quasisymmetry column.
Y = np.log(data_df_clean['quasisymmetry'])


# Report how many missing values remain, first per feature column and then
# for the target series.
for nan_header, nan_counts in (
    ("NaN counts in input features:", X.isna().sum()),
    ("NaN counts in target variable:", Y.isna().sum()),
):
    print(nan_header)
    print(nan_counts)

# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical across runs.
(
    features_no_outliers,
    test_features_no_outliers,
    target_no_outliers,
    test_target_no_outliers,
) = train_test_split(X, Y, test_size=0.2, random_state=42)

#print('Best trial:', study.best_trial)
#print('Best value:', study.best_value)
#print('Best parameters:', study.best_params)

import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


# Assuming study.best_params already includes the best hyperparameters from your Optuna study for a regression problem
#model = lgb.LGBMRegressor(**study.best_params)

# Hand-entered LightGBM hyperparameters, passed as keyword arguments to the
# regressor constructor further down.
best_params_manual = dict(
    boosting_type="dart",
    max_depth=35,
    num_leaves=449,
    min_data_in_leaf=100,
    feature_fraction=0.9824756723113014,
    learning_rate=0.24479713555351562,
    num_iterations=2917,
    data_sample_strategy="bagging",
    max_bins=1236,
)

# Build the LightGBM regressor from the hand-entered parameters...
model = lgb.LGBMRegressor(**best_params_manual)

# ...and fit it on the training portion of the split.
model.fit(X=features_no_outliers, y=target_no_outliers)

# Predict the target for the held-out feature rows.
predictions = model.predict(test_features_no_outliers)

# Score the predictions against the held-out targets with each metric in turn.
mse, mae, r2 = (
    metric(test_target_no_outliers, predictions)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# One line per metric, in the order MSE, MAE, R^2.
print(
    f"Test MSE: {mse}",
    f"Test MAE: {mae}",
    f"Test R^2: {r2}",
    sep="\n",
)

# Two frames with the same layout: a value column and a "Type" tag column.
predicted_values = predictions.flatten()  # Flatten in case the predictions are in a 2D array
df_predictions = pd.DataFrame({"Predicted": predicted_values, "Type": "Predicted"})

# The actual values are repeated len(predictions) // len(targets) times.
# Note that they are stored under the "Predicted" column name as well, so
# only the "Type" column tells the two frames apart.
repeat_count = len(predictions) // len(test_target_no_outliers)
actual_values = np.tile(test_target_no_outliers, repeat_count)
df_actual = pd.DataFrame({"Predicted": actual_values, "Type": "Actual"})

import matplotlib.pyplot as plt
import seaborn as sns

# Print every negative prediction next to the actual value at the same position.
# NOTE(review): the target is log-transformed when it is built, so negative
# values are legitimate outputs — confirm this check is still wanted.
held_out_values = test_target_no_outliers.to_numpy()
for predicted_value, actual_value in zip(predictions, held_out_values):
    if predicted_value < 0:
        print(f"Predicted: {predicted_value}, Actual: {actual_value}")

# Combine and plot
#df_combined = pd.concat([df_predictions, df_actual])
# Show the shapes of the predictions and the held-out targets.
for shaped in (predictions, test_target_no_outliers):
    print(shaped.shape)

# Overlay the kernel density estimates of predicted and actual values on one
# figure so the two distributions can be compared by eye.
plt.figure(figsize=(10, 6))
kde_style = dict(fill=True, alpha=0.5)
sns.kdeplot(predictions, color="blue", label="Predicted", **kde_style)
sns.kdeplot(test_target_no_outliers, color="orange", label="Actual", **kde_style)
plt.title('Density Plot of Predicted Outputs vs Actual Values')
plt.xlabel('Values')
plt.ylabel('Density')
plt.legend()
plt.show()

# Plot the importance of up to ten features of the fitted model; the axes
# returned by LightGBM are titled directly.
importance_ax = lgb.plot_importance(model, max_num_features=10)
importance_ax.set_title('Feature Importance')
plt.show()

