In [1]:
import pandas as pd

# Read both training parquet files (same read order as before: "true" then "false").
df_t, df_f = (
    pd.read_parquet(path)
    for path in ("../model/train_true.parquet", "../model/train_false.parquet")
)

print(df_t.shape, df_f.shape)

# Stack the two frames row-wise and discard the "start"/"end" columns in one
# chained expression, so nothing is mutated in place.
df = pd.concat([df_t, df_f], axis=0).drop(columns=["start", "end"])

print(df.shape)


(5278, 15) (106191, 15)
(111469, 13)


In [2]:
# import standard scaler
from sklearn.preprocessing import StandardScaler
# Standardize every column except the identifier ("name") and the target ("y").
# The feature columns are selected by name once and reused for both the fit and
# the write-back, so the result no longer depends on "name" being the first
# column and "y" being the last one.
feature_cols = [col for col in df.columns if col not in ("name", "y")]

# NOTE(review): the scaler is fit on the full frame, before the
# train/validation/test split performed later in this notebook, so held-out
# rows contribute to the mean/variance statistics. Fitting on the training
# partition only would avoid that leakage.
scaler = StandardScaler()
df[feature_cols] = scaler.fit_transform(df[feature_cols])

# Save the transformed data to a new parquet file (the index is not written)
df.to_parquet("../model/train_scaled.parquet", index=False)
# Load the scaled data back; this also leaves df with a fresh default index
df = pd.read_parquet("../model/train_scaled.parquet")
# Print the shape of the dataframe
print(df.shape)

(111469, 13)


In [7]:
# split the data into train, test and validation
from sklearn.model_selection import train_test_split
# Features (everything except the identifier and the target) and target
X = df.drop(columns=["name", "y"])
y = df["y"]

# Hold out 20% of the rows as the test set. The two source files loaded at the
# top of the notebook differ greatly in size (5278 vs 106191 rows), so the
# split is stratified on y to keep the class ratio the same in every partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Carve a validation set out of the remaining rows (20% of the remainder,
# i.e. 16% of the full data), again stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Sanity check: the three partitions together cover every row exactly once.
assert len(X_train) + len(X_val) + len(X_test) == len(X)

# Print shapes of the dataframes
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape, y_test.shape)

(71340, 11) (71340,)
(17835, 11) (17835,)
(22294, 11) (22294,)


In [None]:
# Import auto-sklearn
import autosklearn.classification
import sklearn.metrics as metrics
import time

def _evaluate_split(model, label, X_part, y_part):
    """Score a fitted classifier on one data partition and print the metrics.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    label : capitalized partition name used in the printed lines
        (e.g. "Validation"); its lower-cased form is used in the heading.
    X_part, y_part : features and true labels of the partition.

    Returns
    -------
    tuple
        ``(y_pred, accuracy, f1, precision, recall, roc_auc)``.
    """
    print(f"\nEvaluating on {label.lower()} data:")
    y_pred = model.predict(X_part)
    accuracy = metrics.accuracy_score(y_part, y_pred)
    f1 = metrics.f1_score(y_part, y_pred)
    precision = metrics.precision_score(y_part, y_pred)
    recall = metrics.recall_score(y_part, y_pred)
    # ROC AUC is computed from the predicted probability of the second class column
    roc_auc = metrics.roc_auc_score(y_part, model.predict_proba(X_part)[:, 1])

    print(f"{label} Accuracy: {accuracy:.4f}")
    print(f"{label} F1 Score: {f1:.4f}")
    print(f"{label} Precision: {precision:.4f}")
    print(f"{label} Recall: {recall:.4f}")
    print(f"{label} ROC AUC: {roc_auc:.4f}")
    return y_pred, accuracy, f1, precision, recall, roc_auc


# Set up auto-sklearn classifier
# We'll limit the time for the search to 30 minutes and set ensemble size to 50
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=30*60,  # 30 minutes
    per_run_time_limit=3*60,  # 3 minutes per run
    # Passing `ensemble_size=` directly is deprecated in auto-sklearn 0.15;
    # the ensemble size is supplied through `ensemble_kwargs` instead.
    ensemble_kwargs={"ensemble_size": 50},
    memory_limit=10240,  # 10240 MB (~10GB)
    n_jobs=-1  # Use all available cores
)

# Fit the model to the training data
print("Training auto-sklearn ensemble...")
# perf_counter is monotonic, so the elapsed time is unaffected by clock adjustments
start_time = time.perf_counter()
automl.fit(X_train, y_train, dataset_name="fly_gene_prediction")
training_time = time.perf_counter() - start_time
print(f"Training completed in {training_time:.2f} seconds")

# Get statistics of the training process
print("Statistics:")
print(automl.sprint_statistics())

# Evaluate on validation data (result names kept so later cells can still read them)
(y_val_pred, val_accuracy, val_f1,
 val_precision, val_recall, val_roc_auc) = _evaluate_split(automl, "Validation", X_val, y_val)

# Evaluate on test data
(y_test_pred, test_accuracy, test_f1,
 test_precision, test_recall, test_roc_auc) = _evaluate_split(automl, "Test", X_test, y_test)

  automl = autosklearn.classification.AutoSklearnClassifier(


Training auto-sklearn ensemble...
	Models besides current dummy model: 0
	Dummy models: 1
	Models besides current dummy model: 0
	Dummy models: 1
	Models besides current dummy model: 0
	Dummy models: 1
	Models besides current dummy model: 0
	Dummy models: 1


In [None]:
# Show models in the ensemble: one line per (weight, model) pair
print("Models in the ensemble:")
for i, (weight, model) in enumerate(automl.get_models_with_weights()):
    print(f"Model {i}: Weight = {weight:.4f}, Type = {model.__class__.__name__}")

# show_models() returns a description of the ensemble's member models, not
# feature importances, so the output is labelled accordingly. The call stays
# best-effort: a failure is reported and the model is still saved below.
try:
    ensemble_details = automl.show_models()
    feature_importances = ensemble_details  # legacy name kept for any later cell that reads it
    print("\nEnsemble model details:")
    print(ensemble_details)
except Exception as e:
    print(f"Could not retrieve ensemble model details: {e}")

# Save the fitted auto-sklearn object.
# NOTE: only unpickle this file from a trusted location; loading a pickle
# executes arbitrary code.
import pickle
with open('../model/automl_ensemble.pkl', 'wb') as f:
    pickle.dump(automl, f)
print("\nModel saved to '../model/automl_ensemble.pkl'")