In [1]:
import pandas as pd
import numpy as np
import random
from sklearn import tree, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scipy.stats import pearsonr, spearmanr




In [2]:
# def load_df():
#     label_encoder = preprocessing.LabelEncoder()

#     df = pd.read_csv('data/qsar-biodeg.csv')
#     df = df.dropna()
#     cat = df.select_dtypes(exclude=['number'])
#     for col in cat.columns:
#         df[col] = label_encoder.fit_transform(df[col])

#     df = df.to_numpy()
#     target = df[:, -1]- 1 # Target is the last attribute
#     data = df[:, :-1]   # Data is everything except the last attribute

#     dataset = {'target': target,
#                'data': data,
#                'info': 'https://www.kaggle.com/datasets/muhammetvarl/qsarbiodegradation?resource=download',
#                'date_access': '2023-10-29'}

#     return dataset

# def load_df():

#     label_encoder = preprocessing.LabelEncoder()

#     df = pd.read_csv('data/heart_disease_classification.csv')
#     df = df.dropna()

#     cat = df.select_dtypes(exclude=['number'])
#     for col in cat.columns:
#         df[col] = label_encoder.fit_transform(df[col])

#     df = df.to_numpy()
#     target = df[:, -1]
#     data = df[:, 1:-1]

#     dataset = {'target': target,
#                'data': data,
#                'info': 'https://www.kaggle.com/datasets/sumaiyatasmeem/heart-disease-classification-dataset/data',
#                'date_access': '2023-10-29'}

#     return dataset

def load_df(source=None):
    """Load the Breast Cancer Wisconsin (Diagnostic) data as a dict of arrays.

    Parameters
    ----------
    source : str or file-like, optional
        Anything accepted by ``pd.read_csv``. Defaults to the header-less
        ``wdbc.data`` file hosted in the MLBenchmarks repository.

    Returns
    -------
    dict
        ``'target'``: column 1 of the file (the diagnosis label, integer-encoded
        when it is non-numeric); ``'data'``: every column after the label;
        ``'info'`` / ``'date_access'``: provenance strings.

    Notes
    -----
    Per the UCI description of this dataset, column 0 is the sample ID,
    column 1 is the diagnosis and all remaining columns are features, which
    is why the ID column is excluded from ``'data'``.
    """
    if source is None:
        source = 'https://raw.githubusercontent.com/rcpsilva/MLBenchmarks/main/MLBenchmarks/datasets/Classification/breast+cancer+wisconsin+diagnostic/wdbc.data'

    df = pd.read_csv(source, header=None)
    df = df.dropna()

    # Integer-encode each non-numeric column with its own encoder so no fitted
    # state is shared between columns.
    for col in df.select_dtypes(exclude=['number']).columns:
        df[col] = preprocessing.LabelEncoder().fit_transform(df[col])

    values = df.to_numpy()
    target = values[:, 1]
    # Keep every column after the label. Slicing with `2:-1` would silently
    # discard the last feature, because the target is NOT the last column here.
    data = values[:, 2:]

    dataset = {'target': target,
               'data': data,
               'info': 'https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic',
               'date_access': '2023-10-19'}

    return dataset


In [3]:
def generate_mlp(num_layers=None, layer_sizes=None, input_dim=10):
    """Build and compile an untrained Keras MLP with a single linear output unit.

    Parameters
    ----------
    num_layers : int, optional
        Number of hidden layers. When omitted it is taken from ``layer_sizes``
        if that is given, otherwise drawn uniformly from 1..4.
    layer_sizes : sequence of int, optional
        Width of each hidden layer. When omitted, each width is drawn
        uniformly from 32..256.
    input_dim : int
        Number of input features expected by the first layer.

    Returns
    -------
    Sequential
        Model with ReLU hidden layers and a ``Dense(1)`` output, compiled with
        the Adam optimizer and MSE loss. Weights are left at their initial values.

    Raises
    ------
    ValueError
        If ``layer_sizes`` and ``num_layers`` disagree, or if no hidden layer
        is requested.
    """
    if layer_sizes is None:
        if num_layers is None:
            num_layers = np.random.randint(1, 5)
        layer_sizes = [np.random.randint(32, 257) for _ in range(num_layers)]
    else:
        layer_sizes = list(layer_sizes)
        # Infer the depth from the explicit sizes instead of drawing a random
        # depth that would almost always contradict them.
        if num_layers is None:
            num_layers = len(layer_sizes)

    if len(layer_sizes) != num_layers:
        raise ValueError("Length of layer_sizes must match num_layers")
    if num_layers < 1:
        # The first hidden layer carries the input dimension, so it must exist.
        raise ValueError("At least one hidden layer is required")

    model = Sequential()
    model.add(Dense(layer_sizes[0], input_dim=input_dim, activation='relu'))

    for size in layer_sizes[1:]:
        model.add(Dense(size, activation='relu'))

    model.add(Dense(1))  # Output layer

    model.compile(optimizer='adam', loss='mse')

    return model

In [4]:
def compute_metrics(model, X, y, metric):
    """Score how a single feature relates to the target under the chosen metric.

    Parameters
    ----------
    model : object
        Unused; kept so existing call sites remain valid.
    X : array-like
        Values of one feature. Any shape is accepted; it is flattened (or
        reshaped to a single column for ``'gini'``).
    y : array-like
        Target values, one per sample.
    metric : str
        One of:
        ``'gini'``     - unweighted mean impurity of the leaves of a depth-1
                         decision tree fitted on the feature;
        ``'fisher'``   - the p-value returned by ``pearsonr``
                         (NOTE(review): this is not a Fisher score; confirm intent);
        ``'f_ratio'``  - ``var(X) / var(y)``;
        ``'pearson'``  - Pearson correlation coefficient;
        ``'spearman'`` - Spearman rank correlation coefficient.

    Returns
    -------
    float
        The requested score.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the names above.
    """
    # Convert once so every branch accepts lists as well as ndarrays.
    feature = np.asarray(X)

    if metric == 'gini':
        stump = tree.DecisionTreeClassifier(max_depth=1).fit(feature.reshape(-1, 1), y)
        fitted = stump.tree_
        # Leaves are the nodes whose left and right child ids coincide.
        is_leaf = fitted.children_left == fitted.children_right
        return fitted.impurity[is_leaf].mean()

    if metric == 'f_ratio':
        return np.var(feature) / np.var(y)

    flat = feature.ravel()
    if metric == 'fisher':
        _, p_value = pearsonr(flat, y)
        return p_value
    if metric == 'pearson':
        corr, _ = pearsonr(flat, y)
        return corr
    if metric == 'spearman':
        corr, _ = spearmanr(flat, y)
        return corr

    # Fail loudly instead of returning None, which would only surface later
    # as an obscure error when the scores are correlated.
    raise ValueError(f"Unknown metric: {metric!r}")

In [5]:
import pandas as pd
import numpy as np
import random
from sklearn import tree, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scipy.stats import pearsonr, spearmanr

In [6]:
def fitness_function(df, target, metric='gini'):
    """Create one candidate feature from a random MLP and score it against the target.

    Parameters
    ----------
    df : dict
        Dataset whose ``'data'`` entry is a 2-D feature matrix.
    target : array-like
        Target values passed on to ``compute_metrics``.
    metric : str
        Metric name understood by ``compute_metrics``.

    Returns
    -------
    tuple
        ``(metric_score, new_feature)`` where ``new_feature`` is a list holding
        the single prediction array produced by the untrained network.
    """
    features = df['data']
    model = generate_mlp(input_dim=features.shape[1])

    # verbose=0: this runs thousands of times, so keep Keras' per-call
    # progress bar out of the notebook output.
    prediction = model.predict(features, verbose=0)

    # Kept as a one-element list because callers reshape it via np.array(...).
    new_feature = [prediction]

    metric_score = compute_metrics(model, np.array(new_feature).reshape(-1, 1), target, metric)
    return metric_score, new_feature

In [7]:
# Model evaluation
def evaluate_models(df, n_iterations=1, test_size=0.2, random_state=42, new=0):
    """Fit a fixed set of classifiers on one train/test split and report mean scores.

    Parameters
    ----------
    df : dict
        Dataset with the feature matrix under ``'data'`` and labels under ``'target'``.
    n_iterations : int
        Number of fit/score rounds whose results are averaged.
    test_size : float
        Fraction of samples held out by ``train_test_split``.
    random_state : int or None
        Seed handed to the split and to every estimator.
    new : int
        When 0, a bar chart of the mean accuracies is shown; any other value
        skips plotting.

    Returns
    -------
    tuple of dict
        ``(mean_accuracy_by_model, mean_weighted_f1_by_model)``, both keyed by
        model name in the order the models are declared below.

    Notes
    -----
    NOTE(review): the split and the estimators are created once, outside the
    loop, so with a fixed ``random_state`` every round presumably repeats the
    same fit and ``n_iterations > 1`` adds cost without adding information.
    """
    models = {
        'Decision Tree': DecisionTreeClassifier(random_state=random_state),
        'Neural Network': MLPClassifier(hidden_layer_sizes=(64, 64), activation='relu', solver='adam', max_iter=1000000, random_state=random_state),
        'Random Forest': RandomForestClassifier(random_state=random_state),
        'SVM': SVC(random_state=random_state),
        'Logistic Regression': LogisticRegression(max_iter=1000000, random_state=random_state),
        'XGBoost': xgb.XGBClassifier(random_state=random_state)
    }
    accuracy_runs = {name: [] for name in models}
    f1_runs = {name: [] for name in models}

    X_train, X_test, y_train, y_test = train_test_split(
        df['data'], df['target'], test_size=test_size, random_state=random_state)

    for _ in range(n_iterations):
        for name, estimator in models.items():
            estimator.fit(X_train, y_train)
            predictions = estimator.predict(X_test)
            accuracy_runs[name].append(accuracy_score(y_test, predictions))
            f1_runs[name].append(f1_score(y_test, predictions, average='weighted'))

    medias_acuracias = {name: np.mean(scores) for name, scores in accuracy_runs.items()}
    medias_f1_scores = {name: np.mean(scores) for name, scores in f1_runs.items()}

    if new == 0:
        sns.set(style="whitegrid")
        model_names = list(medias_acuracias.keys())
        accuracies = list(medias_acuracias.values())
        plt.figure(figsize=(10, 6))
        colors = sns.color_palette('pastel', len(model_names))
        bars = plt.bar(model_names, accuracies, color=colors)
        # Print each bar's value just above it.
        for bar, accuracy in zip(bars, accuracies):
            height = bar.get_height()
            plt.annotate(f'{accuracy:.2f}', xy=(bar.get_x() + bar.get_width() / 2, height), xytext=(0, 3), textcoords="offset points", ha='center', va='bottom')
        plt.xlabel('Modelo')
        plt.ylabel('Acurácia Média')
        # Report the real number of rounds instead of a hard-coded 1000.
        rounds_label = 'Iteração' if n_iterations == 1 else 'Iterações'
        plt.title(f'Acurácia Média de Validação dos Modelos após {n_iterations} {rounds_label}')
        plt.ylim(0, 1)
        plt.show()

    return medias_acuracias, medias_f1_scores

In [8]:
import os

# Metrics whose relationship with downstream model quality is measured.
# compute_metrics also understands 'gini' and 'fisher'; add them here to
# include them in the experiment.
metrics = ['f_ratio', 'pearson', 'spearman']

# Number of random-MLP features generated and evaluated per metric.
N_TRIALS = 10000

output_dir = 'resultados/wdbc'
os.makedirs(output_dir, exist_ok=True)

# Load the dataset once: load_df() reads a remote CSV, so calling it inside
# the trial loop would re-download the same file on every iteration.
df = load_df()

for metric in metrics:
    resultados = []
    f1_scores = []
    metric_scores = []

    for _ in range(N_TRIALS):
        metric_score, new_feature = fitness_function(df, df['target'], metric=metric)

        # Append the generated feature as one extra column; the base dataset
        # itself is never modified.
        aux = np.hstack([df['data'], np.array(new_feature).reshape(-1, 1)])
        df_new = dict(df)
        df_new['data'] = aux

        resultado, f1 = evaluate_models(df_new, new=1)
        # Column labels follow whatever models evaluate_models reports.
        model_names = list(resultado.keys())
        resultado = np.array(list(resultado.values()))
        f1 = np.array(list(f1.values()))
        resultados.append(resultado)
        f1_scores.append(f1)
        metric_scores.append(metric_score)

        # A new Keras model is built on every trial; release the global state
        # Keras keeps for them so memory does not grow across the run.
        tf.keras.backend.clear_session()

    metric_scores = np.array(metric_scores)
    resultados = np.array(resultados)
    f1_scores = np.array(f1_scores)

    df_metric_scores = pd.DataFrame({metric: metric_scores})
    df_resultados = pd.DataFrame(resultados, columns=model_names)
    df_f1_scores = pd.DataFrame(f1_scores, columns=[f'F1 {name}' for name in model_names])

    df_final = pd.concat([df_metric_scores, df_resultados], axis=1)
    correlations = []
    p_values = []

    # Correlate the feature-quality metric with each model's F1 across trials.
    for f1_col in df_f1_scores.columns:
        corr, p_val = pearsonr(df_final[metric], df_f1_scores[f1_col])
        correlations.append(corr)
        p_values.append(p_val)

    df_correlations = pd.DataFrame({
        'Model': df_f1_scores.columns,
        'Correlation': correlations,
        'P-Value': p_values
    })

    df_resultados.to_csv(os.path.join(output_dir, f'{metric}_resultados.csv'), index=False)
    df_f1_scores.to_csv(os.path.join(output_dir, f'{metric}_f1_scores.csv'), index=False)
    df_correlations.to_csv(os.path.join(output_dir, f'{metric}_correlations.csv'), index=False)














