In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import sweetviz as sv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import Callback

class DataLoader:
    def __init__(self, file_path):
        self.file_path = file_path

    def load_data(self):
        df = pd.read_csv(self.file_path)
        return df

class DataPreprocessor:
    def __init__(self, df):
        self.df = df

    def nan_values(self):
        string_cols = self.df.select_dtypes(include=['object', 'bool']).columns.tolist()
        for col in string_cols:
            self.df[col].fillna('', inplace=True)
        return self.df

    def split_data(self, target_column):
        X = self.df.drop(columns=[target_column])
        y = self.df[target_column].astype(int)
        return X, y

    def cols_type(self, X):
        num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
        cat_cols = X.select_dtypes(include=['object', 'bool']).columns.tolist()
        text_cols = []

        def remove_text_cols(cat_cols, text_cols):
            cat_cols = [col for col in cat_cols if col not in text_cols]
            return cat_cols

        cat_cols = remove_text_cols(cat_cols, text_cols)
        return num_cols, cat_cols, text_cols

    def preprocessor(self, num_cols, cat_cols, text_cols):
        transformers = [
            ('num', StandardScaler(), num_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
        ]

        for text_col in text_cols:
            transformers.append((text_col, TfidfVectorizer(), text_col))

        preprocessor = ColumnTransformer(transformers)
        return preprocessor

class DataAnalyzer:
    def __init__(self, file_path, target_column):
        self.file_path = file_path
        self.target_column = target_column

    def sweetviz_analysis(self, df, target_column, num_cols, cat_cols, text_cols):
        target_feature = target_column
        feature_config = sv.FeatureConfig(skip=(),
                                          force_num=num_cols,
                                          force_cat=cat_cols,
                                          force_text=text_cols)
        my_report = sv.analyze(df, target_feat=target_feature, feat_cfg=feature_config)
        my_report.show_html(filepath='sweetviz_report.html', open_browser=True, layout='vertical', scale=None)

    def data_visualizer(self):
        # Load data
        data_loader = DataLoader(self.file_path)
        df = data_loader.load_data()

        # Preprocess data
        preprocessor = DataPreprocessor(df)
        df = preprocessor.nan_values()
        X, y = preprocessor.split_data(self.target_column)
        num_cols, cat_cols, text_cols = preprocessor.cols_type(X)

        # Sweetviz Analysis
        self.sweetviz_analysis(df, self.target_column, num_cols, cat_cols, text_cols)

class ModelTrainer:
    def __init__(self, preprocessor):
        self.preprocessor = preprocessor

    def split_train_test(self, X, y):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        return X_train, X_test, y_train, y_test

    def preprocess_data(self, X_train, X_test):
        X_train = self.preprocessor.fit_transform(X_train)
        X_test = self.preprocessor.transform(X_test)
        return X_train, X_test

    def create_model(self, input_dim, layers, activation, dropouts, learning_rate):
        model = Sequential()
        model.add(Input(shape=(input_dim,)))
        for neurons, dropout in zip(layers, dropouts):
            model.add(Dense(neurons, activation=activation))
            model.add(Dropout(dropout))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy', 'auc'])
        return model

    def train_model(self, model, X_train, y_train, epochs):
        history = model.fit(X_train, y_train, epochs=epochs, batch_size=32, validation_split=0.2, verbose=1)
        return history

    def predict(self, model, X_test):
        y_pred_prob = model.predict(X_test)
        y_pred = (y_pred_prob > 0.5).astype(int)
        return y_pred, y_pred_prob

class ModelEvaluator:
    def evaluate_model(self, y_test, y_pred, y_pred_prob):
        accuracy = accuracy_score(y_test, y_pred)
        classification_report_str = classification_report(y_test, y_pred)
        confusion_matrix_values = confusion_matrix(y_test, y_pred, normalize='true')
        roc_auc = roc_auc_score(y_test, y_pred_prob)
        return accuracy, classification_report_str, confusion_matrix_values, roc_auc

    def print_results(self, accuracy, roc_auc, classification_report_str, confusion_matrix_values):
        print(f"Accuracy: {accuracy}")
        print(f"ROC AUC: {roc_auc}")
        print("Classification Report:")
        print(classification_report_str)
        print("Confusion Matrix:")
        print(confusion_matrix_values)

    def plot_metrics(self, history):
        fig1 = go.Figure()
        fig1.add_trace(go.Scatter(x=list(range(len(history.history['accuracy']))), y=history.history['accuracy'], mode='lines', name='Treinamento'))
        fig1.add_trace(go.Scatter(x=list(range(len(history.history['val_accuracy']))), y=history.history['val_accuracy'], mode='lines', name='Validação'))
        fig1.update_layout(title='Accuracy durante o Treinamento e Validação', xaxis_title='Época', yaxis_title='Accuracy')
        fig1.show()

        fig2 = go.Figure()
        fig2.add_trace(go.Scatter(x=list(range(len(history.history['auc']))), y=history.history['auc'], mode='lines', name='Treinamento'))
        fig2.add_trace(go.Scatter(x=list(range(len(history.history['val_auc']))), y=history.history['val_auc'], mode='lines', name='Validação'))
        fig2.update_layout(title='AUC durante o Treinamento e Validação', xaxis_title='Época', yaxis_title='AUC')
        fig2.show()

        fig3 = go.Figure()
        fig3.add_trace(go.Scatter(x=list(range(len(history.history['loss']))), y=history.history['loss'], mode='lines', name='Treinamento'))
        fig3.add_trace(go.Scatter(x=list(range(len(history.history['val_loss']))), y=history.history['val_loss'], mode='lines', name='Validação'))
        fig3.update_layout(title='Perda durante o Treinamento e Validação', xaxis_title='Época', yaxis_title='Perda')
        fig3.show()

In [10]:
def main(file_path, target_column, layers, dropouts, activation, learning_rate, epochs, use_sweetviz):
    # Sweetviz analysis (opcional)
    if use_sweetviz:
        data_analyzer = DataAnalyzer(file_path, target_column)
        data_analyzer.data_visualizer()
    
    # Load data
    data_loader = DataLoader(file_path)
    df = data_loader.load_data()

    # Preprocess data
    preprocessor = DataPreprocessor(df)
    df = preprocessor.nan_values()
    X, y = preprocessor.split_data(target_column)
    num_cols, cat_cols, text_cols = preprocessor.cols_type(X)
    preprocessor_instance = preprocessor.preprocessor(num_cols, cat_cols, text_cols)

    # Train model
    model_trainer = ModelTrainer(preprocessor_instance)
    X_train, X_test, y_train, y_test = model_trainer.split_train_test(X, y)
    X_train, X_test = model_trainer.preprocess_data(X_train, X_test)
    model = model_trainer.create_model(X_train.shape[1], layers, activation, dropouts, learning_rate)
    history = model_trainer.train_model(model, X_train, y_train, epochs)
    y_pred, y_pred_prob = model_trainer.predict(model, X_test)

    # Evaluate model
    model_evaluator = ModelEvaluator()
    accuracy, classification_report_str, confusion_matrix_values, roc_auc = model_evaluator.evaluate_model(y_test, y_pred, y_pred_prob)
    model_evaluator.print_results(accuracy, roc_auc, classification_report_str, confusion_matrix_values)
    model_evaluator.plot_metrics(history)

if __name__ == '__main__':
    # Main parameters
    file_path = './data/itens_desertos_etapa3_consolidada_202407040633.csv'
    target_column = 'in_deserto'
    layers = [16, 4]  # Neurons for each layer
    dropouts = [0.5, 0.5]  # Dropout for each layer
    activation = 'relu'
    learning_rate = 0.00001
    epochs = 3
    use_sweetviz = False
    main(file_path, target_column, layers, dropouts, activation, learning_rate, epochs, use_sweetviz)


Epoch 1/3
[1m26094/26094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 3ms/step - accuracy: 0.9617 - auc: 0.4982 - loss: 0.6475 - val_accuracy: 0.9618 - val_auc: 0.5000 - val_loss: 0.5076
Epoch 2/3
[1m26094/26094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 2ms/step - accuracy: 0.9619 - auc: 0.4994 - loss: 0.4728 - val_accuracy: 0.9618 - val_auc: 0.5000 - val_loss: 0.3444
Epoch 3/3
[1m26094/26094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 2ms/step - accuracy: 0.9615 - auc: 0.4990 - loss: 0.3470 - val_accuracy: 0.9618 - val_auc: 0.5000 - val_loss: 0.2397
[1m8155/8155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step



Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



Accuracy: 0.9615956525404892
ROC AUC: 0.5
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98    250913
           1       0.00      0.00      0.00     10021

    accuracy                           0.96    260934
   macro avg       0.48      0.50      0.49    260934
weighted avg       0.92      0.96      0.94    260934

Confusion Matrix:
[[1. 0.]
 [1. 0.]]



Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

