# Sobre

Esse notebook é uma Prova de Conceito de aplicação de machine learning para a previsão de itens desertos no ComprasGov.

# Bibliotecas

In [None]:
<!-- pip install pandas numpy matplotlib seaborn psycopg2 scikit-learn scipy xgboost lightgbm catboost tpot dataviz -->

# Importing Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import psycopg2
from sklearn.decomposition import PCA
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report
from tpot import TPOTClassifier

# Database Connection
def read_data_from_postgres():
    """Load the ComprasGov table from PostgreSQL into a DataFrame.

    Opens a connection with the placeholder credentials below, runs a
    ``SELECT *`` against ``comprasgov_table`` and returns the result.

    Returns:
        pandas.DataFrame: the rows returned by the query.

    Raises:
        Exception: whatever ``psycopg2.connect`` or ``pd.read_sql_query``
            raised. The error is printed and then re-raised, so the caller
            never continues without data.
    """
    # Stays None when connect() itself fails, so the cleanup below can tell
    # whether there is anything to close.
    connection = None
    try:
        connection = psycopg2.connect(
            dbname='your_db_name',
            user='your_username',
            password='your_password',
            host='localhost',
            port='5432'
        )

        sql_query = "SELECT * FROM comprasgov_table"  # Replace with your actual table name
        return pd.read_sql_query(sql_query, connection)

    except Exception as e:
        print("An error occurred:", e)
        # Propagate the real failure instead of falling through to a
        # ``return`` of a variable that was never assigned.
        raise
    finally:
        if connection is not None:
            connection.close()

# ETL for Data Exploration and Feature Engineering
def data_etl(df):
    """Placeholder ETL step for data exploration and feature engineering.

    No transformation is implemented yet; the input object is handed back
    unchanged.
    """
    # Your ETL code specific to ComprasGov here
    transformed = df
    return transformed

# Data Visualization and Report Export
def data_visualization(df):
    """Placeholder for the visualization / report-export step.

    Nothing is implemented yet; the function ignores ``df`` and returns
    ``None``.
    """
    # Your DataViz code specific to ComprasGov here
    # Export to PDF or your preferred format
    return None

# Remove Outliers
def remove_outliers(df, threshold=3):
    """Drop rows whose numeric values are z-score outliers.

    A row is kept only if, for every numeric column, the absolute z-score of
    its value is below ``threshold``. Non-numeric columns are ignored when
    scoring but are preserved in the returned frame.

    Args:
        df: input ``pandas.DataFrame``.
        threshold: absolute z-score at or above which a value counts as an
            outlier. Defaults to 3.

    Returns:
        pandas.DataFrame: the rows of ``df`` that contain no outlier.
    """
    numeric = df.select_dtypes(include=[np.number])
    if numeric.empty:
        # No numeric columns (or no rows): nothing can be scored.
        return df.copy()

    values = numeric.to_numpy(dtype=float)
    # nan_policy="omit" keeps one missing cell from turning the z-scores of
    # its whole column into NaN.
    with np.errstate(divide="ignore", invalid="ignore"):
        z_scores = np.abs(stats.zscore(values, nan_policy="omit"))

    # A NaN score comes from a constant column (zero variance) or a missing
    # cell. Neither is evidence of an outlier, and ``NaN < threshold`` is
    # False, so without this mask every row would be discarded.
    keep = (np.isnan(z_scores) | (z_scores < threshold)).all(axis=1)
    return df[keep]

# Run PCA
def run_pca(X, n_components=5):
    """Fit a PCA on ``X`` and return ``X`` projected onto its components.

    A new PCA is fitted on every call, so projections from two separate
    calls (e.g. training data and validation data) do not share the same
    component space.

    Args:
        X: 2-D feature matrix.
        n_components: value forwarded to ``PCA``. Defaults to 5. An integer
            larger than the number of rows or columns of ``X`` is reduced to
            the largest value the data allows, instead of letting PCA raise.

    Returns:
        The array returned by ``PCA.fit_transform``.
    """
    if isinstance(n_components, int):
        shape = np.shape(X)
        if len(shape) == 2:
            # PCA rejects n_components > min(n_samples, n_features).
            n_components = min(n_components, *shape)

    pca = PCA(n_components=n_components)
    return pca.fit_transform(X)

# AutoML for Model Selection and Ranking
def run_automl(X_train, y_train):
    """Fit a ``TPOTClassifier`` on the training data and return it.

    The classifier is configured with 2 generations, a population size of 20
    and verbosity level 2.
    """
    search_settings = {"generations": 2, "population_size": 20, "verbosity": 2}
    optimizer = TPOTClassifier(**search_settings)
    optimizer.fit(X_train, y_train)
    return optimizer

# Train, Test, and Evaluate Machine Learning Models
def train_test_evaluate_models(X, y, X_val, y_val):
    """Train several classifiers and report test and validation accuracy.

    ``X``/``y`` are split 80/20 (``random_state=42``) into train and test
    parts. Each model is trained on the train part, scored on the test part
    and additionally scored on the external ``X_val``/``y_val`` set. Metrics
    are printed as each model finishes.

    Args:
        X: feature matrix used for the train/test split.
        y: target values matching ``X``.
        X_val: external validation features, in the same feature space as
            ``X``.
        y_val: target values matching ``X_val``.

    Returns:
        dict: model name -> dict with the keys ``'Test Accuracy'``,
        ``'Validation Accuracy'`` and ``'Classification Report'``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    models = {
        # run_automl returns an already-fitted model.
        "AutoML": run_automl(X_train, y_train),
        "RandomForest": RandomForestClassifier(),
        "XGBoost": XGBClassifier(),
        "LightGBM": LGBMClassifier(),
        "CatBoost": CatBoostClassifier(verbose=0)
    }
    # Fitting these again would repeat the whole AutoML search for nothing.
    already_fitted = {"AutoML"}

    metrics = {}

    for name, model in models.items():
        if name not in already_fitted:
            model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # Validate with external CSV
        y_val_pred = model.predict(X_val)
        val_accuracy = accuracy_score(y_val, y_val_pred)

        # Computed once and reused for both the result dict and the printout.
        report = classification_report(y_test, y_pred)

        metrics[name] = {
            'Test Accuracy': accuracy,
            'Validation Accuracy': val_accuracy,
            'Classification Report': report
        }

        print(f"{name} Model Metrics:")
        print(f"Test Accuracy: {accuracy}")
        print(f"Validation Accuracy: {val_accuracy}")
        print("Classification Report:")
        print(report)

    return metrics

# Script entry point: load the data, clean it, reduce it with PCA, then train
# and compare the models against an external validation CSV.
if __name__ == "__main__":
    # Reading Data from PostgreSQL
    df = read_data_from_postgres()

    # ETL Operations
    df = data_etl(df)

    # Data Visualization and Report Export
    data_visualization(df)

    # Remove Outliers
    df = remove_outliers(df)

    # Prepare Data for Models
    X = df.drop('in_deserto', axis=1)  # 'in_deserto' is the target attribute
    y = df['in_deserto']

    # Read Validation Data from CSV
    df_val = pd.read_csv('your_validation_file.csv')
    X_val = df_val.drop('in_deserto', axis=1)
    y_val = df_val['in_deserto']

    # Run PCA: fit the projection on the training features only and reuse it
    # for the validation features. Fitting a second PCA on the validation set
    # would put it in a different component space than the one the models
    # were trained on, making the validation accuracy meaningless.
    pca = PCA(n_components=5)  # Adjust based on your needs
    X = pca.fit_transform(X)
    X_val = pca.transform(X_val)

    # Train, Test, and Evaluate Models
    metrics = train_test_evaluate_models(X, y, X_val, y_val)

    # Comparative Metrics
    print("\nComparative Metrics:")
    for name, metric in metrics.items():
        print(f"{name} - Test Accuracy: {metric['Test Accuracy']}, Validation Accuracy: {metric['Validation Accuracy']}")