### Libs

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import joblib
from sklearn.svm import LinearSVC
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef, cohen_kappa_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, KFold
import sys
import os
from os import listdir
from os.path import isfile, join
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from tabulate import tabulate
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from deslib.dcs import OLA
# from deslib.dcs import KNORA_U, KNORA_E
from deslib.des import  KNOP, METADES
from deslib.static import SingleBest, StackedClassifier
from deslib.static import StaticSelection
from imblearn.metrics import geometric_mean_score

### Feature Extraction

In [2]:
# Maps the prefix of a feature line (the text before "::") to the 1-based index
# of the feature set it is counted under. Several prefixes share set 3.
FEATURES_SET = {
    "feature": 1,
    "permission": 2,
    "activity": 3,
    "service_receiver": 3,
    "provider": 3,
    "service": 3,
    "intent": 4,
    "api_call": 5,
    "real_permission": 6,
    "call": 7,
    "url": 8
}


def count_feature_set(lines):
    """
    Count how many features belong to each feature set.

    :param lines: iterable of feature lines, each formatted as "<prefix>::<value>";
        blank and whitespace-only lines are ignored
    :return: list of counts, one entry per feature set, ordered from set 1 upwards
    :raises KeyError: if a non-blank line starts with a prefix missing from FEATURES_SET
    """
    # Size the result from the mapping itself so adding a set only needs one edit.
    counts = [0] * max(FEATURES_SET.values())
    for line in lines:
        entry = line.strip()
        if not entry:
            continue
        prefix = entry.split("::", 1)[0]
        # Set indices are 1-based; list positions are 0-based.
        counts[FEATURES_SET[prefix] - 1] += 1
    return counts


In [None]:
def read(LOAD_DATA=False):
    """
    Return the feature matrix and labels, from cache or from the raw text files.

    :param LOAD_DATA: when False (default) load ``x_all.npy`` / ``y_all.npy`` from the
        working directory; when True rebuild both arrays from the raw feature-vector
        files and overwrite those two cache files
    :return: tuple ``(x, y)`` of numpy arrays; when rebuilt, rows labelled 1 (file name
        listed in the first column of ``sha256_family.csv``) come first, followed by
        rows labelled 0
    """
    if not LOAD_DATA:
        print("Loading previous data ...")
        x_ = np.load(r"x_all.npy")
        y_ = np.load(r"y_all.npy")
        print(x_.shape, y_.shape)
        return x_, y_

    print("Previous data not loaded. Attempt to read data ...")
    # Build paths with join so they resolve on every operating system.
    metadata_dir = join("Drebin", "MetaData")
    mypath = join(metadata_dir, "feature_vectors", "feature_vectors")
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

    print("Reading csv file for ground truth ...")
    # ndmin=2 keeps the array two-dimensional even when the file has a single data row.
    ground_truth = np.loadtxt(join(metadata_dir, "sha256_family.csv"),
                              delimiter=",", skiprows=1, dtype=str, ndmin=2)
    # A set gives constant-time membership tests instead of scanning the column per file.
    positive_names = set(ground_truth[:, 0].tolist())

    print("Reading positive and negative texts ...")
    pos = []
    neg = []
    for virus in tqdm(onlyfiles):
        if virus in positive_names:
            pos.append(virus)
        else:
            neg.append(virus)

    print("Extracting features ...")
    x = []
    y = []
    for label, file_names in ((1, pos), (0, neg)):
        for text_file in tqdm(file_names):
            # Context manager closes every file; sys.stdin is left untouched.
            with open(join(mypath, text_file)) as handle:
                features = handle.readlines()
            x.append(count_feature_set(features))
            y.append(label)

    print("Data is read successfully:")
    x = np.array(x)
    y = np.array(y)
    print(x.shape, y.shape)

    print("Saving data under data_numpy directory ...")
    np.save(r"x_all.npy", x)
    np.save(r"y_all.npy", y)

    return x, y


def map_family_to_category(families):
    """
    Assign a 1-based integer category to each family name.

    :param families: iterable of family names
    :return: dict mapping each family to its position in ``families`` counted
        from 1; a repeated family keeps the position of its last occurrence
    """
    return {family: position for position, family in enumerate(families, start=1)}


# Runs only when this code is executed as the main module.
if __name__ == "__main__":
    # Swap in the call below to rebuild the cached arrays from the raw feature files.
    #x, y = read(LOAD_DATA=True)
    # Default: load the arrays cached in x_all.npy / y_all.npy.
    x, y = read()

### Dataframe

In [None]:
# Load the cached data set and make one stratified 80/20 hold-out split.
x_all, y_all = read(LOAD_DATA=False)
# random_state is fixed so the split is identical on every run of the notebook.
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, stratify=y_all, random_state=42
)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

### Algorithms

In [5]:
# Estimators evaluated by CreateModel, keyed by the name used for their output
# folder and metrics file. Commented-out entries are disabled.
classifiers = {
    # 'DecisionTree': DecisionTreeClassifier(criterion='gini'),
    # 'Knn': KNeighborsClassifier(n_neighbors=7, n_jobs=-1),
    'NaiveBayes': GaussianNB(),
    # 'BaggingDecisionTree': BaggingClassifier(estimator=DecisionTreeClassifier(criterion='gini'), n_estimators=100, n_jobs=-1),
    'BaggingNaiveBayes': BaggingClassifier(estimator=GaussianNB(), n_estimators=100, n_jobs=-1),
    # 'BaggingKnn': BaggingClassifier(estimator=KNeighborsClassifier(n_neighbors=7), n_estimators=100, n_jobs=-1),
    # 'GradientBoostedDecisionTree': GradientBoostingClassifier(n_estimators=100),
    # 'RandomForest': RandomForestClassifier(n_estimators=100, n_jobs=-1),
    # 'MLP': MLPClassifier(hidden_layer_sizes=(100), max_iter=1000),
    # 'BaggingMLP': BaggingClassifier(estimator=MLPClassifier(hidden_layer_sizes=(100), max_iter=1000), n_estimators=100, n_jobs=-1),
    }

# Classes (not instances) evaluated by CreateDESModel, which instantiates each one
# with a fitted pool. Every entry is currently commented out, so this dict is empty
# and the loops over DES.items() run zero iterations.
DES = {
    # "SingleBest": SingleBest,
    # "StaticSelection": StaticSelection,
    # "OLA": OLA,
    # "KNOP": KNOP,
    # "METADES": METADES
}

### Time-Saving Helper Functions

In [12]:
def CreateModel(x_all, y_all, model, modelName, balance, num_rep=30):
    """
    Train and evaluate ``model`` over repeated stratified hold-out splits.

    For each repetition ``i`` the data is split 80/20 (stratified, seeded with ``i``),
    the training part is optionally oversampled with SMOTE, the model is refit and
    scored on the untouched test part, and the fitted model is saved with joblib.
    The mean and standard deviation of every metric are written to a one-row CSV.

    :param x_all: feature matrix passed to ``train_test_split``
    :param y_all: labels used for stratification and scoring
    :param model: estimator exposing ``fit`` and ``predict``; the same instance is
        refit on every repetition
    :param modelName: name used for the model folder and the metrics file
    :param balance: when truthy, oversample the training split with SMOTE and write
        under ``Balanced``; otherwise write under ``Unbalanced``
    :param num_rep: number of repetitions
    :return: None; models and metrics are written to disk
    """
    # Pick the output root once so only the folder that is actually used gets created.
    output_root = "Balanced" if balance else "Unbalanced"
    model_dir = os.path.join(output_root, modelName)
    os.makedirs(model_dir, exist_ok=True)

    metric_names = ('Accuracy', 'Precision', 'Recall', 'F1_score',
                    'Roc_auc', 'G-Mean', 'MCC', 'Cohen_Kappa')
    scores = {metric: [] for metric in metric_names}

    for i in tqdm(range(num_rep)):
        # Seed the split with the repetition index in both modes, so balanced and
        # unbalanced runs are reproducible and are scored on the same test samples.
        x_train, x_test, y_train, y_test = train_test_split(
            x_all, y_all, test_size=0.2, stratify=y_all, random_state=i)

        if balance:
            # Oversample the training split only; the test split is left as drawn.
            smote = SMOTE(sampling_strategy='auto', k_neighbors=5, random_state=i)
            x_train, y_train = smote.fit_resample(x_train, y_train)

        # Fit on the training split and predict the test split.
        model.fit(x_train, y_train)
        previsoes = model.predict(x_test)

        acuracia = accuracy_score(y_test, previsoes)
        precisao = precision_score(y_test, previsoes)
        recall = recall_score(y_test, previsoes)
        f1 = f1_score(y_test, previsoes)
        # NOTE(review): ROC AUC is computed from hard class predictions rather than
        # scores/probabilities - confirm this is the intended definition.
        roc_auc = roc_auc_score(y_test, previsoes)
        geoMedia = geometric_mean_score(y_test, previsoes)
        mcc = matthews_corrcoef(y_test, previsoes)
        kappa = cohen_kappa_score(y_test, previsoes)

        repetition_values = (acuracia, precisao, recall, f1, roc_auc, geoMedia, mcc, kappa)
        for metric, value in zip(metric_names, repetition_values):
            scores[metric].append(value)

        # Save the model fitted in this repetition.
        model_path = os.path.join(model_dir, f'model_{i+1}.joblib')
        joblib.dump(model, model_path)

        print(f'Repetição {i+1} {modelName} - Acurácia: {acuracia:.2f}, Precisão: {precisao:.2f}, Recall: {recall:.2f}, F1-score: {f1:.2f}, ROC AUC: {roc_auc:.2f}')

    # One row: the mean of every metric first, then the standard deviations.
    summary = {metric: [np.mean(values)] for metric, values in scores.items()}
    summary.update({f'{metric}_std': [np.std(values)] for metric, values in scores.items()})
    metrics_df = pd.DataFrame(summary)

    directory = os.path.join(output_root, "Metrics")
    os.makedirs(directory, exist_ok=True)

    csv_path = os.path.join(directory, f'{modelName}_metrics.csv')
    metrics_df.to_csv(csv_path, index=False)

    print(f'Métricas salvas em {csv_path}')

In [13]:
def CreateDESModel(x_all, y_all, model, modelName, balance, num_rep=30):
    """
    Evaluate a selection/ensemble class built on a random-forest pool over repeated splits.

    For each repetition ``i`` the data is split 80/20 (stratified, seeded with ``i``),
    the training part is optionally oversampled with SMOTE, a 100-tree random forest
    is fitted as the pool, ``model(pool)`` is fitted and scored on the test part, and
    the fitted object is pickled. The mean and standard deviation of every metric are
    written to a one-row CSV.

    :param x_all: feature matrix passed to ``train_test_split``
    :param y_all: labels used for stratification and scoring
    :param model: class (not instance) called as ``model(pool)``; the result must
        expose ``fit`` and ``predict``
    :param modelName: name used for the model folder and the metrics file
    :param balance: when truthy, oversample the training split with SMOTE and write
        under ``Balanced``; otherwise write under ``Unbalanced``
    :param num_rep: number of repetitions
    :return: None; models and metrics are written to disk
    """
    output_root = "Balanced" if balance else "Unbalanced"
    model_dir = os.path.join(output_root, modelName)
    os.makedirs(model_dir, exist_ok=True)

    metric_names = ('Accuracy', 'Precision', 'Recall', 'F1_score',
                    'Roc_auc', 'G-Mean', 'MCC', 'Cohen_Kappa')
    scores = {metric: [] for metric in metric_names}

    for i in tqdm(range(num_rep)):
        x_train, x_test, y_train, y_test = train_test_split(
            x_all, y_all, test_size=0.2, stratify=y_all, random_state=i)

        if balance:
            # Oversample the training split only; the test split is left as drawn.
            smote = SMOTE(sampling_strategy='auto', k_neighbors=5, random_state=i)
            x_train, y_train = smote.fit_resample(x_train, y_train)

        # Pool of classifiers; seeded so every repetition can be reproduced.
        ensemble = RandomForestClassifier(n_estimators=100, random_state=i)
        ensemble.fit(x_train, y_train)

        # NOTE(review): the selector is fitted on the same samples used to train the
        # pool; confirm whether a separate selection (DSEL) split is intended.
        des_model = model(ensemble)
        des_model.fit(x_train, y_train)

        previsoes = des_model.predict(x_test)

        acuracia = accuracy_score(y_test, previsoes)
        precisao = precision_score(y_test, previsoes)
        recall = recall_score(y_test, previsoes)
        f1 = f1_score(y_test, previsoes)
        # NOTE(review): ROC AUC is computed from hard class predictions rather than
        # scores/probabilities - confirm this is the intended definition.
        roc_auc = roc_auc_score(y_test, previsoes)
        geoMedia = geometric_mean_score(y_test, previsoes)
        mcc = matthews_corrcoef(y_test, previsoes)
        kappa = cohen_kappa_score(y_test, previsoes)

        repetition_values = (acuracia, precisao, recall, f1, roc_auc, geoMedia, mcc, kappa)
        for metric, value in zip(metric_names, repetition_values):
            scores[metric].append(value)

        # Persist the fitted object; pickling ``model`` would store only the class.
        model_path = os.path.join(model_dir, f'model_{i+1}.pkl')
        with open(model_path, 'wb') as file:
            pickle.dump(des_model, file)

        print(f'Repetição {i+1} {modelName} - Acurácia: {acuracia:.2f}, Precisão: {precisao:.2f}, Recall: {recall:.2f}, F1-score: {f1:.2f}, ROC AUC: {roc_auc:.2f}')

    # One row: the mean of every metric first, then the standard deviations.
    summary = {metric: [np.mean(values)] for metric, values in scores.items()}
    summary.update({f'{metric}_std': [np.std(values)] for metric, values in scores.items()})
    metrics_df = pd.DataFrame(summary)

    directory = os.path.join(output_root, "Metrics")
    os.makedirs(directory, exist_ok=True)

    csv_path = os.path.join(directory, f'{modelName}_metrics.csv')
    metrics_df.to_csv(csv_path, index=False)

    print(f'Métricas salvas em {csv_path}')

### Models

#### Unbalanced

In [None]:
# Evaluate every registered classifier without SMOTE (balance=False).
for name, classifier in classifiers.items():
    CreateModel(x_all, y_all, classifier, name, False)

In [None]:
# Evaluate every registered DES class without SMOTE; `classifier` is a class here.
for name, classifier in DES.items():
    CreateDESModel(x_all, y_all, classifier, name, balance=False)

#### Balanced

In [None]:
# Evaluate every registered classifier with SMOTE applied to the training split (balance=True).
for name, classifier in classifiers.items():
    CreateModel(x_all, y_all, classifier, name, True)

In [11]:
# Evaluate every registered DES class with SMOTE applied to the training split.
for name, classifier in DES.items():
    CreateDESModel(x_all, y_all, classifier, name, balance=True)

### Results

In [None]:
import os
import pandas as pd

# Folders holding the per-model metrics CSVs. They are built with os.path.join so
# they match the paths used when the files were written, on any operating system
# (a literal backslash is only a path separator on Windows).
unbalanced_dir = os.path.join("Unbalanced", "Metrics")
balanced_dir = os.path.join("Balanced", "Metrics")
# Empty string: unified files are written to the current working directory.
output_dir = r""

# Derive the model name from a metrics file name
def extract_model_name(filename):
    """
    Return the file name of ``filename`` with every ``'_metrics.csv'`` removed.

    :param filename: path or bare file name of a metrics CSV
    :return: the final path component with the ``'_metrics.csv'`` text stripped out
    """
    file_part = os.path.split(filename)[1]
    return file_part.replace('_metrics.csv', '')

# Merge every per-model metrics CSV of a folder into one table
def unify_csvs(directory, output_filename):
    """
    Concatenate all ``*_metrics.csv`` files of ``directory`` into a single table.

    A ``Model`` column holding the model name taken from each file name is inserted
    as the first column. Files are processed in sorted name order so the row order
    is the same on every run. The result is also written to
    ``output_dir/output_filename``.

    :param directory: folder to scan for files ending in ``_metrics.csv``
    :param output_filename: name of the unified CSV, joined onto ``output_dir``
    :return: the unified DataFrame
    :raises ValueError: if ``directory`` contains no ``*_metrics.csv`` file
    """
    # os.listdir returns entries in arbitrary order; sort for a stable result.
    csv_files = sorted(
        os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('_metrics.csv')
    )
    if not csv_files:
        raise ValueError(f"No '*_metrics.csv' files found in {directory!r}")

    dataframes = []
    for file in csv_files:
        df = pd.read_csv(file)
        df.insert(0, 'Model', extract_model_name(file))
        dataframes.append(df)

    unified_df = pd.concat(dataframes, ignore_index=True)
    unified_path = os.path.join(output_dir, output_filename)
    unified_df.to_csv(unified_path, index=False)
    return unified_df

# Render a DataFrame as a LaTeX table file
def df_to_latex(df, output_filename):
    """
    Write ``df`` as a LaTeX table (without the index) to ``output_dir/output_filename``.

    :param df: DataFrame to render
    :param output_filename: name of the ``.tex`` file, joined onto ``output_dir``
    """
    rendered = df.to_latex(index=False)
    target_path = os.path.join(output_dir, output_filename)
    with open(target_path, 'w') as tex_file:
        tex_file.write(rendered)

# Unify the per-model CSVs of each folder; LaTeX export is currently disabled
unified_unbalanced = unify_csvs(unbalanced_dir, 'UnifiedMetricsUnbalanced.csv')
# df_to_latex(unified_unbalanced, 'LatexUnifiedMetricsUnbalanced.tex')

unified_balanced = unify_csvs(balanced_dir, 'UnifiedMetricsBalanced.csv')
# df_to_latex(unified_balanced, 'LatexUnifiedMetricsBalanced.tex')

print("Processo concluído com sucesso!")


In [None]:
import pandas as pd

# Load the unified metrics tables. The metric columns must still be numeric here,
# i.e. not yet rewritten into the "mean(std)" text form.
unbalanced_df = pd.read_csv(r'UnifiedMetricsUnbalanced.csv')
balanced_df = pd.read_csv(r'UnifiedMetricsBalanced.csv')

# Keep only the balanced rows whose model name does not end in '-2'. The copy makes
# this an independent frame rather than a view of balanced_df. No renaming is needed:
# none of these names carries the '-2' suffix.
filtered_balanced_df = balanced_df[~balanced_df['Model'].str.endswith('-2')].copy()

# Sort both frames by model name so the output rows follow a stable order
unbalanced_df = unbalanced_df.sort_values(by='Model').reset_index(drop=True)
filtered_balanced_df = filtered_balanced_df.sort_values(by='Model').reset_index(drop=True)

# Metric columns: everything except 'Model' (Index.difference returns them sorted)
metric_columns = unbalanced_df.columns.difference(['Model'])

# Align the two tables on the model name instead of on row position, so a model
# missing from one table yields NaN rather than being subtracted from another model.
balanced_by_model = filtered_balanced_df.set_index('Model')[metric_columns]
unbalanced_by_model = unbalanced_df.set_index('Model')[metric_columns]

# balanced minus unbalanced, one row per model of the unbalanced table
diff_df = (balanced_by_model.reindex(unbalanced_by_model.index) - unbalanced_by_model).reset_index()

# Save the differences
diff_df.to_csv(r'DiffMetrics.csv', index=False)

print("Comparação finalizada e salva no arquivo 'DiffMetrics.csv'")


In [None]:
import pandas as pd

# Load the unified metrics tables. The metric columns must still be numeric here,
# i.e. not yet rewritten into the "mean(std)" text form.
unbalanced_df = pd.read_csv(r'UnifiedMetricsUnbalanced.csv')
balanced_df = pd.read_csv(r'UnifiedMetricsBalanced.csv')

# Keep only the balanced rows whose model name ends in '-2'. The copy makes this an
# independent frame, so the rename below does not write into a view of balanced_df.
models_with_suffix = balanced_df[balanced_df['Model'].str.endswith('-2')].copy()

# Drop only the trailing '-2' (every remaining name ends with it) so the names can be
# matched against the unbalanced table; a '-2' elsewhere in a name is preserved.
models_with_suffix['Model'] = models_with_suffix['Model'].str[:-2]

# Sort both frames by model name so the output rows follow a stable order
unbalanced_df = unbalanced_df.sort_values(by='Model').reset_index(drop=True)
models_with_suffix = models_with_suffix.sort_values(by='Model').reset_index(drop=True)

# Metric columns: everything except 'Model' (Index.difference returns them sorted)
metric_columns = unbalanced_df.columns.difference(['Model'])

# Align the two tables on the model name instead of on row position, so a model
# missing from one table yields NaN rather than being subtracted from another model.
suffix_by_model = models_with_suffix.set_index('Model')[metric_columns]
unbalanced_by_model = unbalanced_df.set_index('Model')[metric_columns]

# balanced '-2' minus unbalanced, one row per '-2' model
diff_df = (suffix_by_model - unbalanced_by_model.reindex(suffix_by_model.index)).reset_index()

# Save the differences
diff_df.to_csv(r'DiffMetrics-2.csv', index=False)

print("Comparação finalizada e salva no arquivo 'DiffMetrics-2.csv'")


In [None]:
import pandas as pd

# Load the two difference tables
diff_df_normal = pd.read_csv(r"DiffMetrics.csv")
diff_df_modelo_2 = pd.read_csv(r"DiffMetrics-2.csv")

# Restore the '-2' suffix on the models that come from DiffMetrics-2.csv
diff_df_modelo_2['Model'] = diff_df_modelo_2['Model'] + '-2'

# Stack the two tables
combined_diff_df = pd.concat([diff_df_normal, diff_df_modelo_2], ignore_index=True)

# Drop the standard-deviation columns
columns_to_drop = [col for col in combined_diff_df.columns if col.endswith('_std')]
final_df = combined_diff_df.drop(columns=columns_to_drop)

# Save the result
final_df.to_csv(r"DiffMetricsFinal.csv", index=False)

# The message names the file that is actually written above.
print("Unificação finalizada e salva no arquivo 'DiffMetricsFinal.csv'")

In [5]:
import pandas as pd

# Format the requested columns as fixed-decimal strings, skipping absent columns
def limitar_decimais(df, colunas, casas_decimais=4):
    """
    Replace the values of the given columns with fixed-decimal strings, in place.

    :param df: DataFrame to modify (the same object is returned)
    :param colunas: column names to format; names not present in ``df`` are skipped
    :param casas_decimais: number of digits after the decimal point
    :return: ``df`` itself, with non-null values of the formatted columns turned
        into strings and null values left untouched
    """
    def _formatar(valor):
        if pd.notnull(valor):
            return f"{valor:.{casas_decimais}f}"
        return valor

    for coluna in colunas:
        if coluna not in df.columns:
            continue
        df[coluna] = df[coluna].apply(_formatar)
    return df

# Load the two unified metrics files
csv1 = pd.read_csv(r'UnifiedMetricsUnbalanced.csv')
csv2 = pd.read_csv(r'UnifiedMetricsBalanced.csv')

# Metric columns (every column except 'Model'), taken from the unbalanced file
colunas_metricas = [col for col in csv1.columns if col != 'Model']

# Apply the decimal-place limit; limitar_decimais modifies csv1 / csv2 in place
csv1_formatado = limitar_decimais(csv1, colunas_metricas)
csv2_formatado = limitar_decimais(csv2, colunas_metricas)

# Overwrite the source files with the formatted values
csv1_formatado.to_csv(r'UnifiedMetricsUnbalanced.csv', index=False)
csv2_formatado.to_csv(r'UnifiedMetricsBalanced.csv', index=False)


In [6]:
import pandas as pd

# Format the requested columns as fixed-decimal strings
def limitar_decimais(df, colunas, casas_decimais=4):
    """
    Replace the values of the given columns with fixed-decimal strings, in place.

    Behaves like the earlier definition of the same name in this notebook, which this
    one replaces once its cell runs: columns missing from ``df`` are skipped and null
    values are left untouched instead of being turned into the text ``'nan'``.

    :param df: DataFrame to modify (the same object is returned)
    :param colunas: column names to format
    :param casas_decimais: number of digits after the decimal point
    :return: ``df`` itself
    """
    for coluna in colunas:
        if coluna not in df.columns:
            continue
        df[coluna] = df[coluna].apply(
            lambda x: f"{x:.{casas_decimais}f}" if pd.notnull(x) else x
        )
    return df

# Load the combined differences file

csv3 = pd.read_csv(r"DiffMetricsFinal.csv")

# Metric columns (every column except 'Model')
colunas_metricas = [col for col in csv3.columns if col != 'Model']

csv3_formatado = limitar_decimais(csv3, colunas_metricas)

# Overwrite the source file with the formatted values
csv3_formatado.to_csv(r"DiffMetricsFinal.csv", index=False)


In [None]:
import pandas as pd

# Fold each metric's standard deviation into the metric column
def merge_metrics_with_std(file_path):
    """
    Rewrite a metrics CSV so each metric reads ``mean(std)`` and its ``*_std`` column is removed.

    Both numbers are rounded to four decimals before being joined. A metric is skipped
    when either its column or its ``_std`` column is absent, so calling the function
    again on an already merged file leaves the file unchanged instead of raising.

    :param file_path: path of the CSV to read and overwrite
    :return: None; the file at ``file_path`` is rewritten
    """
    df = pd.read_csv(file_path)

    # Metrics that are expected to come with a standard-deviation column
    metrics = ['Accuracy', 'Precision', 'Recall', 'F1_score', 'Roc_auc', 'G-Mean', 'MCC', 'Cohen_Kappa']

    merged_std_columns = []
    for metric in metrics:
        std_column = f"{metric}_std"
        if metric not in df.columns or std_column not in df.columns:
            continue
        df[metric] = df[metric].round(4).astype(str) + "(" + df[std_column].round(4).astype(str) + ")"
        merged_std_columns.append(std_column)

    # Drop the merged standard-deviation columns in one step, without inplace mutation.
    df = df.drop(columns=merged_std_columns)

    df.to_csv(file_path, index=False)

# File paths
unbalanced_path = r"UnifiedMetricsUnbalanced.csv"
balanced_path = r"UnifiedMetricsBalanced.csv"

# Rewrite both files in place; afterwards their metric columns hold "mean(std)" text
merge_metrics_with_std(unbalanced_path)
merge_metrics_with_std(balanced_path)

print("Arquivos processados com sucesso!")


In [None]:
import pandas as pd

# Load the three result tables
unbalanced_df = pd.read_csv(r'UnifiedMetricsUnbalanced.csv')
balanced_df = pd.read_csv(r'UnifiedMetricsBalanced.csv')
combined_diff_df = pd.read_csv(r"DiffMetricsFinal.csv")

# Metrics left out of the final tables. errors='ignore' lets the cell be re-run after
# the columns have already been removed from the files.
excluded_metrics = ['Accuracy', 'Precision', 'Cohen_Kappa']
unbalanced_df = unbalanced_df.drop(columns=excluded_metrics, errors='ignore')
balanced_df = balanced_df.drop(columns=excluded_metrics, errors='ignore')
combined_diff_df = combined_diff_df.drop(columns=excluded_metrics, errors='ignore')

# index=False keeps the row index out of the files; otherwise every save adds an
# unnamed index column that is read back as data on the next load.
unbalanced_df.to_csv(r'UnifiedMetricsUnbalanced.csv', index=False)
balanced_df.to_csv(r'UnifiedMetricsBalanced.csv', index=False)
combined_diff_df.to_csv(r"DiffMetricsFinal.csv", index=False)

# Render a DataFrame as a LaTeX table and save it to a .tex file
def df_to_latex(df, filename):
    """
    Write ``df`` as a LaTeX table (without the index) to ``filename``.

    :param df: DataFrame to render
    :param filename: path of the ``.tex`` file to create or overwrite
    """
    with open(filename, 'w') as tex_file:
        rendered = df.to_latex(index=False)
        tex_file.write(rendered)

# Convert the three DataFrames to LaTeX and save them as .tex files
df_to_latex(balanced_df,  r'UnifiedMetricsBalanced.tex')
df_to_latex(unbalanced_df,  r'UnifiedMetricsUnbalanced.tex')
df_to_latex(combined_diff_df, r"DiffMetricsFinal.tex")

print("Conversão para LaTeX finalizada e salva nos arquivos .tex.")


In [9]:
# import pandas as pd

# # Função para realizar a comparação entre os datasets balanceado e desbalanceado
# def compare_datasets(unbalanced_path, balanced_path, output_path):
#     # Carregar os CSVs
#     df_unbalanced = pd.read_csv(unbalanced_path)
#     df_balanced = pd.read_csv(balanced_path)

#     # Filtrar as linhas do balanceado que não terminam com '-2' na coluna Model
#     df_balanced_filtered = df_balanced[~df_balanced['Model'].str.endswith('-2')]

#     # Garantir que as colunas sejam as mesmas, exceto 'Model'
#     common_columns = [col for col in df_unbalanced.columns if col != 'Model']

#     for col in common_columns:
#         df_unbalanced[col] = df_unbalanced[col].apply(lambda x: float(x.split('(')[0]))
#         df_balanced_filtered[col] = df_balanced_filtered[col].apply(lambda x: float(x.split('(')[0]))

#     # Subtrair os valores entre os dois DataFrames
#     df_comparison = df_balanced_filtered.copy()
#     df_comparison[common_columns] = df_balanced_filtered[common_columns].values - df_unbalanced[common_columns].values

#     # Salvar o resultado em um novo CSV
#     df_comparison.to_csv(output_path, index=False)

# # Caminhos dos arquivos
# unbalanced_path = r"E:\DrebinStudy\UnifiedMetricsUnbalanced.csv"
# balanced_path = r"E:\DrebinStudy\UnifiedMetricsBalanced.csv"
# output_path = r"E:\DrebinStudy\ComparisonBalancedVsUnbalanced.csv"

# # Comparar os datasets
# compare_datasets(unbalanced_path, balanced_path, output_path)

# print("Arquivo de comparação gerado com sucesso!")


In [10]:
# import pandas as pd

# # Caminho para o arquivo CSV
# file_path = r'E:\DrebinStudy\ComparisonBalancedVsUnbalanced.csv'

# # Carregar o CSV
# df = pd.read_csv(file_path)

# # Ajustar os números para ter até 4 casas decimais
# df = df.applymap(lambda x: round(x, 4) if isinstance(x, (int, float)) else x)

# # Salvar o CSV modificado
# output_path = r'E:\DrebinStudy\ComparisonBalancedVsUnbalanced.csv'
# df.to_csv(output_path, index=False)

# print(f'Arquivo salvo em: {output_path}')
