In [3]:
# Analysis4NLP.ipynb

import os
import pandas as pd

# Input layout: raw result files live in <base_dir>/<folder>/ and their names
# are built from a dataset name, an auxiliary-variable name and a budget.
base_dir = 'Results/Classification'
folders = ['DeepEST', 'GBS', 'RHC-S', 'SSRS', 'SUPS']
datasets = ['imdb300AuxDS', 'imdbAuxDS', 'SSTIMDB3000AuxDS', 'SSTtestAuxDS']
aux_vars = ['confidence', 'dsa', 'entropy', 'lsa', 'similarity']
budgets = [50, 100, 200, 400, 800]

# Root directory that receives the aggregated CSV files
output_dir = 'DS4NLP_results'

# exist_ok=True makes this cell safe to re-run and avoids the race between a
# separate existence check and the directory creation.
os.makedirs(output_dir, exist_ok=True)

# Function to read data based on file extension
def read_data(file_path):
    """Load a comma-separated result file into a DataFrame.

    Both ``.csv`` and ``.txt`` files are parsed with ``pandas.read_csv`` using
    its default comma separator.

    Args:
        file_path: Location of the file, given as a ``str`` or any
            ``os.PathLike`` object (e.g. ``pathlib.Path``). The extension is
            matched case-insensitively.

    Returns:
        pandas.DataFrame: The parsed file contents.

    Raises:
        ValueError: If the file name ends in neither ``.csv`` nor ``.txt``.
    """
    # os.fspath lets callers pass pathlib.Path objects, which have no
    # str-style ``endswith`` method.
    path_str = os.fspath(file_path)
    if path_str.lower().endswith(('.csv', '.txt')):
        return pd.read_csv(path_str)
    raise ValueError(f"Unsupported file format: {file_path}")

# For every (folder, dataset, auxiliary variable) combination, stack the
# per-budget result files into one DataFrame (tagged with a 'budget' column)
# and write it to <output_dir>/<folder>/<dataset>_<aux_var>_agg.csv.
# NOTE(review): the 'In [4]' cell processes GBS, RHC-S and SSRS again with
# differently spelled auxiliary-variable names (e.g. 'Confidence_Score'), so
# the lowercase names used here may not match any file in those folders —
# confirm against the directory contents.
for folder in folders:
    folder_path = os.path.join(base_dir, folder)

    # exist_ok=True keeps the cell re-runnable without a separate (racy)
    # existence check.
    output_folder_path = os.path.join(output_dir, folder)
    os.makedirs(output_folder_path, exist_ok=True)

    # DeepEST files are named "<dataset>.<aux_var>_<budget>.csv"; every other
    # folder uses "<dataset>_<aux_var>_<budget>.txt". The choice depends only
    # on the folder, so it is made once here instead of once per file.
    if folder == 'DeepEST':
        separator, extension = '.', 'csv'
    else:
        separator, extension = '_', 'txt'

    for dataset in datasets:
        for aux_var in aux_vars:
            # One DataFrame per budget whose file could be read
            dataframes = []
            for budget in budgets:
                file_name = f"{dataset}{separator}{aux_var}_{budget}.{extension}"
                file_path = os.path.join(folder_path, file_name)

                # Missing files are reported and skipped (best effort)
                if not os.path.exists(file_path):
                    print(f"File not found: {file_name} in {folder}")
                    continue

                # Unreadable files are reported and skipped as well
                try:
                    data = read_data(file_path)
                    data['budget'] = budget
                    dataframes.append(data)
                except Exception as e:
                    print(f"Error reading {file_path}: {e}")

            # Nothing was loaded for this combination: write no output file
            if not dataframes:
                continue

            aggregated_data = pd.concat(dataframes, ignore_index=True)
            output_file = f"{dataset}_{aux_var}_agg.csv"
            output_path = os.path.join(output_folder_path, output_file)
            try:
                aggregated_data.to_csv(output_path, index=False)
                print(f"Aggregated data saved to {output_path}")
            except Exception as e:
                print(f"Error saving aggregated data for {dataset} - {aux_var}: {e}")

print("Data aggregation completed.")

Aggregated data saved to DS4NLP_results/DeepEST/imdb300AuxDS_confidence_agg.csv
Aggregated data saved to DS4NLP_results/DeepEST/imdb300AuxDS_dsa_agg.csv
Aggregated data saved to DS4NLP_results/DeepEST/imdb300AuxDS_entropy_agg.csv
Aggregated data saved to DS4NLP_results/DeepEST/imdb300AuxDS_lsa_agg.csv
Aggregated data saved to DS4NLP_results/DeepEST/imdb300AuxDS_similarity_agg.csv
Aggregated data saved to DS4NLP_results/DeepEST/imdbAuxDS_confidence_agg.csv
Aggregated data saved to DS4NLP_results/DeepEST/imdbAuxDS_dsa_agg.csv
Aggregated data saved to DS4NLP_results/DeepEST/imdbAuxDS_entropy_agg.csv
Aggregated data saved to DS4NLP_results/DeepEST/imdbAuxDS_lsa_agg.csv
Aggregated data saved to DS4NLP_results/DeepEST/imdbAuxDS_similarity_agg.csv
Aggregated data saved to DS4NLP_results/DeepEST/SSTIMDB3000AuxDS_confidence_agg.csv
Aggregated data saved to DS4NLP_results/DeepEST/SSTIMDB3000AuxDS_dsa_agg.csv
Aggregated data saved to DS4NLP_results/DeepEST/SSTIMDB3000AuxDS_entropy_agg.csv
Aggrega

Aggregated data saved to DS4NLP_results/SUPS/imdbAuxDS_confidence_agg.csv
Aggregated data saved to DS4NLP_results/SUPS/imdbAuxDS_dsa_agg.csv
Aggregated data saved to DS4NLP_results/SUPS/imdbAuxDS_entropy_agg.csv
Aggregated data saved to DS4NLP_results/SUPS/imdbAuxDS_lsa_agg.csv
Aggregated data saved to DS4NLP_results/SUPS/imdbAuxDS_similarity_agg.csv
Aggregated data saved to DS4NLP_results/SUPS/SSTIMDB3000AuxDS_confidence_agg.csv
Aggregated data saved to DS4NLP_results/SUPS/SSTIMDB3000AuxDS_dsa_agg.csv
Aggregated data saved to DS4NLP_results/SUPS/SSTIMDB3000AuxDS_entropy_agg.csv
Aggregated data saved to DS4NLP_results/SUPS/SSTIMDB3000AuxDS_lsa_agg.csv
Aggregated data saved to DS4NLP_results/SUPS/SSTIMDB3000AuxDS_similarity_agg.csv
Aggregated data saved to DS4NLP_results/SUPS/SSTtestAuxDS_confidence_agg.csv
Aggregated data saved to DS4NLP_results/SUPS/SSTtestAuxDS_dsa_agg.csv
Aggregated data saved to DS4NLP_results/SUPS/SSTtestAuxDS_entropy_agg.csv
Aggregated data saved to DS4NLP_result

In [4]:
import os
import pandas as pd

# Input layout: raw result files live in <base_dir>/<folder>/
base_dir = 'Results/Classification'
folders_to_process = ['GBS', 'RHC-S', 'SSRS']

# Name components of the per-budget result files
datasets = ['imdb300AuxDS', 'imdbAuxDS', 'SSTIMDB3000AuxDS', 'SSTtestAuxDS']
aux_vars = ['Confidence_Score', 'DSA', 'LSA', 'Prediction_Entropy', 'Similarity_Score']
budgets = [50, 100, 200, 400, 800]

# Root directory that receives the aggregated CSV files
output_dir = 'DS4NLP_results'

# exist_ok=True makes this cell safe to re-run and avoids the race between a
# separate existence check and the directory creation.
os.makedirs(output_dir, exist_ok=True)

# Function to read data based on file extension
def read_data(file_path):
    """Read a comma-separated ``.csv`` or ``.txt`` file into a DataFrame.

    NOTE(review): a function with the same name is also defined in the
    'In [3]' cell; consider keeping a single definition.

    Args:
        file_path: ``str`` or ``os.PathLike`` (e.g. ``pathlib.Path``) pointing
            at the file. The extension check ignores letter case.

    Returns:
        pandas.DataFrame: The parsed contents, read with the default comma
        separator of ``pandas.read_csv``.

    Raises:
        ValueError: If the name does not end in ``.csv`` or ``.txt``.
    """
    # Normalise to str first: pathlib.Path has no str-style ``endswith``.
    path_str = os.fspath(file_path)
    if not path_str.lower().endswith(('.csv', '.txt')):
        raise ValueError(f"Unsupported file format: {file_path}")
    return pd.read_csv(path_str)

# For each folder in folders_to_process, stack the per-budget
# "<dataset>_<aux_var>_<budget>.txt" files of every (dataset, auxiliary
# variable) pair into one DataFrame (tagged with a 'budget' column) and write
# it to <output_dir>/<folder>/<dataset>_<aux_var>_agg.csv.
for folder in folders_to_process:
    folder_path = os.path.join(base_dir, folder)

    # exist_ok=True keeps the cell re-runnable without a separate (racy)
    # existence check.
    output_folder_path = os.path.join(output_dir, folder)
    os.makedirs(output_folder_path, exist_ok=True)

    for dataset in datasets:
        for aux_var in aux_vars:
            # One DataFrame per budget whose file could be read
            dataframes = []
            for budget in budgets:
                file_name = f"{dataset}_{aux_var}_{budget}.txt"
                file_path = os.path.join(folder_path, file_name)

                # Missing files are reported and skipped (best effort)
                if not os.path.exists(file_path):
                    print(f"File not found: {file_name} in {folder}")
                    continue

                # Unreadable files are reported and skipped as well
                try:
                    data = read_data(file_path)
                    data['budget'] = budget
                    dataframes.append(data)
                except Exception as e:
                    print(f"Error reading {file_path}: {e}")

            # Nothing was loaded for this combination: write no output file
            if not dataframes:
                continue

            aggregated_data = pd.concat(dataframes, ignore_index=True)
            output_file = f"{dataset}_{aux_var}_agg.csv"
            output_path = os.path.join(output_folder_path, output_file)
            try:
                aggregated_data.to_csv(output_path, index=False)
                print(f"Aggregated data saved to {output_path}")
            except Exception as e:
                print(f"Error saving aggregated data for {dataset} - {aux_var}: {e}")

print("Data aggregation completed.")

Aggregated data saved to DS4NLP_results/GBS/imdb300AuxDS_Confidence_Score_agg.csv
Aggregated data saved to DS4NLP_results/GBS/imdb300AuxDS_DSA_agg.csv
Aggregated data saved to DS4NLP_results/GBS/imdb300AuxDS_LSA_agg.csv
Aggregated data saved to DS4NLP_results/GBS/imdb300AuxDS_Prediction_Entropy_agg.csv
Aggregated data saved to DS4NLP_results/GBS/imdb300AuxDS_Similarity_Score_agg.csv
Aggregated data saved to DS4NLP_results/GBS/imdbAuxDS_Confidence_Score_agg.csv
Aggregated data saved to DS4NLP_results/GBS/imdbAuxDS_DSA_agg.csv
Aggregated data saved to DS4NLP_results/GBS/imdbAuxDS_LSA_agg.csv
Aggregated data saved to DS4NLP_results/GBS/imdbAuxDS_Prediction_Entropy_agg.csv
Aggregated data saved to DS4NLP_results/GBS/imdbAuxDS_Similarity_Score_agg.csv
Aggregated data saved to DS4NLP_results/GBS/SSTIMDB3000AuxDS_Confidence_Score_agg.csv
Aggregated data saved to DS4NLP_results/GBS/SSTIMDB3000AuxDS_DSA_agg.csv
Aggregated data saved to DS4NLP_results/GBS/SSTIMDB3000AuxDS_LSA_agg.csv
Aggregated 