In [None]:
######################################### Remove any occurrence of "Others"  ############################
import os
import pandas as pd

def load_check_and_save_csv(input_folder, output_folder, column_name):
    """Copy every CSV in ``input_folder`` to ``output_folder`` minus the "other" rows.

    A row is dropped when its ``column_name`` value contains the substring
    "other" in any letter case, so "Others" matches but so does "Another".
    Rows whose value is missing are kept. Files that do not have
    ``column_name`` are reported on stdout and are not written.

    Args:
        input_folder: Directory whose ``*.csv`` files are read (not recursive).
        output_folder: Directory that receives the filtered files under the
            same file names; created if it does not exist.
        column_name: Name of the column that is tested for "other".

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        if not filename.endswith(".csv"):
            continue

        df = pd.read_csv(os.path.join(input_folder, filename))
        if column_name not in df.columns:
            print(f"Column '{column_name}' not found in {filename}")
            continue

        # Cast to str before using the .str accessor: pandas parses an entirely
        # empty (or purely numeric) column as a non-string dtype, and .str then
        # raises AttributeError. Missing values never match "other", so they
        # are kept, exactly as na=False kept them before.
        has_other = df[column_name].astype(str).str.contains('other', case=False, na=False)
        filtered_df = df[~has_other]

        # Save the filtered dataframe under the same file name
        output_file_path = os.path.join(output_folder, filename)
        filtered_df.to_csv(output_file_path, index=False)
        print(f"Processed and saved {filename}")

# Example usage: drop the "other" rows from every fold file
input_folder = 'Dataset/AdjuvareDB104_Standard/10_folds/'  # folder holding the raw fold CSVs
output_folder = 'Dataset/AdjuvareDB104_Standard/10_folds_preprocessed/'  # folder receiving the filtered CSVs
column_name = 'Adjuvant Name'
load_check_and_save_csv(
    input_folder=input_folder,
    output_folder=output_folder,
    column_name=column_name,
)

In [None]:
import os
import pandas as pd
import re

def check_special_characters(folder_path, nct_column):
    """Print every whitespace-separated word that contains an unexpected character.

    Each ``*.csv`` file directly inside ``folder_path`` is read and every cell
    is converted to text and split on whitespace. A word is reported when it
    contains any character other than ASCII letters, digits, whitespace or one
    of ``, . - ' + * / = ( ) | : _ ; "``. Findings are grouped per row under
    that row's ``nct_column`` value and, within a row, per column.

    Args:
        folder_path: Directory whose CSV files are inspected.
        nct_column: Column whose value labels each reported row; files without
            this column are reported and skipped.

    Returns:
        None. All findings are printed to stdout.
    """
    # Anything outside this whitelist counts as a special character
    disallowed = re.compile(r'[^a-zA-Z0-9\s,.\-\'+*/=()|:_;"]')

    for filename in os.listdir(folder_path):
        if not filename.endswith(".csv"):
            continue

        frame = pd.read_csv(os.path.join(folder_path, filename))
        if nct_column not in frame.columns:
            print(f"Column '{nct_column}' not found in {filename}")
            continue

        for _, record in frame.iterrows():
            # column name -> list of (word, offending characters) for this row
            hits_by_column = {}
            for col in frame.columns:
                for token in str(record[col]).split():
                    offending = disallowed.findall(token)
                    if offending:
                        hits_by_column.setdefault(col, []).append((token, offending))

            if not hits_by_column:
                continue

            # Report the row, labelled by its NCT identifier
            print(f"NCT Number: {record[nct_column]}")
            for col, hits in hits_by_column.items():
                print(f"  Column: {col}")
                for token, offending in hits:
                    print(f"    Word: {token}, Special Characters: {offending}")

# Example usage: list the special characters left in the preprocessed folds
folder_path = 'Dataset/AdjuvareDB104_Standard/10_folds_preprocessed/'  # folder holding the CSVs to inspect
nct_column = 'NCT Number'
check_special_characters(folder_path=folder_path, nct_column=nct_column)


In [None]:
########################################## Remove ® and ™   #####################################################

import os
import pandas as pd
import re

def clean_and_save_files(folder_path, nct_column, save_folder):
    """Strip the ® and ™ symbols from every text cell of each CSV in a folder.

    Every ``*.csv`` file directly inside ``folder_path`` is read, cleaned and
    written to ``save_folder`` under the same file name. Only string cells are
    modified: missing values stay missing and numeric/boolean columns keep
    their dtype, so empty cells are written back as empty fields rather than
    as the literal text "nan".

    Args:
        folder_path: Directory whose CSV files are cleaned.
        nct_column: Column that must be present for a file to be processed;
            files without it are reported on stdout and skipped.
        save_folder: Directory that receives the cleaned files; created if it
            does not exist. May be the same as ``folder_path``, in which case
            the files are overwritten in place.

    Returns:
        None. Results are written to disk and progress is printed.
    """

    def strip_marks(value):
        # Leave NaN and any other non-string cell exactly as it was read.
        if isinstance(value, str):
            return value.replace("®", "").replace("™", "")
        return value

    # Ensure save directory exists, create if not
    os.makedirs(save_folder, exist_ok=True)

    for filename in os.listdir(folder_path):
        if not filename.endswith(".csv"):
            continue

        df = pd.read_csv(os.path.join(folder_path, filename))
        if nct_column not in df.columns:
            print(f"Column '{nct_column}' not found in {filename}")
            continue

        for col in df.columns:
            # Numeric and boolean columns cannot contain the symbols; skipping
            # them also avoids writing strings into a numeric column.
            if pd.api.types.is_numeric_dtype(df[col]):
                continue
            df[col] = df[col].map(strip_marks)

        # Save the cleaned DataFrame to a new CSV file
        save_path = os.path.join(save_folder, filename)
        df.to_csv(save_path, index=False)
        print(f"Cleaned and saved {filename} to {save_path}")

# Example usage: clean the preprocessed folds in place
# (save_folder is the same directory as folder_path, so the files are overwritten)
folder_path = 'Dataset/AdjuvareDB104_Standard/10_folds_preprocessed/'  # folder holding the CSVs to clean
nct_column = 'NCT Number'
save_folder = 'Dataset/AdjuvareDB104_Standard/10_folds_preprocessed/'  # folder receiving the cleaned CSVs

clean_and_save_files(
    folder_path=folder_path,
    nct_column=nct_column,
    save_folder=save_folder,
)


In [None]:
# Run the special-character report again on the preprocessed folds
folder_path = 'Dataset/AdjuvareDB104_Standard/10_folds_preprocessed/'  # folder holding the CSVs to inspect
nct_column = 'NCT Number'
check_special_characters(folder_path=folder_path, nct_column=nct_column)

In [None]:
######################################## 10 folds with interventions merged columns ##################################

import os
import pandas as pd

# Define the input and output directories
# input_dir holds the fold files to merge; output_dir receives one merged file
# per input file, under the same file name.
input_dir = 'Dataset/AdjuvareDB104_Standard/10_folds_preprocessed_with_intervention/'
output_dir = 'Dataset/AdjuvareDB104_Standard/10_folds_preprocessed_with_intervention_merged_columns/'
# Create the output directory if it doesn't exist
# (exist_ok=True makes re-running this cell safe)
os.makedirs(output_dir, exist_ok=True)

def ensure_period(s):
    """Return ``s`` as text that ends with a period.

    Trailing whitespace is removed before the check so that "text. " does not
    become "text. .". Missing values (None / NaN / pd.NA), which pandas passes
    to ``.apply`` for empty cells, are treated as empty text instead of
    raising AttributeError. Other non-string values are converted with ``str``.

    Args:
        s: Cell value, normally a string.

    Returns:
        The text with a trailing "." appended when it did not already end with
        one; "." for empty or missing input.
    """
    if not isinstance(s, str):
        # Empty cells arrive as float NaN / None, which have no .endswith
        s = '' if pd.isna(s) else str(s)
    s = s.rstrip()
    return s if s.endswith('.') else s + '.'

# Build the merged "Study Summary" column for every fold file in the input directory
for filename in os.listdir(input_dir):
    if not filename.endswith('.txt'):
        continue

    # The fold files are tab-separated text
    input_filepath = os.path.join(input_dir, filename)
    df = pd.read_csv(input_filepath, sep="\t")

    # Each labelled section is period-terminated, then the sections are joined
    title_part = 'Study Title: ' + df['Study Title'].apply(ensure_period)
    summary_part = ' Brief Summary: ' + df['Brief Summary'].apply(ensure_period)
    interventions_part = ' Interventions: ' + df['Interventions'].apply(ensure_period)
    df['Study Summary'] = title_part + summary_part + interventions_part

    # Keep only the identifier and the merged text
    df_output = df[['NCT Number', 'Study Summary']]

    # Write the result under the same file name, still tab-separated
    output_filepath = os.path.join(output_dir, filename)
    df_output.to_csv(output_filepath, sep="\t", index=False)

print("Files processed and saved successfully.")


In [None]:
######################################## 10 folds without interventions merged columns ##################################

import os
import pandas as pd

# Define the input and output directories
# input_dir holds the fold files to merge; output_dir receives one merged file
# per input file, under the same file name.
input_dir = 'Dataset/AdjuvareDB104_Standard/10_folds_preprocessed_without_intervention/'
output_dir = 'Dataset/AdjuvareDB104_Standard/10_folds_preprocessed_without_intervention_merged_columns/'
# Create the output directory if it doesn't exist
# (exist_ok=True makes re-running this cell safe)
os.makedirs(output_dir, exist_ok=True)

def ensure_period(s):
    """Return ``s`` as text that ends with a period.

    Trailing whitespace is removed before the check so that "text. " does not
    become "text. .". Missing values (None / NaN / pd.NA), which pandas passes
    to ``.apply`` for empty cells, are treated as empty text instead of
    raising AttributeError. Other non-string values are converted with ``str``.

    Args:
        s: Cell value, normally a string.

    Returns:
        The text with a trailing "." appended when it did not already end with
        one; "." for empty or missing input.
    """
    if not isinstance(s, str):
        # Empty cells arrive as float NaN / None, which have no .endswith
        s = '' if pd.isna(s) else str(s)
    s = s.rstrip()
    return s if s.endswith('.') else s + '.'

# Build the merged "Study Summary" column for every fold file in the input directory
for filename in os.listdir(input_dir):
    if not filename.endswith('.txt'):
        continue

    # The fold files are tab-separated text
    input_filepath = os.path.join(input_dir, filename)
    df = pd.read_csv(input_filepath, sep="\t")

    # Each labelled section is period-terminated, then the sections are joined
    title_part = 'Study Title: ' + df['Study Title'].apply(ensure_period)
    summary_part = ' Brief Summary: ' + df['Brief Summary'].apply(ensure_period)
    df['Study Summary'] = title_part + summary_part

    # Keep only the identifier and the merged text
    df_output = df[['NCT Number', 'Study Summary']]

    # Write the result under the same file name, still tab-separated
    output_filepath = os.path.join(output_dir, filename)
    df_output.to_csv(output_filepath, sep="\t", index=False)

print("Files processed and saved successfully.")
