In [1]:
import pandas as pd

In [2]:
def _prompt_choice(prompt: str, break_line: str) -> int:
    """Ask for '1' or '2' until one of them is entered, then return it as an int."""
    while True:
        answer = input(prompt)
        if answer in ('1', '2'):
            return int(answer)
        print('Invalid input. Please enter either 1 or 2.' + break_line)


def clean_csv_file(path: str) -> "str | None":
    """
    Read a CSV file, clean it, and either return the keys or save them to a file.

    Cleaning steps:
        - Rows with a missing 'Entity ID' are dropped.
        - The 'Unnamed: 3' column is removed when it is present.
        - 'Entity ID' is rewritten as '<text before the first underscore>:<Auth Asym ID>',
          and the 'Auth Asym ID' column is then removed.

    Parameters:
        path (str): The path to the CSV file to be cleaned. The file must contain
            the columns 'Entity ID' and 'Auth Asym ID', plus 'Sequence' when the
            Fasta output is requested.

    Returns:
        str or None: If the user chooses to get the results as a variable (1),
        the cleaned keys are returned as one newline-separated string. If the
        user chooses to save the results into a file (2), the file is written
        to the current working directory and None is returned.

    Usage:
        1. The function interactively prompts the user to choose between getting
           the results as a variable or saving them into a file.
        2. When saving into a file, the user is asked for an output file name
           (without extension; an empty answer selects "output_seq") and then
           for the format.
        3. "Only keys" writes '<name>.txt' with one key per line.
        4. "As Fasta format" writes '<name>.fasta' with one
           '> <key>' header line followed by the sequence line per entry.

    Notes:
        - Every 1/2 prompt is repeated until a valid answer is entered.
    """
    break_line = '\n------------------------------\n'
    print('Reading the file...')
    df = pd.read_csv(path)
    print('Cleaning the CSV file...')
    df = df.dropna(subset=['Entity ID'])
    # The unnamed filler column is not always present in the input; dropping it
    # must not fail when it is absent.
    df = df.drop(columns=['Unnamed: 3'], errors='ignore')
    df['Entity ID'] = df['Entity ID'].str.split('_').str[0] + ':' + df['Auth Asym ID']
    df = df.drop(columns=['Auth Asym ID'])
    df = df.reset_index(drop=True)
    print('Done!', break_line)

    which_output = _prompt_choice(
        'Do you want to get the results as variable(1) or file(2)?\nOnly enter the corresponding number[1, 2]: ',
        break_line,
    )

    # Return the output as a variable
    if which_output == 1:
        return '\n'.join(df['Entity ID'].values)

    # Save the output into a file
    print(break_line)
    output_file_name = input('Enter your output file name without extension (Press Enter for default "output_seq"): ')
    output_file_name = output_file_name.strip() or "output_seq"

    with_fasta = _prompt_choice('[1] Only keys \\ [2] As Fasta format: ', break_line)

    if with_fasta == 2:
        # Save as .fasta: one header line and one sequence line per entry
        saved_to = output_file_name + '.fasta'
        with open(saved_to, 'w') as file:
            file.writelines(
                f"> {entity_id}\n{sequence}\n"
                for entity_id, sequence in zip(df['Entity ID'], df['Sequence'])
            )
    else:
        # Save only the keys, one per line
        saved_to = output_file_name + '.txt'
        with open(saved_to, 'w') as file:
            file.write('\n'.join(df['Entity ID'].values))

    # Report the name of the file that was actually written.
    print(break_line, 'Data saved to', saved_to)
    return None


In [None]:
# Path of the CSV file to clean.
# NOTE(review): the file name suggests an RCSB PDB custom report export — confirm.
path = '/content/rcsb_pdb_custom_report_20240411062134.csv'

In [None]:
# clean_csv_file requires the CSV path as its only argument; calling it with
# no arguments raises TypeError.
clean_csv_file(path)