In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#import mplcursors
from matplotlib.lines import Line2D
import os

%matplotlib

Using matplotlib backend: <object object at 0x00000203b061c728>


In [5]:

# Path to the folder containing the files
folder_path = 'NormalvsTumor_HILIC_ALL/'

# List all files in the folder
file_names = os.listdir(folder_path)

# Create a dictionary to store DataFrames with variable names
data_frames = {}

# Loop through the files and read them into pandas DataFrames
for file_name in file_names:
    # Assuming your files are CSV, modify the extension accordingly if needed
    if file_name.endswith('.csv'):
        # Remove the extension to create a variable name
        variable_name = os.path.splitext(file_name)[0]
        # Remove specified prefixes and suffixes from the variable name
        variable_name = variable_name.replace('CRC_HILIC_', '').replace('_Ttest', '')
        
        print(variable_name)

        # Read the file into a DataFrame and store it in the dictionary
        file_path = os.path.join(folder_path, file_name)
        try:
            data_frames[variable_name] = pd.read_csv(file_path, encoding='utf-8')
        except UnicodeDecodeError:
            # Try a different encoding if 'utf-8' fails
            try:
                data_frames[variable_name] = pd.read_csv(file_path, encoding='latin1')
            except UnicodeDecodeError:
                print(f"Error reading file '{file_name}': Unable to decode using 'utf-8' or 'latin1'.")



asceding


ParserError: Error tokenizing data. C error: Expected 93 fields in line 10131, saw 95


In [None]:
# Create a dictionary to store the results
results_dict = {}
selected_mz_value = 694.31494
# Iterate through the DataFrames
for variable_name, df in data_frames.items():
    # Check if the variable name contains 'output'
    if 'output' in variable_name:
        # Assuming 'mz' is a column in the DataFrame
        selected_row_output = df[df['mz'] == selected_mz_value]

        if not selected_row_output.empty:
            raw_pval = selected_row_output['raw_pval'].values[0]
            q_fdr = selected_row_output['q_fdr'].values[0]
            log_fc_matched = selected_row_output['log_fc_matched'].values[0]
            log_fc_matched = float(log_fc_matched)
            q_fdr = float(q_fdr)

            # Check q_fdr and assign stars accordingly
            if q_fdr < 0.05 and q_fdr > 0.01:
                q_fdr_stars = '*'
            elif q_fdr < 0.01 and q_fdr > 0.001:
                q_fdr_stars = '**'
            elif q_fdr < 0.001:
                q_fdr_stars = '***'
            # Save the results in the dictionary
            results_dict[variable_name] = {
                'raw_pval': raw_pval,
                'q_fdr': q_fdr,
                'log_fc_matched': log_fc_matched,
                'q_fdr_stars': q_fdr_stars
            }
        else:
            print(f"No data found for the selected 'mz' value in DataFrame '{variable_name}'.")
            
for variable_name, results in results_dict.items():
    print(f"\nResults for DataFrame '{variable_name}':")
    print(f"Raw P-value: {results['raw_pval']}")
    print(f"Q FDR: {results['q_fdr']}")
    print(f"Log FC Matched: {results['log_fc_matched']}")
    print(f"Q FDR Stars: {results['q_fdr_stars']}")


In [None]:
for variable_name, df in data_frames.items():
    # Check if the variable name contains 'output'
    # Extract results from the dictionary
    if variable_name in results_dict:
        q_fdr_stars = results_dict[variable_name]['q_fdr_stars']
        # Ensure that log_fc_matched is a float or set it to 'N/A' otherwise
        try:
            log_fc_matched = float(results_dict[variable_name]['log_fc_matched'])
        except ValueError:
            log_fc_matched = "N/A"


    # Plot diagrams only for DataFrames without 'output' in their names
    if 'output' not in variable_name:
        # Use the existing code to create the boxplot and swarmplot

        desired_row_index = df.index[df['mz'] == selected_mz_value].tolist()

        if len(desired_row_index) == 1:
            desired_row_index = desired_row_index[0] + 1

            row_data = df.iloc[desired_row_index - 1]
            case_columns = row_data.filter(like='_Case').tolist()
            control_columns = row_data.filter(like='_Control').tolist()

            fig, ax = plt.subplots(figsize=(3, 4))

            ax.scatter([1] * len(case_columns), case_columns, color="red", label="Tumor", s=3)
            ax.scatter([2] * len(control_columns), control_columns, color="green", label="Normal", s=3)

            ax.boxplot([case_columns, control_columns], labels=['Tumor', 'Normal'], patch_artist=True,
                       boxprops=dict(facecolor='white', alpha=0.5, color='black', linewidth=1),
                       medianprops=dict(color='black'), showfliers=False)

            ax.set_xticklabels(['Tumor', 'Normal'], rotation=90)
            ax.set_xlabel(variable_name, fontsize=12, fontweight='bold')
            ax.set_ylabel('Relative Abundance')
            ax.set_title(f'm/z={selected_mz_value:.4f}', fontsize=12, fontweight='bold')

            legend_elements = [
                Line2D([0], [0], marker='o', color='w', markerfacecolor='red', markersize=10, label='Case'),
                Line2D([0], [0], marker='o', color='w', markerfacecolor='green', markersize=10, label='Control'),
            ]

            ax.text(1.01, 0.94, f'q:{q_fdr_stars}\nLogFC:{log_fc_matched:.2f}', verticalalignment='center', horizontalalignment='left',
                    transform=ax.transAxes, color='black', fontsize=8)

            plt.tight_layout()
             # Add mplcursors annotation
            mplcursors.cursor(hover=True)
            plt.savefig(f'boxplot_swarmplot_{variable_name}.png', dpi=500, bbox_inches='tight')
            plt.show()
            print(f"Plot done for DataFrame '{variable_name}'")

        else:
            print(f"No unique row found for the selected 'mz' value in DataFrame '{variable_name}'.")