In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#import mplcursors
from matplotlib.lines import Line2D
import os
import plotly.graph_objects as go
from plotly.subplots import make_subplots


In [10]:

# Path to the folder containing the files
folder_path = 'NormalvsTumor_HILIC_ALL/'

# List all files in the folder
file_names = os.listdir(folder_path)

# Create a dictionary to store DataFrames with variable names
data_frames = {}

# Loop through the files and read them into pandas DataFrames
for file_name in file_names:
    # Assuming your files are CSV, modify the extension accordingly if needed
    if file_name.endswith('.csv'):
        # Remove the extension to create a variable name
        variable_name = os.path.splitext(file_name)[0]
        # Remove specified prefixes and suffixes from the variable name
        variable_name = variable_name.replace('CRC_HILIC_', '').replace('_Ttest', '')
        
        print(variable_name)

        # Read the file into a DataFrame and store it in the dictionary
        file_path = os.path.join(folder_path, file_name)
        try:
            data_frames[variable_name] = pd.read_csv(file_path, encoding='utf-8')
        except UnicodeDecodeError:
            # Try a different encoding if 'utf-8' fails
            try:
                data_frames[variable_name] = pd.read_csv(file_path, encoding='latin1')
            except UnicodeDecodeError:
                print(f"Error reading file '{file_name}': Unable to decode using 'utf-8' or 'latin1'.")



asceding
asceding_output


  data_frames[variable_name] = pd.read_csv(file_path, encoding='latin1')


cecum
cecum_output


  data_frames[variable_name] = pd.read_csv(file_path, encoding='latin1')


descending
descending_output


  data_frames[variable_name] = pd.read_csv(file_path, encoding='latin1')


Rectosigmoid
Rectosigmoid_output


  data_frames[variable_name] = pd.read_csv(file_path, encoding='latin1')


rectum
rectum_output


  data_frames[variable_name] = pd.read_csv(file_path, encoding='latin1')


sigmoid
sigmoid_output


  data_frames[variable_name] = pd.read_csv(file_path, encoding='latin1')


transvrse
transvrse_output


  data_frames[variable_name] = pd.read_csv(file_path, encoding='latin1')


In [11]:
# Create a dictionary to store the results
results_dict = {}
selected_mz_value = 694.31494
# Iterate through the DataFrames
for variable_name, df in data_frames.items():
    # Check if the variable name contains 'output'
    if 'output' in variable_name:
        # Assuming 'mz' is a column in the DataFrame
        selected_row_output = df[df['mz'] == selected_mz_value]

        if not selected_row_output.empty:
            raw_pval = selected_row_output['raw_pval'].values[0]
            q_fdr = selected_row_output['q_fdr'].values[0]
            log_fc_matched = selected_row_output['log_fc_matched'].values[0]
            log_fc_matched = float(log_fc_matched)
            q_fdr = float(q_fdr)

            # Check q_fdr and assign stars accordingly
            if q_fdr < 0.05 and q_fdr > 0.01:
                q_fdr_stars = '*'
            elif q_fdr < 0.01 and q_fdr > 0.001:
                q_fdr_stars = '**'
            elif q_fdr < 0.001:
                q_fdr_stars = '***'
            # Save the results in the dictionary
            results_dict[variable_name] = {
                'raw_pval': raw_pval,
                'q_fdr': q_fdr,
                'log_fc_matched': log_fc_matched,
                'q_fdr_stars': q_fdr_stars
            }
        else:
            print(f"No data found for the selected 'mz' value in DataFrame '{variable_name}'.")
            
for variable_name, results in results_dict.items():
    print(f"\nResults for DataFrame '{variable_name}':")
    print(f"Raw P-value: {results['raw_pval']}")
    print(f"Q FDR: {results['q_fdr']}")
    print(f"Log FC Matched: {results['log_fc_matched']}")
    print(f"Q FDR Stars: {results['q_fdr_stars']}")



Results for DataFrame 'asceding_output':
Raw P-value: 4.81e-05
Q FDR: 0.000822962
Log FC Matched: 0.878655926
Q FDR Stars: ***

Results for DataFrame 'cecum_output':
Raw P-value: 0.000499091
Q FDR: 0.00373155
Log FC Matched: 0.760405543
Q FDR Stars: **

Results for DataFrame 'descending_output':
Raw P-value: 0.099279508
Q FDR: 0.241825205
Log FC Matched: 0.412915339
Q FDR Stars: **

Results for DataFrame 'Rectosigmoid_output':
Raw P-value: 7.23e-06
Q FDR: 0.000134968
Log FC Matched: 0.80442154
Q FDR Stars: ***

Results for DataFrame 'rectum_output':
Raw P-value: 2.99e-09
Q FDR: 3.97e-08
Log FC Matched: 0.77132652
Q FDR Stars: ***

Results for DataFrame 'sigmoid_output':
Raw P-value: 2.91e-05
Q FDR: 0.000199641
Log FC Matched: 0.585796587
Q FDR Stars: ***

Results for DataFrame 'transvrse_output':
Raw P-value: 0.019448095
Q FDR: 0.091903558
Log FC Matched: 0.811725981
Q FDR Stars: ***


In [13]:


for variable_name, df in data_frames.items():
    # Check if the variable name contains 'output'
    # Extract results from the dictionary
    if variable_name in results_dict:
        q_fdr_stars = results_dict[variable_name]['q_fdr_stars']
        # Ensure that log_fc_matched is a float or set it to 'N/A' otherwise
        try:
            log_fc_matched = float(results_dict[variable_name]['log_fc_matched'])
        except ValueError:
            log_fc_matched = "N/A"

    # Plot diagrams only for DataFrames without 'output' in their names
    if 'output' not in variable_name:
        # Use Plotly to create the boxplot and scatter plot
        desired_row_index = df.index[df['mz'] == selected_mz_value].tolist()

        if len(desired_row_index) == 1:
            desired_row_index = desired_row_index[0] + 1

            row_data = df.iloc[desired_row_index - 1]
            case_columns = row_data.filter(like='_Case').tolist()
            control_columns = row_data.filter(like='_Control').tolist()

            # Create a subplot with 3D scatter plot
            fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'scatter3d'}, {'type': 'box'}]])

            # Add scatter plot
            fig.add_trace(go.Scatter3d(x=[1] * len(case_columns), y=case_columns, mode='markers', marker=dict(color='red'), name='Tumor'))
            fig.add_trace(go.Scatter3d(x=[2] * len(control_columns), y=control_columns, mode='markers', marker=dict(color='green'), name='Normal'))

            # Add box plot
            fig.add_trace(go.Box(y=case_columns, name='Tumor', boxpoints='all', jitter=0.3, pointpos=-1.8))
            fig.add_trace(go.Box(y=control_columns, name='Normal', boxpoints='all', jitter=0.3, pointpos=-0.2))

            # Update layout
            fig.update_layout(scene=dict(xaxis_title=variable_name, yaxis_title='Relative Abundance', zaxis_title=f'm/z={selected_mz_value:.4f}'),
                              boxmode='group', title_text=f'{variable_name}')

            # Save as HTML
            fig.write_html(f'plotly_{variable_name}.html')

            print(f"Plot done for DataFrame '{variable_name}'")

        else:
            print(f"No unique row found for the selected 'mz' value in DataFrame '{variable_name}'.")


Plot done for DataFrame 'asceding'
Plot done for DataFrame 'cecum'
Plot done for DataFrame 'descending'
Plot done for DataFrame 'Rectosigmoid'
Plot done for DataFrame 'rectum'
Plot done for DataFrame 'sigmoid'
Plot done for DataFrame 'transvrse'
