In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# NOTE(review): these are absolute, machine-specific paths; anyone running the
# notebook elsewhere has to edit them before the cells below will work.
input_dir = r'C:\Users\hp\Desktop\bmsis-ysp\Elemental Composition assignments (.csv)'
output_base_dir = r'C:\Users\hp\Desktop\bmsis-ysp\Task6_HC,OC,NC,ON_vs _mz\plots'

# Names of the CSV files found directly inside the input directory.
# - The extension test is case-insensitive so files saved as '.CSV' are not
#   silently skipped.
# - os.listdir() returns entries in arbitrary order, so the names are sorted to
#   make the processing order (and the printed log) the same on every run.
csv_files = sorted(
    name for name in os.listdir(input_dir) if name.lower().endswith('.csv'))


In [1]:
import pandas as pd


def process_data(csv_file_path):
    """Read a ragged, comma-separated file into a DataFrame of raw strings.

    The first two lines of the file are skipped. Every remaining line is
    stripped and split on commas. The first 11 fields are stored under the
    fixed column names below; any further fields are stored in columns named
    'Extra Col 1', 'Extra Col 2', ... (as many as the widest row needs).

    Rows that are narrower than the final column list are padded with None,
    so a file whose rows are all shorter than the fixed columns no longer
    makes the DataFrame constructor fail, and a file with fewer than two
    lines yields an empty frame instead of raising StopIteration.

    Parameters
    ----------
    csv_file_path : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        One row per line after the two skipped lines. Cell values are the
        unparsed strings from the file (or None where a row was padded).
    """
    fixed_columns = ["Average Noise", "Exp. m/z", "Recal m/z", "Theor. Mass", "Error",
                     "Rel. Abundance", "Signal2Noise", "DBE", "H/C", "O/C", "Molecular Formula"]
    n_fixed = len(fixed_columns)

    rows = []
    max_extra_columns = 0

    with open(csv_file_path, 'r') as file:
        # Skip the two leading lines; the default of None keeps this from
        # raising StopIteration when the file is shorter than that.
        for _ in range(2):
            next(file, None)

        for line in file:
            fields = line.strip().split(',')
            # Negative differences (short rows) never exceed the running max of 0.
            max_extra_columns = max(max_extra_columns, len(fields) - n_fixed)
            rows.append(fields)

    extra_columns = [f'Extra Col {i+1}' for i in range(max_extra_columns)]
    all_columns = fixed_columns + extra_columns

    # Pad every row to the full width so the column count always matches,
    # even when no row reaches the number of fixed columns.
    width = len(all_columns)
    padded_rows = [fields + [None] * (width - len(fields)) for fields in rows]

    return pd.DataFrame(padded_rows, columns=all_columns)


def preprocess_data(csv_file_path):
    """Load a file with process_data() and tidy it for ratio calculations and plotting.

    Steps:
    1. Join "Molecular Formula" and every column to its right into a single
       space-separated "Molecular Formula" string (missing cells are skipped).
    2. Drop the now-redundant columns to the right of "Molecular Formula".
    3. Drop rows that still contain a missing value.
    4. Convert every column except "Molecular Formula" from text to numbers.

    Step 4 is needed because process_data() builds its rows with str.split, so
    every cell is a string; plotting those strings makes matplotlib treat them
    as category labels rather than positions on a numeric axis.

    Parameters
    ----------
    csv_file_path : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. Values that cannot be parsed as numbers become NaN
        (they are converted after the row filtering, so they do not remove rows).
    """
    formula_col = "Molecular Formula"

    df = process_data(csv_file_path)
    start_idx = df.columns.get_loc(formula_col)
    cols_to_merge = df.columns[start_idx:]
    df[formula_col] = df[cols_to_merge].apply(
        lambda x: ' '.join(x.dropna().astype(str)), axis=1)
    df = df.drop(columns=cols_to_merge.difference([formula_col]))
    # .copy() gives an independent frame so the column assignments below (and
    # in callers) do not operate on a slice of the unfiltered data.
    df = df.dropna().copy()

    # Assumes every non-formula column holds numeric text -- TODO confirm
    # against the instrument export; unparseable cells are coerced to NaN.
    for col in df.columns.drop(formula_col):
        df[col] = pd.to_numeric(df[col], errors='coerce')

    return df


def extract_element_count(formula, element):
    """Return the count that follows `element` in a whitespace-separated formula.

    The formula is read as alternating tokens ("symbol count symbol count ...").
    Only the tokens at even positions are compared with `element`; the token
    right after the first match is converted with int() and returned.
    Returns 0 when no even-position token equals `element`.
    """
    tokens = formula.split()
    position = 0
    while position < len(tokens):
        if tokens[position] == element:
            # First match wins; the count is the very next token.
            return int(tokens[position + 1])
        position += 2
    return 0


def further_ratios(df):
    """Add 'N/C' and 'O/N' columns computed from the 'Molecular Formula' column.

    Element counts come from extract_element_count(). When the denominator
    element's count is 0 the ratio is stored as 0. The columns are added to
    `df` itself, and the same object is returned.
    """
    def ratio_of(formula, top, bottom):
        # A zero denominator is reported as 0 instead of dividing.
        divisor = extract_element_count(formula, bottom)
        if divisor == 0:
            return 0
        return extract_element_count(formula, top) / divisor

    formulas = df['Molecular Formula']
    df['N/C'] = formulas.apply(lambda formula: ratio_of(formula, 'N', 'C'))
    df['O/N'] = formulas.apply(lambda formula: ratio_of(formula, 'O', 'N'))

    return df


In [2]:
def scatter_plots(df, output_dir):
    """Save a 2x2 grid of ratio-vs-"Recal m/z" scatter plots as 'scatter_plot.png'.

    Panels, in row-major order: H/C (blue), O/C (green), N/C (red) and
    O/N (purple). `df` must provide those four columns plus "Recal m/z".
    The image is written into `output_dir` and the figure is closed afterwards.
    """
    label_style = dict(fontweight='bold', fontsize=14)
    title_style = dict(fontweight='bold', fontsize=16)
    num_ticks = 6

    # (ratio column, marker colour) for each panel, in axs.flat order
    panels = (
        ("H/C", 'blue'),
        ("O/C", 'green'),
        ("N/C", 'red'),
        ("O/N", 'purple'),
    )

    fig, axs = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('Elemental Ratios vs. m/z', **title_style)

    for ax, (ratio, colour) in zip(axs.flat, panels):
        ax.scatter(df["Recal m/z"], df[ratio], color=colour)
        ax.set_xlabel("Recal m/z", **label_style)
        ax.set_ylabel(ratio, **label_style)
        ax.set_title(f"{ratio} vs m/z", **title_style)
        # Cap both axes at num_ticks major ticks
        ax.xaxis.set_major_locator(plt.MaxNLocator(num_ticks))
        ax.yaxis.set_major_locator(plt.MaxNLocator(num_ticks))

    fig.subplots_adjust(hspace=0.8)
    # The rect leaves room at the top for the suptitle
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])

    # Write the figure to disk, then release it
    target = os.path.join(output_dir, 'scatter_plot.png')
    fig.savefig(target)
    plt.close(fig)

In [3]:
def _save_ratio_heatmaps(df, output_path, gridsize, figsize):
    """Draw the 2x2 grid of ratio-vs-"Recal m/z" hexbin panels and save it.

    Shared implementation for heatmaps() and sampled_heatmaps(), which differ
    only in the rows they plot, the hexbin grid size, the figure size and the
    output file name.
    """
    # (ratio column, colormap) for each panel, in axs.flat (row-major) order
    panels = (
        ("H/C", 'Blues'),
        ("O/C", 'Greens'),
        ("N/C", 'Reds'),
        ("O/N", 'Purples'),
    )
    mz_values = df["Recal m/z"]

    fig, axs = plt.subplots(2, 2, figsize=figsize, sharex=True)

    for ax, (ratio, cmap) in zip(axs.flat, panels):
        hexes = ax.hexbin(mz_values, df[ratio], gridsize=gridsize,
                          cmap=cmap, edgecolors='none')
        ax.set_title(f'{ratio} vs. m/z Heatmap', fontsize=12, fontweight='bold')
        ax.set_xlabel('Recal m/z', fontsize=10, fontweight='bold')
        ax.set_ylabel(f'{ratio} Ratio', fontsize=10, fontweight='bold')
        colorbar = fig.colorbar(hexes, ax=ax)
        colorbar.set_label('Density')

    fig.tight_layout()

    # Save the figure, then close it so figures do not pile up across files
    fig.savefig(output_path)
    plt.close(fig)


def heatmaps(df, output_dir):
    """Save hexbin heatmaps of H/C, O/C, N/C and O/N against "Recal m/z".

    Uses every row of `df`, a hexbin grid size of 50 and a 20x10 figure, and
    writes 'heatmap.png' into `output_dir`.
    """
    _save_ratio_heatmaps(
        df, os.path.join(output_dir, 'heatmap.png'), gridsize=50, figsize=(20, 10))


def sampled_heatmaps(df, output_dir, frac=0.1, random_state=0):
    """Save the same heatmaps as heatmaps(), drawn from a random subset of rows.

    Uses a hexbin grid size of 20 and a 20x12 figure, and writes
    'sampled_heatmap.png' into `output_dir`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide "Recal m/z", "H/C", "O/C", "N/C" and "O/N".
    output_dir : str
        Directory the image is written to.
    frac : float, optional
        Fraction of rows to sample; defaults to 0.1 (10% of the data).
    random_state : int or None, optional
        Seed passed to DataFrame.sample. The fixed default makes the figure
        identical on every run; pass None for a different sample each call.
    """
    subset_df = df.sample(frac=frac, random_state=random_state)
    _save_ratio_heatmaps(
        subset_df, os.path.join(output_dir, 'sampled_heatmap.png'),
        gridsize=20, figsize=(20, 12))

In [None]:
for csv_file in csv_files:
    # Every input file gets its own plot folder, named after the file
    # without its extension.
    output_dir = os.path.join(output_base_dir, os.path.splitext(csv_file)[0])
    os.makedirs(output_dir, exist_ok=True)

    # Load and clean the file, then add the N/C and O/N ratio columns
    csv_file_path = os.path.join(input_dir, csv_file)
    df = further_ratios(preprocess_data(csv_file_path))

    # Render each plot type into this file's folder
    for make_plot in (scatter_plots, heatmaps, sampled_heatmaps):
        make_plot(df, output_dir)

    # Progress message for the file just finished
    print(f"{csv_file} Plots Stored.")