In [1]:
! pip install numpy
! pip install scipy
! pip install pandas
! pip install matplotlib
! pip install scikit-learn
! pip install openpyxl



In [2]:
import numpy as np
import pandas as pd
from scipy.signal import savgol_filter
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
import os
from openpyxl import load_workbook

In [3]:
# Mount Google Drive at /content/drive so that files stored there can be
# reached through the local file system.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
os.chdir('/content/drive/My Drive/jj/projects/2025/sidraAjmal/data')
! ls

example_with_positive_cases_edited.xlsx   fluorescence_results_20250321_040344
fluorescence_results_20250320_211046	  fluorescence_results_20250321_040344.txt
fluorescence_results_20250320_211046.txt  low_TXA_conc_modified.xlsx
fluorescence_results_20250320_231331	  low_TXA_conc.xlsx
fluorescence_results_20250320_231331.txt


In [None]:
def _peak_prominence(data, idx):
    """Return the prominence of the sample at ``idx``.

    Walks left and right from ``idx`` until a strictly higher sample (or the
    end of the data) is reached, remembering the lowest value seen on each
    side. The prominence is the peak value minus the higher of those two
    minima.
    """
    peak_value = data[idx]

    left_min = peak_value
    for j in range(idx - 1, -1, -1):
        if data[j] > peak_value:
            break
        left_min = min(left_min, data[j])

    right_min = peak_value
    for j in range(idx + 1, len(data)):
        if data[j] > peak_value:
            break
        right_min = min(right_min, data[j])

    return peak_value - max(left_min, right_min)


def detect_peaks_comprehensive(data, window_size=5, min_prominence=0.08, smoothed=None):
    """
    Comprehensive peak detection algorithm for fluorescence data.

    Two detectors are run on the raw data and their results merged:
    ``scipy.signal.find_peaks`` (height, distance and prominence thresholds)
    and a sliding-window local-maximum search with a hand-computed
    prominence. Merged candidates that lie within ``distance`` samples of an
    already accepted peak are dropped.

    Parameters:
    -----------
    data : list or array
        The fluorescence intensity data
    window_size : int
        Size of the window to check for local maxima. It is also used as the
        Savitzky-Golay window when ``smoothed`` is not supplied; in that case
        it is shrunk (and made odd) when the data is shorter than twice the
        window.
    min_prominence : float
        Minimum relative prominence required (as a fraction of data range)
    smoothed : array, optional
        Pre-computed smoothed data (if available)

    Returns:
    --------
    peaks : list of int
        Sorted indices of detected peaks
    smoothed : array
        Smoothed data (as provided, or calculated here). For inputs too short
        to filter this is simply a float copy of ``data``.
    peak_info : list of dict
        One dict per peak with keys ``index``, ``value``, ``prominence``,
        ``rel_prominence`` and ``height_above_smoothed``

    Notes:
    ------
    Empty, too-short and constant inputs return no peaks instead of raising
    or emitting divide-by-zero warnings.
    """
    import numpy as np
    from scipy.signal import savgol_filter, find_peaks

    n_points = len(data)

    # Nothing to analyse: np.max / savgol_filter would raise on empty input.
    if n_points == 0:
        if smoothed is None:
            smoothed = np.array([], dtype=float)
        return [], smoothed, []

    # Apply smoothing if not already provided
    if smoothed is None:
        # Shrink the window for short traces and keep it odd
        if n_points < window_size * 2:
            window_size = max(3, n_points // 2)
            if window_size % 2 == 0:
                window_size -= 1

        if n_points < window_size:
            # Fewer samples than the smallest window: the filter cannot run,
            # so fall back to the unsmoothed values.
            smoothed = np.array(data, dtype=float)
        else:
            poly_order = min(2, window_size - 1)
            smoothed = savgol_filter(data, window_length=window_size, polyorder=poly_order)

    data_range = np.max(data) - np.min(data)

    # A constant trace has no peaks; returning here also avoids dividing by a
    # zero range when computing relative prominence below.
    if data_range == 0:
        return [], smoothed, []

    # Method 1: scipy's find_peaks (works well for clear peaks)
    height_threshold = np.min(data) + (data_range * 0.05)  # 5% above minimum
    distance = max(2, n_points // 20)  # At least 2, or 1/20th of the data length
    peaks_scipy, _ = find_peaks(
        data,
        height=height_threshold,
        distance=distance,
        prominence=(data_range * min_prominence)
    )

    # Method 2: sliding-window local maxima filtered by relative prominence
    half_window = window_size // 2
    peaks_from_local = []
    for i in range(half_window, n_points - half_window):
        window = data[i - half_window:i + half_window + 1]
        if data[i] == max(window):
            if _peak_prominence(data, i) / data_range >= min_prominence:
                peaks_from_local.append(i)

    # Merge both methods; plain ints keep the result free of numpy scalars.
    candidates = sorted({int(p) for p in peaks_scipy} | set(peaks_from_local))

    # Drop candidates that sit within `distance` samples of the previously
    # accepted peak. NOTE: the earlier peak wins, regardless of height.
    all_peaks = []
    for idx in candidates:
        if not all_peaks or idx - all_peaks[-1] > distance:
            all_peaks.append(idx)

    # Generate detailed info for each final peak
    peak_info = []
    for peak_idx in all_peaks:
        prominence = _peak_prominence(data, peak_idx)
        peak_info.append({
            'index': peak_idx,
            'value': data[peak_idx],
            'prominence': prominence,
            'rel_prominence': prominence / data_range,
            'height_above_smoothed': data[peak_idx] - smoothed[peak_idx]
        })

    return all_peaks, smoothed, peak_info


def _read_numeric_cells(sheet, row, first_col, last_col):
    """Return the int/float cell values of ``row`` from ``first_col`` to ``last_col`` inclusive."""
    values = []
    for col in range(first_col, last_col + 1):
        value = sheet.cell(row=row, column=col).value
        if isinstance(value, (int, float)):
            values.append(value)
    return values


def _log_channel_peaks(row, peaks, peak_info_list):
    """Print the peak count for ``row`` followed by one line per detected peak."""
    print(f"Row {row}: Found {len(peaks)} peaks")
    for i, peak_info in enumerate(peak_info_list):
        print(f"  Peak {i+1}: Position {peak_info['index']+1}, "
              f"Value {peak_info['value']:.2f}, "
              f"Prominence {peak_info['prominence']:.2f} "
              f"({peak_info['rel_prominence']*100:.2f}%)")


def _draw_channel(ax, data, smoothed, peaks, line_style, marker_color, data_label, title):
    """Plot one channel on ``ax``: raw trace, smoothed trend and annotated peaks."""
    ax.plot(data, line_style, label=data_label)
    ax.plot(smoothed, 'r--', alpha=0.7, label='Smoothed Trend')
    if len(peaks) > 0:
        ax.scatter(list(peaks), [data[i] for i in peaks],
                   color=marker_color, marker='o', s=100, label='Detected Peaks')
        # Label each peak with its ordinal and 1-based position
        for i, peak_idx in enumerate(peaks):
            ax.annotate(f"P{i+1}: {peak_idx+1}",
                        xy=(peak_idx, data[peak_idx]),
                        xytext=(0, 10),
                        textcoords='offset points',
                        fontsize=9)
    ax.set_title(title)
    ax.legend()
    ax.grid(True)


def process_excel_file_comprehensive(file_path, output_dir=None, debug_mode=True,
                                  window_size=5, min_prominence=0.08):
    """
    Process Excel file with fluorescence data using comprehensive peak detection.

    Every sheet whose cell B1 equals "Site" is scanned for three consecutive
    rows labelled Cy5, Cy5, Cy3 in column B. For each such group the second
    Cy5 row and the Cy3 row are read from column E onwards (numeric cells
    only) and passed to ``detect_peaks_comprehensive``. A group is a positive
    match when Cy5 has at least one peak and Cy3 has none.

    While processing, everything printed goes to ``<output_dir>.txt``; stdout
    is restored (and the log closed) even if processing raises. Plots are
    written into ``output_dir`` and, when there is at least one positive
    match, so is ``peak_detection_summary.csv``.

    Parameters:
    -----------
    file_path : str
        Path to the Excel file
    output_dir : str
        Directory to save results, default is fluorescence_results_YYYYMMDD_HHMMSS
    debug_mode : bool
        If True, save plots for all analyzed pairs (non-matches get a
        ``DEBUG_`` file name prefix) and log details for every pair
    window_size : int
        Size of the window to check for local maxima (should be odd)
    min_prominence : float
        Minimum relative prominence required (as a fraction of data range)

    Returns:
    --------
    results_summary : list of dict
        One entry per positive match (sheet, rows, peak positions, heights,
        prominences, relative heights and plot path)
    """
    import contextlib
    import datetime
    import os

    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    from openpyxl import load_workbook

    # Create a timestamp for the output directory if not provided
    if output_dir is None:
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        output_dir = f"fluorescence_results_{timestamp}"

    os.makedirs(output_dir, exist_ok=True)

    # The log file carries the same name as the output directory
    log_file = f"{output_dir}.txt"
    results_summary = []

    # redirect_stdout + `with` guarantee that stdout is restored and the log
    # file is closed even when an exception escapes the processing below.
    with open(log_file, 'w') as log_handle, contextlib.redirect_stdout(log_handle):
        print(f"Loading workbook: {file_path}")
        wb = load_workbook(file_path, data_only=True)

        print(f"COMPREHENSIVE PEAK DETECTION: Using window_size={window_size}, min_prominence={min_prominence}")
        print(f"Available sheets: {wb.sheetnames}")

        for sheet_name in wb.sheetnames:
            print(f"\nProcessing sheet: {sheet_name}")
            sheet = wb[sheet_name]

            max_row = sheet.max_row
            max_col = sheet.max_column
            print(f"Sheet has {max_row} rows and {max_col} columns")

            # A data sheet is recognised by "Site" in cell B1
            if sheet['B1'].value != 'Site':
                print(f"Skipping {sheet_name} - doesn't match expected format")
                continue

            # Shift the headers that start at C1 two cells to the right.
            # Iterating right-to-left avoids overwriting a header before it
            # has been copied. The workbook is never saved in this function,
            # so this only changes the in-memory copy.
            max_col = sheet.max_column
            for col in range(max_col, 4, -1):
                sheet.cell(row=1, column=col).value = sheet.cell(row=1, column=col - 2).value
            sheet.cell(row=1, column=3).value = None
            sheet.cell(row=1, column=4).value = None

            processed_groups = 0
            valid_groups = 0

            row = 2  # Row 1 holds the headers
            max_row = sheet.max_row

            while row <= max_row - 2:  # Need three rows for a full group
                labels = tuple(sheet.cell(row=row + k, column=2).value for k in range(3))
                if labels != ("Cy5", "Cy5", "Cy3"):
                    # Not the start of a group: slide down one row and retry
                    row += 1
                    continue

                processed_groups += 1

                # Use the second Cy5 row of the group
                cy5_row = row + 1
                cy3_row = row + 2

                # Step past this group now so every `continue` below is safe
                row += 3

                # Data starts in column E (A-D are skipped)
                cy5_data = _read_numeric_cells(sheet, cy5_row, 5, max_col)
                cy3_data = _read_numeric_cells(sheet, cy3_row, 5, max_col)

                # Make sure we have data
                if not cy5_data or not cy3_data:
                    continue

                cy5_peaks, cy5_smoothed, cy5_peak_info = detect_peaks_comprehensive(
                    cy5_data, window_size=window_size, min_prominence=min_prominence
                )
                cy3_peaks, cy3_smoothed, cy3_peak_info = detect_peaks_comprehensive(
                    cy3_data, window_size=window_size, min_prominence=min_prominence
                )

                # Positive match: Cy5 has peaks, Cy3 doesn't
                match_found = (len(cy5_peaks) > 0 and len(cy3_peaks) == 0)

                if match_found or debug_mode:
                    _log_channel_peaks(cy5_row, cy5_peaks, cy5_peak_info)
                    _log_channel_peaks(cy3_row, cy3_peaks, cy3_peak_info)

                if match_found:
                    valid_groups += 1
                    print(f"THIS IS A POSITIVE MATCH! Cy5 has {len(cy5_peaks)} peaks, Cy3 has none.")
                elif debug_mode:
                    print(f"Not a match - {'Cy5 has no peaks' if len(cy5_peaks) == 0 else 'Cy3 has peaks too'}")

                # Positive matches are always plotted; debug mode plots every pair
                if not (match_found or debug_mode):
                    continue

                fig, axs = plt.subplots(2, 1, figsize=(12, 10))

                _draw_channel(
                    axs[0], cy5_data, cy5_smoothed, cy5_peaks,
                    line_style='g-', marker_color='green',
                    data_label=f'Cy5 Data (Row {cy5_row})',
                    title=f'Sheet: {sheet_name}, Cy5 Fluorescence (Row {cy5_row})',
                )
                _draw_channel(
                    axs[1], cy3_data, cy3_smoothed, cy3_peaks,
                    line_style='b-', marker_color='red',
                    data_label=f'Cy3 Data (Row {cy3_row})',
                    title=f'Cy3 Fluorescence (Row {cy3_row})',
                )

                # Add detailed analysis information if in debug mode
                if debug_mode:
                    fig.set_size_inches(12, 15)
                    fig.subplots_adjust(hspace=0.4)

                    info_by_index = {info['index']: info for info in cy5_peak_info}
                    peak_lines = [
                        f"Peak at {p+1}: Value={cy5_data[p]:.1f}, "
                        f"Prominence={info_by_index[p]['prominence']:.1f} "
                        f"({info_by_index[p]['rel_prominence']*100:.1f}%)"
                        for p in cy5_peaks
                    ]
                    fig.text(0.1, 0.01,
                             f"Cy5 Peaks: {len(cy5_peaks)}\n" + "\n".join(peak_lines),
                             fontsize=9, verticalalignment='bottom')

                fig.tight_layout()

                # Non-matches saved in debug mode get a DEBUG_ prefix
                if match_found:
                    plot_filename = f"{output_dir}/{sheet_name}_Rows{cy5_row}-{cy3_row}.png"
                else:
                    plot_filename = f"{output_dir}/DEBUG_{sheet_name}_Rows{cy5_row}-{cy3_row}.png"

                fig.savefig(plot_filename)
                plt.close(fig)  # Free the figure; many are created in a loop

                if match_found:
                    # Format peak information for the summary
                    peak_positions = [p+1 for p in cy5_peaks]
                    peak_heights = [info['height_above_smoothed'] for info in cy5_peak_info]
                    peak_prominences = [info['prominence'] for info in cy5_peak_info]
                    relative_heights = [h / np.mean(cy5_data) * 100 for h in peak_heights]

                    results_summary.append({
                        'Sheet': sheet_name,
                        'Rows': f"{cy5_row}-{cy3_row}",
                        'Cy5 Peaks': len(cy5_peaks),
                        'Cy5 Peak Positions': peak_positions,
                        'Cy5 Peak Heights': [f"{h:.2f}" for h in peak_heights],
                        'Cy5 Prominences': [f"{p:.2f}" for p in peak_prominences],
                        'Cy5 Relative Heights (%)': [f"{h:.2f}%" for h in relative_heights],
                        'Plot': plot_filename
                    })

            print(f"Processed {processed_groups} groups, {valid_groups} had Cy5 peaks and no Cy3 peaks in this sheet")

        # Create summary report
        if results_summary:
            summary_df = pd.DataFrame(results_summary)
            summary_df.to_csv(f"{output_dir}/peak_detection_summary.csv", index=False)
            print(f"\nSummary report saved to {output_dir}/peak_detection_summary.csv")
            print(f"Found {len(results_summary)} instances where Cy5 has peaks and Cy3 has none")

            print("\nPositive matches found:")
            for i, result in enumerate(results_summary, 1):
                print(f"{i}. Sheet: {result['Sheet']}, Rows: {result['Rows']}")
                print(f"   Cy5 has {result['Cy5 Peaks']} peaks at positions: {result['Cy5 Peak Positions']}")
                print(f"   Peak heights: {result['Cy5 Peak Heights']}")
                print(f"   Peak prominences: {result['Cy5 Prominences']}")
                print(f"   Relative to mean signal: {result['Cy5 Relative Heights (%)']}")
        else:
            print("\nNo matching patterns found in the data (Cy5 with peaks and Cy3 without)")

    # Back on the real stdout: tell the user where everything went
    print(f"Results saved to directory: {output_dir}")
    print(f"Log file saved to: {log_file}")

    return results_summary

# Example usage
if __name__ == "__main__":
    import datetime

    # Workbook to analyse, resolved against the current working directory.
    file_path = "example_with_positive_cases_edited.xlsx"

    # Results go into a folder named after the moment this run started.
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = f"fluorescence_results_{timestamp}"

    # Detection settings, gathered in one place so they are easy to tweak:
    #   debug_mode     - log and plot every analysed pair, not only matches
    #   window_size    - window length for the local-maximum search
    #   min_prominence - minimum peak prominence as a fraction of the data range
    run_options = {
        "debug_mode": True,
        "window_size": 5,
        "min_prominence": 0.08,
    }

    results = process_excel_file_comprehensive(file_path, output_dir=output_dir, **run_options)