In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import mplcursors
import os
import numpy as np
from tqdm import tqdm

def plot_combined_op_histogram(df, output_dir,
                               title='Combined OP Histogram for Apo 5.0 Closeres',
                               filename='combined_5_closeres_op_histogram.png'):
    """Save a 100-bin histogram of ``df['s2calc']`` as a PNG in ``output_dir``.

    Every bar is filled with the viridis colour of its left bin edge, scaled
    between the column's minimum and maximum, and a matching colourbar is
    drawn next to the axes.

    Args:
        df: DataFrame with a numeric ``s2calc`` column.
        output_dir: Directory that receives the PNG. It must already exist;
            this function does not create it.
        title: Axes title. Defaults to the title this function always used.
        filename: Name of the PNG inside ``output_dir``. Defaults to the
            file name this function always used.
    """
    fig, ax = plt.subplots(figsize=(20, 10))
    try:
        counts, edges, bars = ax.hist(df['s2calc'], bins=100, edgecolor='black', alpha=0.7)

        # Colour scale spanning the observed s2calc range
        cmap = plt.cm.viridis
        norm = plt.Normalize(vmin=df['s2calc'].min(), vmax=df['s2calc'].max())

        # zip() stops at the shorter sequence, so each bar is paired with its
        # left edge (there is one more edge than there are bars).
        for left_edge, bar in zip(edges, bars):
            bar.set_facecolor(cmap(norm(left_edge)))

        # A ScalarMappable with an empty array is enough to draw the colourbar
        sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
        sm.set_array([])
        fig.colorbar(sm, ax=ax, label='s2calc')

        ax.set_title(title)
        ax.set_xlabel('s2calc')
        ax.set_ylabel('Frequency')
        ax.grid(True, linestyle='--', alpha=0.3)

        fig.tight_layout()

        # Hover annotations. NOTE: the figure is closed at the end of this
        # function, so the cursor has no effect on the saved PNG.
        cursor = mplcursors.cursor(bars, hover=True)

        @cursor.connect("add")
        def _describe_bar(sel):
            # sel.index is the position of the hovered bar in the container
            i = sel.index
            sel.annotation.set_text(
                f's2calc range: {edges[i]:.2f} - {edges[i + 1]:.2f}\n'
                f'Count: {counts[i]:.0f}'
            )

        output_file = os.path.join(output_dir, filename)
        fig.savefig(output_file, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even when plotting or saving fails
        plt.close(fig)

    print(f"Saved combined op histogram to {output_file}")

def process_all_proteins(file_path, output_dir):
    """Load the CSV at ``file_path`` and write its combined s2calc histogram.

    Args:
        file_path: CSV file handed to ``pd.read_csv``.
        output_dir: Folder for the figure; created here when missing.
    """
    # exist_ok=True lets the cell be re-run against an existing folder
    os.makedirs(output_dir, exist_ok=True)

    # Load the table and pass it straight to the plotting routine
    plot_combined_op_histogram(pd.read_csv(file_path), output_dir)

    print("Combined OP histogram created successfully!")

# Input CSV to plot and the folder that will receive the figure
file_path, output_dir = (
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_5_op_subset_aligned.csv',
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_5_op_histogram_combined',
)

# Build and save the histogram for the whole input file
process_all_proteins(file_path=file_path, output_dir=output_dir)

Saved combined op histogram to /Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_5_op_histogram_combined\combined_5_closeres_op_histogram.png
Combined OP histogram created successfully!


In [4]:
# Copy every .out file in the Desktop op_data folder to a .csv file with the same base name
import os

# Directory containing the .out files (Desktop directory)
directory = "/Users/harrw10/Desktop/op_data"

# Copy the text of every .out file into a sibling .csv file
for filename in os.listdir(directory):
    if not filename.endswith(".out"):
        continue

    # Swap only the trailing extension. str.replace(".out", ".csv") would also
    # rewrite ".out" occurring earlier in the name (e.g. "a.output.out").
    csv_filename = filename[:-len(".out")] + ".csv"

    # Read the content of the .out file
    with open(os.path.join(directory, filename), 'r') as infile:
        content = infile.read()

    # Write the unchanged content to the .csv file
    with open(os.path.join(directory, csv_filename), 'w') as outfile:
        outfile.write(content)

print("Conversion complete!")

Conversion complete!


In [8]:
# Add the PDB name as a column to each OP file, then combine them all into a single file
import os
import pandas as pd

# Directory containing the .csv files
directory = '/Users/harrw10/Desktop/op_data'

# Destination of the combined table (also used in the success message below)
combined_path = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/op_subset_files/op_combined_2.csv'

# One dataframe per input file; concatenated once after the loop
dataframes = []

# sorted() makes the row order of the combined file reproducible;
# os.listdir returns entries in arbitrary order.
for filename in sorted(os.listdir(directory)):
    if filename.endswith('.csv'):
        # Base name without extension and without '_OP' labels each row's source
        base_name, ext = os.path.splitext(filename)
        cleaned_base_name = base_name.replace('_OP', '')

        # Read the csv file into a dataframe
        df = pd.read_csv(os.path.join(directory, filename))

        # Tag every row with the name derived from its source file
        df['Cleaned Base Name'] = cleaned_base_name

        dataframes.append(df)

# Fail with a clear message instead of pd.concat's "No objects to concatenate"
if not dataframes:
    raise ValueError(f"No .csv files found in {directory!r}; nothing to combine.")

# Concatenate all dataframes in the list into a single dataframe
combined_op_df = pd.concat(dataframes, ignore_index=True)

# Save the combined dataframe to a new csv file
combined_op_df.to_csv(combined_path, index=False)

# Report the path that was actually written
print(f"The combined dataset has been successfully saved to '{combined_path}'.")

The combined dataset has been successfully saved to 'combined_dataset.csv'.


In [14]:
# Filter the combined OP data down to the structures listed in the 'Apo' column and save the result as a single file
import pandas as pd
combined_op_df = pd.read_csv(
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/op_subset_files/op_combined_2.csv'
)
df = pd.read_csv(
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/apo_holo_241016.csv'
)


def apo_df():
    """Return the 'Apo' column of the global ``df`` as a one-column DataFrame.

    Rows whose 'Apo' value is missing or an empty string are dropped.
    """
    has_apo = df['Apo'].notna() & (df['Apo'] != '')
    return df.loc[has_apo, ['Apo']]


apo_data = apo_df()

# Keep only the order-parameter rows whose name appears in the Apo list
apo_op_subset_aligned = combined_op_df.loc[
    combined_op_df['Cleaned Base Name'].isin(apo_data['Apo'])
]
apo_op_subset_aligned.to_csv(
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_op_subset_aligned.csv',
    index=False,
)
print(apo_op_subset_aligned.head(100000))

          s2calc   s2ortho     s2ang resn  resi chain Cleaned Base Name
0       0.900755  0.900755  1.000000  TYR     3     A              16gs
1       0.869301  0.869301  1.000000  THR     4     A              16gs
2       0.920219  0.920219  1.000000  VAL     5     A              16gs
3       0.861086  0.861086  1.000000  VAL     6     A              16gs
4       0.878256  0.878256  1.000000  TYR     7     A              16gs
...          ...       ...       ...  ...   ...   ...               ...
137647  0.851186  0.851186  1.000000  ILE   696     B              6o0r
137648  0.702544  0.718552  0.977722  ARG   697     B              6o0r
137649  0.852310  0.852310  1.000000  PHE   698     B              6o0r
137650  0.769810  0.769810  1.000000  LEU   699     B              6o0r
137651  0.328875  0.328875  1.000000  GLN   700     B              6o0r

[54292 rows x 7 columns]


In [16]:
import matplotlib.pyplot as plt
import pandas as pd
import mplcursors
import os
import numpy as np
from tqdm import tqdm

def plot_combined_op_histogram(df, output_dir,
                               title='Combined OP Histogram for Apo structures',
                               filename='combined_op_histogram.png'):
    """Save a 100-bin histogram of ``df['s2calc']`` as a PNG in ``output_dir``.

    Every bar is filled with the viridis colour of its left bin edge, scaled
    between the column's minimum and maximum, and a matching colourbar is
    drawn next to the axes.

    Args:
        df: DataFrame with a numeric ``s2calc`` column.
        output_dir: Directory that receives the PNG. It must already exist;
            this function does not create it.
        title: Axes title. Defaults to the title this function always used.
        filename: Name of the PNG inside ``output_dir``. Defaults to the
            file name this function always used.
    """
    fig, ax = plt.subplots(figsize=(20, 10))
    try:
        counts, edges, bars = ax.hist(df['s2calc'], bins=100, edgecolor='black', alpha=0.7)

        # Colour scale spanning the observed s2calc range
        cmap = plt.cm.viridis
        norm = plt.Normalize(vmin=df['s2calc'].min(), vmax=df['s2calc'].max())

        # zip() stops at the shorter sequence, so each bar is paired with its
        # left edge (there is one more edge than there are bars).
        for left_edge, bar in zip(edges, bars):
            bar.set_facecolor(cmap(norm(left_edge)))

        # A ScalarMappable with an empty array is enough to draw the colourbar
        sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
        sm.set_array([])
        fig.colorbar(sm, ax=ax, label='s2calc')

        ax.set_title(title)
        ax.set_xlabel('s2calc')
        ax.set_ylabel('Frequency')
        ax.grid(True, linestyle='--', alpha=0.3)

        fig.tight_layout()

        # Hover annotations. NOTE: the figure is closed at the end of this
        # function, so the cursor has no effect on the saved PNG.
        cursor = mplcursors.cursor(bars, hover=True)

        @cursor.connect("add")
        def _describe_bar(sel):
            # sel.index is the position of the hovered bar in the container
            i = sel.index
            sel.annotation.set_text(
                f's2calc range: {edges[i]:.2f} - {edges[i + 1]:.2f}\n'
                f'Count: {counts[i]:.0f}'
            )

        output_file = os.path.join(output_dir, filename)
        fig.savefig(output_file, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even when plotting or saving fails
        plt.close(fig)

    print(f"Saved combined op histogram to {output_file}")

def process_all_proteins(file_path, output_dir):
    """Read ``file_path`` as CSV and save the combined s2calc histogram.

    Args:
        file_path: Location of the CSV to load with pandas.
        output_dir: Destination folder; it is created if it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)  # no error when the folder is already there

    table = pd.read_csv(file_path)
    plot_combined_op_histogram(table, output_dir)

    print("Combined OP histogram created successfully!")

# Input CSV to plot and the folder that will receive the figure
file_path, output_dir = (
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_op_subset_aligned.csv',
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_op_histogram_combined',
)

# Build and save the histogram for the whole input file
process_all_proteins(file_path=file_path, output_dir=output_dir)

Saved combined op histogram to /Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_op_histogram_combined\combined_op_histogram.png
Combined OP histogram created successfully!


In [18]:
import matplotlib.pyplot as plt
import pandas as pd
import mplcursors
import os
import numpy as np
from tqdm import tqdm

def plot_overlaid_b_factor_histograms(df1, df2, label1, label2, output_dir):
    """Overlay the ``s2calc`` count histograms of two DataFrames and save a PNG.

    The function name is kept for existing callers, but the plotted column is
    ``s2calc`` and the output file is named for order parameters, so the
    title, tooltip and log message are labelled accordingly instead of
    "B-factor".

    Args:
        df1, df2: DataFrames with a numeric ``s2calc`` column.
        label1, label2: Legend labels for ``df1`` (blue) and ``df2`` (red).
        output_dir: Directory that receives
            ``overlaid_order_parameter_histograms.png``. It must already
            exist; this function does not create it.
    """
    fig, ax = plt.subplots(figsize=(20, 10))
    try:
        # Shared edges spanning both datasets so the two sets of bars line up.
        # 100 edges give 99 bins.
        lowest = min(df1['s2calc'].min(), df2['s2calc'].min())
        highest = max(df1['s2calc'].max(), df2['s2calc'].max())
        bins = np.linspace(lowest, highest, 100)

        n1, _, patches1 = ax.hist(df1['s2calc'], bins=bins, alpha=0.5, label=label1, color='blue')
        n2, _, patches2 = ax.hist(df2['s2calc'], bins=bins, alpha=0.5, label=label2, color='red')

        ax.set_title('Overlaid Order Parameter Histograms')
        ax.set_xlabel('S2calc')
        ax.set_ylabel('Frequency')
        ax.grid(True, linestyle='--', alpha=0.3)
        ax.legend()

        fig.tight_layout()

        # Hover annotations. NOTE: the figure is closed at the end of this
        # function, so the cursor has no effect on the saved PNG.
        cursor = mplcursors.cursor(patches1 + patches2, hover=True)

        @cursor.connect("add")
        def on_add(sel):
            # Work out which dataset the hovered bar belongs to
            if sel.artist in patches1:
                index = patches1.index(sel.artist)
                dataset, count = label1, n1[index]
            else:
                index = patches2.index(sel.artist)
                dataset, count = label2, n2[index]

            sel.annotation.set_text(
                f'Dataset: {dataset}\n'
                f's2calc range: {bins[index]:.2f} - {bins[index+1]:.2f}\n'
                f'Count: {count:.0f}'
            )

        output_file = os.path.join(output_dir, 'overlaid_order_parameter_histograms.png')
        fig.savefig(output_file, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even when plotting or saving fails
        plt.close(fig)

    print(f"Saved overlaid Order Parameter histograms to {output_file}")

def process_datasets(file_path1, file_path2, label1, label2, output_dir):
    """Load two CSV files and save their overlaid s2calc histograms.

    Args:
        file_path1, file_path2: CSV files read with ``pd.read_csv``.
        label1, label2: Legend labels forwarded to the plotting function.
        output_dir: Folder for the figure; created here when missing.
    """
    # exist_ok=True lets the cell be re-run against an existing folder
    os.makedirs(output_dir, exist_ok=True)

    # Load both tables (first file, then second) before plotting
    first = pd.read_csv(file_path1)
    second = pd.read_csv(file_path2)

    plot_overlaid_b_factor_histograms(first, second, label1, label2, output_dir)

    print("Overlaid Order Parameter histograms created successfully!")

# The two input CSV files to compare
file_path1, file_path2 = (
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_op_subset_aligned.csv',
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_5_op_subset_aligned.csv',
)
# Legend labels, in the same order as the files
label1, label2 = 'Apo OP', 'Apo OP 5.0 closeres'
# Folder that will receive the figure
output_dir = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_op_histograms_overlay'

# Load both files and draw the overlay
process_datasets(file_path1, file_path2, label1=label1, label2=label2, output_dir=output_dir)

Saved overlaid B-factor histograms to /Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_op_histograms_overlay\overlaid_order_parameter_histograms.png
Overlaid Order Parameter histograms created successfully!


In [20]:
import matplotlib.pyplot as plt
import pandas as pd
import mplcursors
import os
import numpy as np
from tqdm import tqdm

def plot_normalized_overlaid_b_factor_histograms(df1, df2, label1, label2, output_dir):
    """Overlay per-bin fraction histograms of ``s2calc`` for two DataFrames.

    Each row is weighted by ``1 / len(column)``, so a bar's height is the
    fraction of that dataset's rows falling in the bin. It is not a
    probability density, which is why the y-axis and tooltip say "Fraction".
    The function name is kept for existing callers, but the plotted column is
    ``s2calc`` and the output file is named for order parameters, so the
    visible labels no longer say "B-factor".

    Args:
        df1, df2: DataFrames with a numeric ``s2calc`` column.
        label1, label2: Legend labels for ``df1`` (blue) and ``df2`` (red).
        output_dir: Directory that receives
            ``normalized_overlaid_order_parameter_histograms.png``. It must
            already exist; this function does not create it.
    """
    fig, ax = plt.subplots(figsize=(20, 10))
    try:
        # Shared edges spanning both datasets so the two sets of bars line up.
        # 100 edges give 99 bins.
        lowest = min(df1['s2calc'].min(), df2['s2calc'].min())
        highest = max(df1['s2calc'].max(), df2['s2calc'].max())
        bins = np.linspace(lowest, highest, 100)

        # Equal weights of 1/N turn counts into fractions of each dataset
        weights1 = np.ones_like(df1['s2calc']) / len(df1['s2calc'])
        weights2 = np.ones_like(df2['s2calc']) / len(df2['s2calc'])

        n1, _, patches1 = ax.hist(df1['s2calc'], bins=bins, weights=weights1,
                                  alpha=0.5, label=label1, color='blue')
        n2, _, patches2 = ax.hist(df2['s2calc'], bins=bins, weights=weights2,
                                  alpha=0.5, label=label2, color='red')

        ax.set_title('Normalized Overlaid Order Parameter Histograms')
        ax.set_xlabel('s2calc')
        ax.set_ylabel('Fraction of values per bin')
        ax.grid(True, linestyle='--', alpha=0.3)
        ax.legend()

        fig.tight_layout()

        # Hover annotations. NOTE: the figure is closed at the end of this
        # function, so the cursor has no effect on the saved PNG.
        cursor = mplcursors.cursor(patches1 + patches2, hover=True)

        @cursor.connect("add")
        def on_add(sel):
            # Work out which dataset the hovered bar belongs to
            if sel.artist in patches1:
                index = patches1.index(sel.artist)
                dataset, fraction = label1, n1[index]
            else:
                index = patches2.index(sel.artist)
                dataset, fraction = label2, n2[index]

            sel.annotation.set_text(
                f'Dataset: {dataset}\n'
                f's2calc range: {bins[index]:.2f} - {bins[index+1]:.2f}\n'
                f'Fraction: {fraction:.4f}'
            )

        output_file = os.path.join(output_dir, 'normalized_overlaid_order_parameter_histograms.png')
        fig.savefig(output_file, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even when plotting or saving fails
        plt.close(fig)

    print(f"Saved normalized overlaid Order Parameter histograms to {output_file}")

def process_datasets(file_path1, file_path2, label1, label2, output_dir):
    """Load two CSV files and save their normalized overlaid s2calc histograms.

    Args:
        file_path1, file_path2: CSV files read with ``pd.read_csv``.
        label1, label2: Legend labels forwarded to the plotting function.
        output_dir: Folder for the figure; created here when missing.
    """
    os.makedirs(output_dir, exist_ok=True)  # no error when the folder is already there

    # Load both tables (first file, then second) before plotting
    first = pd.read_csv(file_path1)
    second = pd.read_csv(file_path2)

    plot_normalized_overlaid_b_factor_histograms(first, second, label1, label2, output_dir)

    print("Normalized overlaid Order Parameter histograms created successfully!")

# The two input CSV files to compare
file_path1, file_path2 = (
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_op_subset_aligned.csv',
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_5_op_subset_aligned.csv',
)
# Legend labels, in the same order as the files
label1, label2 = 'Apo OP', 'Apo OP 5.0 closeres'
# Folder that will receive the figure
output_dir = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_normalized_op_histograms_overlay'

# Load both files and draw the normalized overlay
process_datasets(file_path1, file_path2, label1=label1, label2=label2, output_dir=output_dir)

Saved normalized overlaid B-factor histograms to /Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_normalized_op_histograms_overlay\normalized_overlaid_order_parameter_histograms.png
Normalized overlaid Order Parameter histograms created successfully!


In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import mplcursors
import os
import numpy as np
from tqdm import tqdm

def plot_zscore_normalized_overlaid_b_factor_histograms(df1, df2, label1, label2, output_dir):
    """Overlay density histograms of the z-scored ``s2calc`` of two DataFrames.

    Each dataset is standardised with its own mean and standard deviation
    before plotting, and both histograms share one set of bin edges.

    Args:
        df1, df2: DataFrames with a numeric ``s2calc`` column.
        label1, label2: Legend labels for ``df1`` (blue) and ``df2`` (red).
        output_dir: Directory that receives
            ``zscore_normalized_order_parameter_histograms.png``. It must
            already exist; this function does not create it.

    Raises:
        ValueError: If the standard deviation of either ``s2calc`` column is
            zero or NaN, so no z-score can be computed.
    """
    def _zscore(values, label):
        spread = values.std()
        # "not > 0" rejects both zero and NaN, which would otherwise turn
        # every z-score into NaN and fail later inside the histogram call.
        if not spread > 0:
            raise ValueError(
                f"Cannot z-score '{label}': standard deviation of s2calc is {spread}"
            )
        return (values - values.mean()) / spread

    df1_zscore = _zscore(df1['s2calc'], label1)
    df2_zscore = _zscore(df2['s2calc'], label2)

    fig, ax = plt.subplots(figsize=(20, 10))
    try:
        # Shared edges spanning both datasets; 100 edges give 99 bins
        bins = np.linspace(min(df1_zscore.min(), df2_zscore.min()),
                           max(df1_zscore.max(), df2_zscore.max()),
                           100)

        n1, _, patches1 = ax.hist(df1_zscore, bins=bins,
                                  alpha=0.5, label=label1, color='blue', density=True)
        n2, _, patches2 = ax.hist(df2_zscore, bins=bins,
                                  alpha=0.5, label=label2, color='red', density=True)

        ax.set_title('Z-Score Normalized Order Parameter Histograms')
        ax.set_xlabel('s2calc (Z-score)')
        ax.set_ylabel('Density')
        ax.grid(True, linestyle='--', alpha=0.3)
        ax.legend()

        fig.tight_layout()

        # Hover annotations. NOTE: the figure is closed at the end of this
        # function, so the cursor has no effect on the saved PNG.
        cursor = mplcursors.cursor(patches1 + patches2, hover=True)

        @cursor.connect("add")
        def on_add(sel):
            # Work out which dataset the hovered bar belongs to
            if sel.artist in patches1:
                index = patches1.index(sel.artist)
                dataset, density = label1, n1[index]
            else:
                index = patches2.index(sel.artist)
                dataset, density = label2, n2[index]

            sel.annotation.set_text(
                f'Dataset: {dataset}\n'
                f'Z-score range: {bins[index]:.2f} - {bins[index+1]:.2f}\n'
                f'Density: {density:.4f}'
            )

        output_file = os.path.join(output_dir, 'zscore_normalized_order_parameter_histograms.png')
        fig.savefig(output_file, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even when plotting or saving fails
        plt.close(fig)

    print(f"Saved Z-score normalized Order Parameter histograms to {output_file}")

def process_datasets(file_path1, file_path2, label1, label2, output_dir):
    """Load two CSV files and save their z-score normalized s2calc histograms.

    Args:
        file_path1, file_path2: CSV files read with ``pd.read_csv``.
        label1, label2: Legend labels forwarded to the plotting function.
        output_dir: Folder for the figure; created here when missing.
    """
    # Make sure the destination exists before any work is done
    os.makedirs(output_dir, exist_ok=True)

    first = pd.read_csv(file_path1)
    second = pd.read_csv(file_path2)

    plot_zscore_normalized_overlaid_b_factor_histograms(
        first, second, label1, label2, output_dir
    )
    print("Z-score normalized Order Parameter histograms created successfully!")

# The two input CSV files to compare
file_path1, file_path2 = (
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_op_subset_aligned.csv',
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_5_op_subset_aligned.csv',
)
# Legend labels, in the same order as the files
label1, label2 = 'Apo Order Parameter', 'Apo Order Parameter 5.0 closeres'
# Folder that will receive the figure
output_dir = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_zscore_op_histograms_overlay'

# Load both files and draw the z-score overlay
process_datasets(file_path1, file_path2, label1=label1, label2=label2, output_dir=output_dir)

Saved Z-score normalized Order Parameter histograms to /Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_zscore_op_histograms_overlay\zscore_normalized_order_parameter_histograms.png
Z-score normalized Order Parameter histograms created successfully!


In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import mplcursors
import os
import numpy as np
from tqdm import tqdm

def plot_zscore_normalized_overlaid_op_histograms(df1, df2, label1, label2, output_dir):
    """Overlay z-scored ``s2calc`` density histograms and report sign shares.

    Each dataset is standardised with its own mean and standard deviation.
    The percentage of z-scores above and below zero is printed and also
    written into the plot title.

    Args:
        df1, df2: DataFrames with a numeric ``s2calc`` column.
        label1, label2: Legend labels for ``df1`` (blue) and ``df2`` (red).
        output_dir: Directory that receives
            ``zscore_normalized_order_parameter_histograms.png``. It must
            already exist; this function does not create it.

    Raises:
        ValueError: If the standard deviation of either ``s2calc`` column is
            zero or NaN, so no z-score can be computed.
    """
    def _zscore(values, label):
        spread = values.std()
        # "not > 0" rejects both zero and NaN, which would otherwise turn
        # every z-score into NaN and fail later inside the histogram call.
        if not spread > 0:
            raise ValueError(
                f"Cannot z-score '{label}': standard deviation of s2calc is {spread}"
            )
        return (values - values.mean()) / spread

    df1_zscore = _zscore(df1['s2calc'], label1)
    df2_zscore = _zscore(df2['s2calc'], label2)

    # Share of rows on each side of zero. Rows that are exactly zero or NaN
    # count toward neither side, so the two numbers need not sum to 100.
    df1_greater_zero = (df1_zscore > 0).mean() * 100
    df1_less_zero = (df1_zscore < 0).mean() * 100
    df2_greater_zero = (df2_zscore > 0).mean() * 100
    df2_less_zero = (df2_zscore < 0).mean() * 100

    print("\nPercentage Statistics:")
    print(f"{label1}:")
    print(f"Greater than zero: {df1_greater_zero:.2f}%")
    print(f"Less than zero: {df1_less_zero:.2f}%")
    print(f"\n{label2}:")
    print(f"Greater than zero: {df2_greater_zero:.2f}%")
    print(f"Less than zero: {df2_less_zero:.2f}%")

    fig, ax = plt.subplots(figsize=(20, 10))
    try:
        # Shared edges spanning both datasets; 100 edges give 99 bins
        bins = np.linspace(min(df1_zscore.min(), df2_zscore.min()),
                           max(df1_zscore.max(), df2_zscore.max()),
                           100)

        n1, _, patches1 = ax.hist(df1_zscore, bins=bins,
                                  alpha=0.5, label=f"{label1}", color='blue', density=True)
        n2, _, patches2 = ax.hist(df2_zscore, bins=bins,
                                  alpha=0.5, label=f"{label2}", color='red', density=True)

        # Title carries the percentage summary so the PNG stands alone
        ax.set_title('Z-Score Normalized Order Parameter Histograms\n' +
                     f'{label1}: {df1_greater_zero:.1f}% > 0, {df1_less_zero:.1f}% < 0\n' +
                     f'{label2}: {df2_greater_zero:.1f}% > 0, {df2_less_zero:.1f}% < 0')
        ax.set_xlabel('s2calc (Z-score)')
        ax.set_ylabel('Density')
        ax.grid(True, linestyle='--', alpha=0.3)
        ax.legend()

        fig.tight_layout()

        # Hover annotations. NOTE: the figure is closed at the end of this
        # function, so the cursor has no effect on the saved PNG.
        cursor = mplcursors.cursor(patches1 + patches2, hover=True)

        @cursor.connect("add")
        def on_add(sel):
            # Work out which dataset the hovered bar belongs to
            if sel.artist in patches1:
                index = patches1.index(sel.artist)
                dataset, density = label1, n1[index]
            else:
                index = patches2.index(sel.artist)
                dataset, density = label2, n2[index]

            sel.annotation.set_text(
                f'Dataset: {dataset}\n'
                f'Z-score range: {bins[index]:.2f} - {bins[index+1]:.2f}\n'
                f'Density: {density:.4f}'
            )

        output_file = os.path.join(output_dir, 'zscore_normalized_order_parameter_histograms.png')
        fig.savefig(output_file, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even when plotting or saving fails
        plt.close(fig)

    print(f"Saved Z-score normalized Order Parameter histograms to {output_file}")
def process_datasets(file_path1, file_path2, label1, label2, output_dir):
    """Load two CSV files and save their z-score histograms with percentage summary.

    Args:
        file_path1, file_path2: CSV files read with ``pd.read_csv``.
        label1, label2: Legend labels forwarded to the plotting function.
        output_dir: Folder for the figure; created here when missing.
    """
    # Make sure the destination exists before any work is done
    os.makedirs(output_dir, exist_ok=True)

    first = pd.read_csv(file_path1)
    second = pd.read_csv(file_path2)

    plot_zscore_normalized_overlaid_op_histograms(
        first, second, label1, label2, output_dir
    )
    print("Z-score normalized Order Parameter histograms created successfully!")

# The two input CSV files to compare
file_path1, file_path2 = (
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_op_subset_aligned.csv',
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_5_op_subset_aligned.csv',
)
# Legend labels, in the same order as the files
label1, label2 = 'Apo Order Parameter', 'Apo Order Parameter 5.0 closeres'
# Folder that will receive the figure
output_dir = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_zscore_op_histograms_overlay_with_auc'

# Load both files, print the percentage summary and draw the overlay
process_datasets(file_path1, file_path2, label1=label1, label2=label2, output_dir=output_dir)



Percentage Statistics:
Apo Order Parameter:
Greater than zero: 63.24%
Less than zero: 36.76%

Apo Order Parameter 5.0 closeres:
Greater than zero: 67.38%
Less than zero: 32.62%
Saved Z-score normalized Order Parameter histograms to /Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_zscore_op_histograms_overlay_with_auc\zscore_normalized_order_parameter_histograms.png
Z-score normalized Order Parameter histograms created successfully!


In [15]:
import matplotlib.pyplot as plt
import pandas as pd
import mplcursors
import os
import numpy as np
from tqdm import tqdm

def plot_overlaid_b_factor_histograms(df1, df2, label1, label2, output_dir):
    """Overlay the ``s2calc`` density histograms of two DataFrames and save a PNG.

    Both histograms are drawn with ``density=True``, so the tooltip reports a
    density rather than a count. The function name is kept for existing
    callers, but the plotted column is ``s2calc`` and the output file is
    named for order parameters, so the visible labels no longer say
    "B-factor".

    Args:
        df1, df2: DataFrames with a numeric ``s2calc`` column.
        label1, label2: Legend labels for ``df1`` (blue) and ``df2`` (red).
        output_dir: Directory that receives
            ``overlaid_order_parameter_histograms.png``. It must already
            exist; this function does not create it.
    """
    fig, ax = plt.subplots(figsize=(20, 10))
    try:
        # Shared edges spanning both datasets so the two sets of bars line up.
        # 100 edges give 99 bins.
        lowest = min(df1['s2calc'].min(), df2['s2calc'].min())
        highest = max(df1['s2calc'].max(), df2['s2calc'].max())
        bins = np.linspace(lowest, highest, 100)

        n1, _, patches1 = ax.hist(df1['s2calc'], bins=bins, alpha=0.5, label=label1, color='blue', density=True)
        n2, _, patches2 = ax.hist(df2['s2calc'], bins=bins, alpha=0.5, label=label2, color='red', density=True)

        ax.set_title('Overlaid Order Parameter Histograms')
        ax.set_xlabel('S2calc')
        ax.set_ylabel('Density')
        ax.grid(True, linestyle='--', alpha=0.3)
        ax.legend()

        fig.tight_layout()

        # Hover annotations. NOTE: the figure is closed at the end of this
        # function, so the cursor has no effect on the saved PNG.
        cursor = mplcursors.cursor(patches1 + patches2, hover=True)

        @cursor.connect("add")
        def on_add(sel):
            # Work out which dataset the hovered bar belongs to
            if sel.artist in patches1:
                index = patches1.index(sel.artist)
                dataset, density = label1, n1[index]
            else:
                index = patches2.index(sel.artist)
                dataset, density = label2, n2[index]

            sel.annotation.set_text(
                f'Dataset: {dataset}\n'
                f's2calc range: {bins[index]:.2f} - {bins[index+1]:.2f}\n'
                f'Density: {density:.4f}'
            )

        output_file = os.path.join(output_dir, 'overlaid_order_parameter_histograms.png')
        fig.savefig(output_file, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even when plotting or saving fails
        plt.close(fig)

    print(f"Saved overlaid Order Parameter histograms to {output_file}")

def process_datasets(file_path1, file_path2, label1, label2, output_dir):
    """Load two CSV files and save their overlaid s2calc histograms.

    Args:
        file_path1, file_path2: CSV files read with ``pd.read_csv``.
        label1, label2: Legend labels forwarded to the plotting function.
        output_dir: Folder for the figure; created here when missing.
    """
    os.makedirs(output_dir, exist_ok=True)  # no error when the folder is already there

    # Load both tables (first file, then second) before plotting
    first = pd.read_csv(file_path1)
    second = pd.read_csv(file_path2)

    plot_overlaid_b_factor_histograms(first, second, label1, label2, output_dir)

    print("Overlaid Order Parameter histograms created successfully!")

# The two input CSV files to compare
file_path1, file_path2 = (
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_op_subset_aligned.csv',
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_5_op_subset_aligned.csv',
)
# Legend labels, in the same order as the files
label1, label2 = 'Apo OP', 'Apo OP 5.0 closeres'
# Folder that will receive the figure
output_dir = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_op_histograms_overlay_hi'

# Load both files and draw the overlay
process_datasets(file_path1, file_path2, label1=label1, label2=label2, output_dir=output_dir)

Saved overlaid B-factor histograms to /Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_op_histograms_overlay_hi\overlaid_order_parameter_histograms.png
Overlaid Order Parameter histograms created successfully!
