In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
from scipy import stats
import shutil
import mplcursors

#How to import all B factor values then concantenate all dataframes into a single dataframe from the analysis scripts
#Use glob to find all *_B_factors.csv files in the specified directory
b_factor_files = glob.glob(os.path.join('/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/full_protein-20241018T143402Z-001/full_protein', '*_B_factors.csv'))

#Initialize a list to store the DataFrames
b_factor_dfs = []

#Loop through each file and read it into a DataFrame
for file in b_factor_files:
    df = pd.read_csv(file)
    # Optionally, add a new column to identify the source file
    b_factor_dfs.append(df)

#Concatenate all DataFrames into a single DataFrame
combined_b_factor_df = pd.concat(b_factor_dfs, ignore_index=True)
combined_b_factor_df['chain'] = combined_b_factor_df['chain'].str.replace(r"[\[\]']+", '', regex=True).str.strip()
combined_b_factor_df['resn'] = combined_b_factor_df['resn'].str.replace(r"[\[\]']+", '', regex=True).str.strip()

b_factor_data = combined_b_factor_df
b_factor_data.to_csv('/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/b_factor_data.csv')
print("Concantenate all bfactor data into csv file")

#How to combine aspects of the 'Apo' data into a singular file, this example is for combining B factor and Apo data
df = pd.read_csv('/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/apo_holo_241016.csv')

#Define criteria for apo_res structure
def apo_df():
    filtered_2_df = df[df['Apo'].notna() & (df['Apo'] != '')]
#Select only the desired columns
    apo_columns = ['Apo']
    filtered_2_df = filtered_2_df[apo_columns]
    return filtered_2_df
apo_data = apo_df()

apo_b_factor_aligned = combined_b_factor_df[combined_b_factor_df['PDB'] .isin (apo_data['Apo'])]
apo_b_factor_aligned.to_csv('/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_b_factor_aligned.csv',index=False)
print("Created a subset of all bfactor data to only include from Apo")

#How to separate the _5.0_bfactor_subset files from the entire data set and transfer them into an individual folder
#Define the source and destination directories
source_directory = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/full_protein-20241018T143402Z-001/full_protein/'
destination_directory = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt//Stephanie Wankowicz research/5_bfactor_subset_files/'

#Create the destination directory if it doesn't exist
os.makedirs(destination_directory, exist_ok=True)

#List all files in the source directory
file_list = os.listdir(source_directory)

#Loop through each file and transfer files ending with qFit_sasa.csv to the destination directory
for file in file_list:
    if file.endswith('_5.0_bfactor_subset.csv'):
        source_file_path = os.path.join(source_directory, file)
        destination_file_path = os.path.join(destination_directory, file)
        shutil.move(source_file_path, destination_file_path)

print("Files ending with '5.0_bfactor_subset.csv' have been successfully transferred to the separate folder.")

#How to add the PDB variable to each 5_bfactor_subset file then concantenate all dataframes into a single dataframe from the analysis scripts
#Directory containing the .csv files
directory = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt//Stephanie Wankowicz research/5_bfactor_subset_files'

#Initialize an empty list to hold the dataframes
dataframes = []

#List all files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        #Extract the base name without extension and remove 
        base_name, ext = os.path.splitext(filename)
        cleaned_base_name = base_name.replace('_5.0_bfactor_subset', '')
     #Read the csv file into a dataframe
        _5_bfactor_subset_df = pd.read_csv(os.path.join(directory, filename))
        
        #Add a new column with the cleaned base name
        _5_bfactor_subset_df['Cleaned Base Name'] = cleaned_base_name
        
        #Append the dataframe to the list
        dataframes.append(_5_bfactor_subset_df)
#Concatenate all dataframes in the list into a single dataframe
combined_5_bfactor_subset_df = pd.concat(dataframes, ignore_index=True)  
combined_5_bfactor_subset_df.to_csv('/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt//Stephanie Wankowicz research/5_bfactor_subset_files/combined_5_bfactor_subset_df.csv', index=False)
print("combined all bfactor with 5.0 closeres data into single csv")

#How to combine aspects of the 'Apo' data and 5_bfactor_subset into a singular file
df = pd.read_csv('/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/apo_holo_241016.csv')

#Define criteria for apo_res structure
def apo_df():
    filtered_2_df = df[df['Apo'].notna() & (df['Apo'] != '')]
#Select only the desired columns
    apo_columns = ['Apo']
    filtered_2_df = filtered_2_df[apo_columns]
    return filtered_2_df
apo_data = apo_df()

apo_5_bfactor_subset_aligned = combined_5_bfactor_subset_df[combined_5_bfactor_subset_df['Cleaned Base Name'] .isin (apo_data['Apo'])]
apo_5_bfactor_subset_aligned.to_csv('/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_5_bfactor_subset_aligned.csv',index=False)
print("Created a bfactor 5.0 clsoeres subset to only include Apo structures") 

#Create the histograms of the OP data and normalize the data using z-score
def plot_zscore_normalized_overlaid_b_factor_histograms(df1, df2, label1, label2, output_dir):
    fig, ax = plt.subplots(figsize=(20, 10))
    
    #Calculate normalization parameters from df1 only
    mean_df1 = df1['b_factor'].mean()
    std_df1 = df1['b_factor'].std()
    
    #Z-score normalize both datasets using df1's parameters
    df1_zscore = (df1['b_factor'] - mean_df1) / std_df1
    df2_zscore = (df2['b_factor'] - mean_df1) / std_df1
    
    #Create the histograms with z-scored data
    bins = np.linspace(min(df1_zscore.min(), df2_zscore.min()),
                      max(df1_zscore.max(), df2_zscore.max()),
                      100)
    
    n1, bins1, patches1 = ax.hist(df1_zscore, bins=bins, 
                                 alpha=0.5, label=label1, color='blue', density=True)
    n2, bins2, patches2 = ax.hist(df2_zscore, bins=bins, 
                                 alpha=0.5, label=label2, color='red', density=True)
    
    ax.set_title('Z-Score Normalized B-factor Histograms')
    ax.set_xlabel('b_factor (Z-score)')
    ax.set_ylabel('Density')
    ax.grid(True, linestyle='--', alpha=0.3)
    ax.legend()
    
    plt.tight_layout()
    
    #Add hover annotations
    cursor = mplcursors.cursor(patches1 + patches2, hover=True)
    
    @cursor.connect("add")
    def on_add(sel):
        if sel.artist in patches1:
            index = patches1.index(sel.artist)
            dataset = label1
            density = n1[index]
        else:
            index = patches2.index(sel.artist)
            dataset = label2
            density = n2[index]
        
        sel.annotation.set_text(
            f'Dataset: {dataset}\n'
            f'Z-score range: {bins[index]:.2f} - {bins[index+1]:.2f}\n'
            f'Density: {density:.4f}'
        )
    
    output_file = os.path.join(output_dir, 'zscore_normalized_b_factor_histograms.png')
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close(fig)
    
    print(f"Saved Z-score normalized B-factor histograms to {output_file}")
#Define the file path per dataframe and output directory for histogram
def process_datasets(file_path1, file_path2, label1, label2, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    df1 = pd.read_csv(file_path1)
    df2 = pd.read_csv(file_path2)
    plot_zscore_normalized_overlaid_b_factor_histograms(df1, df2, label1, label2, output_dir)
    print("Z-score normalized B-Factor histograms created successfully!")

# File paths and settings
file_path1 = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_b_factor_aligned.csv'
file_path2 = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_5_bfactor_subset_aligned.csv'
label1 = 'Apo b-factors'
label2 = 'Apo b-factors 5.0 closeres'
output_dir = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_bfactor_histograms_overlay_zscore_normalized'

# Process the datasets
process_datasets(file_path1, file_path2, label1, label2, output_dir)
print("B-factor data analysis complete")

Concantenate all bfactor data into csv file
Created a subset of all bfactor data to only include from Apo
Files ending with '5.0_bfactor_subset.csv' have been successfully transferred to the separate folder.


  combined_5_bfactor_subset_df = pd.concat(dataframes, ignore_index=True)


combined all bfactor with 5.0 closeres data into single csv
Created a bfactor 5.0 clsoeres subset to only include Apo structures
Saved Z-score normalized B-factor histograms to /Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_bfactor_histograms_overlay_zscore_normalized\zscore_normalized_b_factor_histograms.png
Z-score normalized B-Factor histograms created successfully!
B-factor data analysis complete
