# Step 3: Cluster Distribution Statistical Analysis
Calculate statistical information on the cluster distributions isolated in the 'Step 2' Cluster Network Isolation notebook.

---
Here, we can compute information such as:

- Coordination Number Distributions
- Bond Length, Angle Distributions
- Volume Distributions
- Cluster Atom Count Distributions
- Volume Fraction (Scattering) Contributions

## Custom Imports
Use a relative import to load the custom classes that support the cluster network tool.

In [None]:
# Import and run the setup script
import sys, os
import numpy as np

# Make the parent of the current working directory importable so that the
# setup_env helper module can be located.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

# Load the environment helper now that the search path is configured
from setup_env import setup_environment

# Unpack the six classes returned by setup_environment() into
# module-level names used by the rest of the notebook.
(
    PDBEditor,
    TrajectoryProcessor,
    PDBFileHandler,
    Atom,
    ClusterNetwork,
    ClusterBatchAnalyzer,
) = setup_environment()

## Cluster Single Frame Analysis

### Bond Length and Angle Distributions

In [None]:
# Core and shell residue names, shared by the PDB handler and the network
core_residue_names, shell_residue_names = ['PBI'], ['DMS']

# Load the single example frame through the PDB handler
pdb_file_path = '/Users/keithwhite/repos/MDScatter/data/example/frame_0001.pdb'
pdb_handler = PDBFileHandler(pdb_file_path, core_residue_names, shell_residue_names)

# Element roles and the segment cutoff handed to ClusterNetwork
node_elements = ['Pb']
linker_elements = ['Pb', 'I']
terminator_elements = ['I']
segment_cutoff = 3.7

# Build the cluster network from the handler's core and shell atoms
cluster_network = ClusterNetwork(
    pdb_handler.core_atoms, pdb_handler.shell_atoms,
    node_elements, linker_elements, terminator_elements,
    segment_cutoff,
    core_residue_names, shell_residue_names,
)

# Run the network analysis and keep the result for later inspection
networks = cluster_network.analyze_networks()

# Each entry lists element symbols followed by a number
# (presumably a distance cutoff -- confirm against ClusterNetwork).
# An ('I', 'I', 7.6) pair was used in earlier experiments and is left out here.
bond_length_pairs = [('Pb', 'I', 3.8)]
bond_angle_triplets = [('I', 'Pb', 'I', 3.8)]

# Compute the requested distributions and plot them
cluster_network.calculate_and_plot_distributions(bond_length_pairs, bond_angle_triplets)

### Coordination Number Distributions

In [None]:
# Target elements and the neighbor elements considered around them
target_elements = ['Pb']
neighbor_elements = ['I', 'O']

# Distance thresholds keyed by (target, neighbor) element-symbol pairs
distance_thresholds = {
    ('Pb', 'I'): 3.8,
    ('Pb', 'O'): 3,
}

# The call returns two results; both are forwarded to the printer below
coordination_stats, total_stats = cluster_network.calculate_coordination_numbers(
    target_elements,
    neighbor_elements,
    distance_thresholds,
)
cluster_network.print_coordination_numbers(coordination_stats, total_stats)

### Coordination Number Heatmap

In [None]:
# Heatmap inputs: the central element and the element pair for each axis
central_element = 'Pb'
x_pair, y_pair = ('Pb', 'I'), ('Pb', 'O')

# Coordination-number ranges for the x (Pb-I) and y (Pb-O) axes
x_range = (1, 6)
y_range = (1, 6)

# Distance cutoff supplied for each element pair
distance_cutoffs = {
    ('Pb', 'I'): 3.6,
    ('Pb', 'O'): 3,
}

# Compute the coordination-number heatmap and draw it
cluster_network.calculate_and_plot_heatmap(
    central_element, x_pair, y_pair, x_range, y_range, distance_cutoffs,
)


## Cluster Batch Frame Analysis

### Setup
Check coordination numbers and element data in the Mendeleev data repository to help fill in the input fields for the cluster analyzer.

In [None]:
from mendeleev import element

# Symbol of the element to inspect ('I' is iodine); change it to look up
# another element such as 'O' or 'Pb'.
element_symbol = 'I'

# Retrieve the element record from the mendeleev database
selected_element = element(element_symbol)

# Fetch the tabulated ionic radii for the selected element
ionic_radii = selected_element.ionic_radii

# Print the charge, coordination and ionic radius of every tabulated entry
for ir in ionic_radii:
    print(f"Oxidation State: {ir.charge}, Coordination: {ir.coordination}, Ionic Radius: {ir.ionic_radius} pm")


### Calculate Statistics on Cluster Population

In [None]:
%matplotlib widget

# Define parameters for the batch analysis.
# NOTE(review): pdb_directory is an absolute, machine-specific path; point it
# at your own directory of cluster PDB files before running.
pdb_directory = '/Users/keithwhite/repos/run_fullrmc/md_sim/travis_md_crnb/PbI2_DMSO_0.8M_cr_nb/clusters_pdb'
target_elements = ['Pb']
neighbor_elements = ['O', 'I']
distance_thresholds = {
    ('Pb', 'O'): 3.0,  # Example threshold distances in angstroms
    ('Pb', 'I'): 3.6
}

# Define the charge dictionary, ensuring all charges are integers.
# Each value is a (charge, coordination number) tuple.
# NOTE(review): the S, C and H entries carry non-zero charges although they
# were previously described as neutral -- confirm which is intended.
partial_charges = {
    'Pb': (2, 6),    # Lead: charge 2+, coordination number 6
    'I': (-1, 6),    # Iodine: charge 1-, coordination number 6
    'S': (-2, 6),    # Sulfur in DMSO: charge 2-, coordination number 6
    'O': (-2, 2),    # Oxygen in DMSO: charge 2-, coordination number 2
    'C': (4, 4),     # Carbon in DMSO: charge 4+, coordination number 4
    'H': (1, 1)      # Hydrogen in DMSO: charge 1+, coordination number 1
}

# Instantiate the ClusterBatchAnalyzer class with the necessary parameters
analyzer = ClusterBatchAnalyzer(
    pdb_directory=pdb_directory,
    target_elements=target_elements,
    neighbor_elements=neighbor_elements,
    distance_thresholds=distance_thresholds,
    charges=partial_charges,
    core_residue_names=['PBI'], 
    shell_residue_names=['DMS'],
    volume_method='radius_of_gyration'  # Choose the radius of gyration method
)

# Step 1: Analyze clusters to compute coordination numbers and volumes
# Specify shape_type='sphere' to use spherical approximation with radius of gyration
coordination_stats_per_size = analyzer.analyze_clusters(shape_type='sphere')
# Alternative shape (disabled):
# coordination_stats_per_size = analyzer.analyze_clusters(shape_type='ellipsoid')

# Step 2: Define a range of q-values in inverse angstroms
# (1000 evenly spaced points from 0.01 to 1.4, endpoints included)
q_values = np.linspace(0.01, 1.4, 1000)

# Step 3: Plot the total I(q) vs. q on a log-log scale, then save it
# for the same q-values
analyzer.plot_total_iq(q_values)
analyzer.save_total_iq(q_values)

# Additional Steps (Optional):
# If you want to visualize the average volume vs. cluster size using the radius of gyration,
# you can call the corresponding plot method:
analyzer.plot_average_volume_vs_cluster_size_rg()
