In [1]:
## Working with SOAX combined data

In [None]:
# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Location of the CSV that is read into `df` further down.
# NOTE(review): this is an absolute path on one specific machine; the notebook
# will not run elsewhere until it is pointed at a local copy of the file.
file_path = "/Users/hsuyab/Documents/Spring 2024/RA Staiger/SOAX project/Ayush/combined_filtered_data.csv"


In [None]:

# %%
df = pd.read_csv(file_path)

# %%
# Preview the first rows of the loaded table.
df.head()

# %%
# Pull the two parameter columns out as arrays.
ridges, stretches = df['ridge'].values, df['stretch'].values

# Pair them up row by row: one (ridge, stretch) tuple per row of df.
rs_params = [(ridge, stretch) for ridge, stretch in zip(ridges, stretches)]


# %%
# Tally how many rows share each (ridge, stretch) pair.
from collections import Counter
rs_params_count = Counter()
rs_params_count.update(rs_params)
print(rs_params_count)

# %%
# Distinct (ridge, stretch) pairs, in order of first appearance, kept in 'params'.
params = [*rs_params_count]
print(params)

# %%
df.head()

# %% [markdown]
# ## Data Analysis

# %%
#read the comparison dataset
df2 = pd.read_csv("/Users/hsuyab/Documents/Spring 2024/RA Staiger/SOAX project/Atharva/SOAX analysis/ComparisonDataset_final.csv")

# Columns kept for the analysis: identifiers, the two parameters, and the
# original/obtained value of each of the three metrics.
filterCols = [
    'filename', 'ridge', 'stretch',
    'Original Count', 'Original Median Length', 'Original Median Intensity',
    'Obtained Count', 'Obtained Median Length', 'Obtained Median Intensity',
]

# Independent copy restricted to those columns.
data = df2.loc[:, filterCols].copy()

# One (ridge, stretch) tuple per row of the filtered table.
ridges, stretches = data['ridge'].values, data['stretch'].values
rs_params = [(ridge, stretch) for ridge, stretch in zip(ridges, stretches)]

# Occurrences of each pair, then the distinct pairs in order of first appearance.
rs_params_count = Counter(rs_params)
params = [*rs_params_count]

# %%
# (rows, columns) of the filtered comparison table.
data.shape

# %%
# Number of rows recorded for each (ridge, stretch) combination.
data.groupby(['ridge', 'stretch']).size()

# %%
# Number of rows recorded for each filename.
data.groupby(['filename']).size()

# %%
from sklearn.metrics import mean_squared_error
import numpy as np

# Function to calculate RMSE
def calculate_rmse(original, obtained):
    """Return the root-mean-squared error between two sets of values.

    Parameters
    ----------
    original : array-like
        Reference values.
    obtained : array-like
        Values compared against ``original``.

    Returns
    -------
    The square root of ``mean_squared_error(original, obtained)``.
    """
    mse = mean_squared_error(original, obtained)
    rmse = np.sqrt(mse)
    return rmse

# Calculate RMSE for Count, Length, and Intensity
# Each entry: (column to write, original-value column, obtained-value column).
# calculate_rmse is given whole columns, so for these 1-D inputs it produces a
# single number per metric; assigning it to a column repeats that same
# dataset-wide value on every row.
for rmse_col, original_col, obtained_col in (
    ('RMSE Count', 'Original Count', 'Obtained Count'),
    ('RMSE Length', 'Original Median Length', 'Obtained Median Length'),
    ('RMSE Intensity', 'Original Median Intensity', 'Obtained Median Intensity'),
):
    data[rmse_col] = calculate_rmse(data[original_col], data[obtained_col])

# %%
import matplotlib.pyplot as plt
import seaborn as sns

# For heatmap, we need to pivot the table to get ridge and stretch as axes and RMSE values as data
# Preparing datasets for heatmaps
def prepare_heatmap_data(data, value_column, fill_value=0):
    """Pivot ``value_column`` into a ridge-by-stretch grid of group means.

    Rows of ``data`` are grouped by ``ridge`` and ``stretch``; the mean of
    ``value_column`` in each group becomes one cell of the returned grid
    (index = ridge values, columns = stretch values).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'ridge'``, ``'stretch'`` and ``value_column``.
    value_column : str
        Name of the column to average.
    fill_value : scalar or None, default 0
        Value written into cells that have no data (a ridge/stretch
        combination that never occurs, or whose mean is NaN). The default of
        0 keeps the historical behaviour, but on an error heatmap a filled 0
        is indistinguishable from a genuine zero error. Pass ``None`` to
        leave such cells as NaN so they can be told apart.

    Returns
    -------
    pandas.DataFrame
        The pivoted grid of means.
    """
    grid = data.groupby(['ridge', 'stretch'])[value_column].mean().unstack()
    if fill_value is None:
        return grid
    return grid.fillna(fill_value)

# # Prepare heatmap data for %RMSE Error of Count, Length, and Intensity
# heatmap_count = prepare_heatmap_data(data, '%RMSE Count')
# heatmap_length = prepare_heatmap_data(data, '%RMSE Length')
# heatmap_intensity = prepare_heatmap_data(data, '%RMSE Intensity')

# %%
# Correcting approach to calculate RMSE errors correctly for each parameter set

# Function to calculate RMSE and then percentage RMSE for Count, Length, Intensity for each unique set of parameters
def calculate_percent_rmse(data):
    """Compute percentage RMSE of Count, Length and Intensity per parameter set.

    For every row the squared difference between the obtained and the
    original value is taken for each metric. Rows are then grouped by
    ``filename``, ``ridge`` and ``stretch``; within each group the squared
    errors and the original values are averaged, and the percentage error is
    ``sqrt(mean squared error) / mean original value * 100``.

    The computation runs on a copy, so the caller's frame is not modified
    (previously the squared-error columns were written into ``data`` itself
    under the ``'RMSE ...'`` names, overwriting any existing columns).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``'filename'``, ``'ridge'``, ``'stretch'`` and, for each
        metric, the original and obtained columns (``'Original Count'``,
        ``'Obtained Count'``, ``'Original Median Length'``,
        ``'Obtained Median Length'``, ``'Original Median Intensity'``,
        ``'Obtained Median Intensity'``).

    Returns
    -------
    pandas.DataFrame
        One row per (filename, ridge, stretch) group with the mean original
        values, the columns ``'RMSE Count'``, ``'RMSE Length'`` and
        ``'RMSE Intensity'``, and the percentage columns ``'%RMSE Count'``,
        ``'%RMSE Length'`` and ``'%RMSE Intensity'``. Despite their names the
        ``'RMSE ...'`` columns hold the group's *mean squared* error; the
        square root is only applied when forming the ``'%RMSE ...'`` columns.
        The names are kept unchanged for compatibility. A group whose mean
        original value is 0 produces ``inf`` or ``NaN`` in the percentage
        column.
    """
    # (squared-error column, obtained column, original column) per metric.
    metric_columns = (
        ('RMSE Count', 'Obtained Count', 'Original Count'),
        ('RMSE Length', 'Obtained Median Length', 'Original Median Length'),
        ('RMSE Intensity', 'Obtained Median Intensity', 'Original Median Intensity'),
    )

    # Work on a private copy so the caller's DataFrame keeps its own columns.
    working = data.copy()
    for squared_error_col, obtained_col, original_col in metric_columns:
        working[squared_error_col] = (working[obtained_col] - working[original_col]) ** 2

    # Average the squared errors and the reference values within each group.
    agg_data = working.groupby(['filename', 'ridge', 'stretch']).agg({
        'Original Count': 'mean',
        'Original Median Length': 'mean',
        'Original Median Intensity': 'mean',
        'RMSE Count': 'mean',
        'RMSE Length': 'mean',
        'RMSE Intensity': 'mean'
    }).reset_index()

    # Root of the mean squared error, expressed relative to the mean original value.
    agg_data['%RMSE Count'] = np.sqrt(agg_data['RMSE Count']) / agg_data['Original Count'] * 100
    agg_data['%RMSE Length'] = np.sqrt(agg_data['RMSE Length']) / agg_data['Original Median Length'] * 100
    agg_data['%RMSE Intensity'] = np.sqrt(agg_data['RMSE Intensity']) / agg_data['Original Median Intensity'] * 100

    return agg_data

agg_data = calculate_percent_rmse(data)

# Pivot each %RMSE metric of the aggregated table into a ridge x stretch grid.
heatmap_data_count, heatmap_data_length, heatmap_data_intensity = (
    prepare_heatmap_data(agg_data, metric_col)
    for metric_col in ('%RMSE Count', '%RMSE Length', '%RMSE Intensity')
)

# One row of three panels, one per metric.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

panels = (
    (heatmap_data_count, '%RMSE Error for Count'),
    (heatmap_data_length, '%RMSE Error for Length'),
    (heatmap_data_intensity, '%RMSE Error for Intensity'),
)
for ax, (grid, panel_title) in zip(axes, panels):
    sns.heatmap(grid, ax=ax, cmap="viridis", cbar_kws={'label': '%RMSE Error'})
    ax.set_title(panel_title)
    ax.set_xlabel('Stretch')
    ax.set_ylabel('Ridge')

plt.tight_layout()
plt.show()


# %%

# # Calculate percentage RMSE for each filename type
# percent_rmse = data.groupby('filename').apply(lambda x: pd.Series({
#     '%RMSE Count': 100 * x['RMSE Count'].mean() / x['Original Count'].mean(),
#     '%RMSE Length': 100 * x['RMSE Length'].mean() / x['Original Median Length'].mean(),
#     '%RMSE Intensity': 100 * x['RMSE Intensity'].mean() / x['Original Median Intensity'].mean()
# })).reset_index()

# percent_rmse.head()


# %%
# data

# %%
import plotly.figure_factory as ff

# Example for creating an interactive heatmap with Plotly for the %RMSE Count
# Assume 'heatmap_data_count' is your DataFrame with the error values, indexed by 'ridge' and columns as 'stretch'
# Interactive annotated heatmap (Plotly) of the %RMSE Count grid.
# heatmap_data_count is indexed by ridge with one column per stretch value.
x = heatmap_data_count.columns.tolist()  # stretch values
y = heatmap_data_count.index.tolist()  # ridge values
z = heatmap_data_count.values  # matrix of error values

# Each cell is annotated with its value rounded to two decimals.
fig = ff.create_annotated_heatmap(
    z,
    x=x,
    y=y,
    annotation_text=np.around(z, decimals=2),
    colorscale='Viridis',
)

# Title and axis labels.
fig.update_layout(
    title='%RMSE Error for Count',
    xaxis={'title': 'Stretch'},
    yaxis={'title': 'Ridge'},
)

fig.show()


# %%
import plotly.graph_objects as go

# Assume 'heatmap_data_*' are your DataFrame objects for each RMSE error type
# Value matrices of the three pivoted %RMSE grids.
z_count = heatmap_data_count.values
z_length = heatmap_data_length.values
z_intensity = heatmap_data_intensity.values

# Axis values are taken from the Count grid and reused for all three metrics.
x = heatmap_data_count.columns.tolist()
y = heatmap_data_count.index.tolist()

# The figure starts out showing the Count grid.
fig = go.Figure(data=go.Heatmap(z=z_count, x=x, y=y, colorscale='Viridis'))

# One dropdown entry per metric; selecting it swaps the heatmap's z/x/y data.
metric_buttons = [
    dict(
        args=[{'z': [grid], 'x': [x], 'y': [y]}],
        label=caption,
        method="update"
    )
    for caption, grid in (
        ("Count", z_count),
        ("Length", z_length),
        ("Intensity", z_intensity),
    )
]

# Attach the dropdown above the top-left corner of the plot.
fig.update_layout(
    updatemenus=[
        dict(
            buttons=metric_buttons,
            direction="down",
            pad={"r": 10, "t": 10},
            showactive=True,
            x=0.1,
            xanchor="left",
            y=1.1,
            yanchor="top"
        ),
    ]
)

# Title and axis labels.
fig.update_layout(
    title="RMSE Errors: Count, Length, Intensity",
    xaxis={'title': 'Stretch'},
    yaxis={'title': 'Ridge'},
)

fig.show()


# %%
# Filtering the aggregated data for filenames containing "BB" and "AB" specifically
# Rows of the aggregated table whose filename contains "BB" / "AB".
# Explicit .copy(): find_lowest_combined_error later adds a column to these
# frames, and writing into a boolean-mask slice of agg_data would otherwise
# raise pandas' SettingWithCopyWarning. The copies make data_bb / data_ab
# independent frames that can be extended safely.
data_bb = agg_data[agg_data['filename'].str.contains('BB')].copy()
data_ab = agg_data[agg_data['filename'].str.contains('AB')].copy()

# Ridge x stretch grids of each %RMSE metric for the "BB" group ...
heatmap_data_count_bb = prepare_heatmap_data(data_bb, '%RMSE Count')
heatmap_data_length_bb = prepare_heatmap_data(data_bb, '%RMSE Length')
heatmap_data_intensity_bb = prepare_heatmap_data(data_bb, '%RMSE Intensity')

# ... and for the "AB" group.
heatmap_data_count_ab = prepare_heatmap_data(data_ab, '%RMSE Count')
heatmap_data_length_ab = prepare_heatmap_data(data_ab, '%RMSE Length')
heatmap_data_intensity_ab = prepare_heatmap_data(data_ab, '%RMSE Intensity')

# Plotting heatmaps for "BB" filenames
# 3 x 2 grid: one row per metric, "BB" in the left column and "AB" in the right.
fig, axes = plt.subplots(3, 2, figsize=(12, 18), sharex='col', sharey='row')

# (row, column, pivoted grid, panel title) in drawing order: BB column first, then AB.
grid_panels = (
    (0, 0, heatmap_data_count_bb, 'BB: %RMSE Error for Count'),
    (1, 0, heatmap_data_length_bb, 'BB: %RMSE Error for Length'),
    (2, 0, heatmap_data_intensity_bb, 'BB: %RMSE Error for Intensity'),
    (0, 1, heatmap_data_count_ab, 'AB: %RMSE Error for Count'),
    (1, 1, heatmap_data_length_ab, 'AB: %RMSE Error for Length'),
    (2, 1, heatmap_data_intensity_ab, 'AB: %RMSE Error for Intensity'),
)
for row, col, grid, panel_title in grid_panels:
    ax = axes[row, col]
    sns.heatmap(grid, ax=ax, cmap="viridis", cbar_kws={'label': '%RMSE Error'})
    ax.set_title(panel_title)
    ax.set_xlabel('Stretch')
    ax.set_ylabel('Ridge')

plt.tight_layout()
plt.show()


# %% [markdown]
# ## Plotting HeatMap for BB

# %% [markdown]
# ### Count BB

# %%
# Plotting heatmaps separately for "BB" filenames

# %RMSE Error for Count - BB
# Stand-alone figure: "BB" count error over the ridge x stretch grid.
plt.figure(figsize=(8, 6))
ax = sns.heatmap(heatmap_data_count_bb, cmap="viridis", cbar_kws={'label': '%RMSE Error'})
ax.set_title('BB: %RMSE Error for Count')
ax.set_xlabel('Stretch')
ax.set_ylabel('Ridge')
plt.show()

# %% [markdown]
# ### Length BB

# %%

# %RMSE Error for Length - BB
# Stand-alone figure: "BB" length error over the ridge x stretch grid.
plt.figure(figsize=(8, 6))
ax = sns.heatmap(heatmap_data_length_bb, cmap="viridis", cbar_kws={'label': '%RMSE Error'})
ax.set_title('BB: %RMSE Error for Length')
ax.set_xlabel('Stretch')
ax.set_ylabel('Ridge')
plt.show()

# %% [markdown]
# ### Intensity BB

# %%

# %RMSE Error for Intensity - BB
# Stand-alone figure: "BB" intensity error over the ridge x stretch grid.
plt.figure(figsize=(8, 6))
ax = sns.heatmap(heatmap_data_intensity_bb, cmap="viridis", cbar_kws={'label': '%RMSE Error'})
ax.set_title('BB: %RMSE Error for Intensity')
ax.set_xlabel('Stretch')
ax.set_ylabel('Ridge')
plt.show()

# %% [markdown]
# ## Plotting HeatMap for AB

# %% [markdown]
# ### Count AB

# %%

# Plotting heatmaps separately for "AB" filenames

# %RMSE Error for Count - AB
# Stand-alone figure: "AB" count error over the ridge x stretch grid.
plt.figure(figsize=(8, 6))
ax = sns.heatmap(heatmap_data_count_ab, cmap="viridis", cbar_kws={'label': '%RMSE Error'})
ax.set_title('AB: %RMSE Error for Count')
ax.set_xlabel('Stretch')
ax.set_ylabel('Ridge')
plt.show()


# %% [markdown]
# ### Length AB

# %%

# %RMSE Error for Length - AB
# Stand-alone figure: "AB" length error over the ridge x stretch grid.
plt.figure(figsize=(8, 6))
ax = sns.heatmap(heatmap_data_length_ab, cmap="viridis", cbar_kws={'label': '%RMSE Error'})
ax.set_title('AB: %RMSE Error for Length')
ax.set_xlabel('Stretch')
ax.set_ylabel('Ridge')
plt.show()

# %% [markdown]
# ### Intensity AB

# %%

# %RMSE Error for Intensity - AB
# Stand-alone figure: "AB" intensity error over the ridge x stretch grid.
plt.figure(figsize=(8, 6))
ax = sns.heatmap(heatmap_data_intensity_ab, cmap="viridis", cbar_kws={'label': '%RMSE Error'})
ax.set_title('AB: %RMSE Error for Intensity')
ax.set_xlabel('Stretch')
ax.set_ylabel('Ridge')
plt.show()


# %%
# Function to find the params that minimize the error
def find_min_error_params(data):
    """Locate, separately for each metric, the row with the smallest %RMSE.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``'ridge'``, ``'stretch'``, ``'%RMSE Count'``,
        ``'%RMSE Length'`` and ``'%RMSE Intensity'``.

    Returns
    -------
    dict
        Keys ``'Count'``, ``'Length'`` and ``'Intensity'``. Each value is the
        ``ridge``, ``stretch`` and error entry of the row where that metric's
        %RMSE column is smallest (first such row on ties, per ``idxmin``).
    """
    # (result key, error column) for every metric, in output order.
    metric_columns = (
        ('Count', '%RMSE Count'),
        ('Length', '%RMSE Length'),
        ('Intensity', '%RMSE Intensity'),
    )
    return {
        metric: data.loc[data[error_col].idxmin(), ['ridge', 'stretch', error_col]]
        for metric, error_col in metric_columns
    }

# Find the params for "BB"
# Per-metric minimum-error rows among the "BB" files.
min_error_params_bb = find_min_error_params(data_bb)

# Per-metric minimum-error rows among the "AB" files.
min_error_params_ab = find_min_error_params(data_ab)

# Cell output: both result dicts (keys 'Count', 'Length', 'Intensity').
min_error_params_bb, min_error_params_ab


# %% [markdown]
# For minimizing the %RMSE Error, here are the selected parameter values `(ridge, stretch)` for "BB" and "AB" filenames separately:
# 
# ### "BB" Filenames
# - **Count:**
#   - Ridge: 0.012
#   - Stretch: 0.7
#   - %RMSE Count: 0.0%
# - **Length:**
#   - Ridge: 0.034
#   - Stretch: 0.8
#   - %RMSE Length: 0.053%
# - **Intensity:**
#   - Ridge: 0.012
#   - Stretch: 0.9
#   - %RMSE Intensity: 0.00159%
# 
# ### "AB" Filenames
# - **Count:**
#   - Ridge: 0.01
#   - Stretch: 0.7
#   - %RMSE Count: 0.0%
# - **Length:**
#   - Ridge: 0.044
#   - Stretch: 1.0
#   - %RMSE Length: 0.021%
# - **Intensity:**
#   - Ridge: 0.012
#   - Stretch: 0.5
#   - %RMSE Intensity: 0.0145%
# 
# These parameter values are the per-metric minima of the %RMSE for Count, Length, and Intensity among the files whose names contain "BB" and "AB", respectively. Each metric is minimized independently, so the three optima generally differ from one another. The numbers listed above were copied from an earlier run and should be re-checked against the output of the preceding cell whenever the input data changes.

# %%
# Function to calculate combined error and find the params with the lowest combined error
def find_lowest_combined_error(data):
    """Score each row by its mean %RMSE and return the best-scoring row.

    The combined score is the unweighted mean of ``'%RMSE Count'``,
    ``'%RMSE Length'`` and ``'%RMSE Intensity'``.

    Side effect: the score is written into ``data`` itself as the column
    ``'Combined %RMSE Error'``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``'ridge'``, ``'stretch'`` and the three ``'%RMSE ...'``
        columns.

    Returns
    -------
    pandas.Series
        ``ridge``, ``stretch`` and ``Combined %RMSE Error`` of the row with
        the lowest combined score (first such row on ties, per ``idxmin``).
    """
    metric_columns = ['%RMSE Count', '%RMSE Length', '%RMSE Intensity']
    combined_score = data[metric_columns].mean(axis=1)
    # Stored on the caller's frame so it is available after the call.
    data['Combined %RMSE Error'] = combined_score
    best_label = combined_score.idxmin()
    return data.loc[best_label, ['ridge', 'stretch', 'Combined %RMSE Error']]

# Calculate and find the lowest combined error params for "BB"
# Best combined-error row for "BB". As written, the call also adds the
# 'Combined %RMSE Error' column to data_bb.
lowest_combined_error_bb = find_lowest_combined_error(data_bb)

# Best combined-error row for "AB"; likewise adds the column to data_ab.
lowest_combined_error_ab = find_lowest_combined_error(data_ab)

# Cell output: both result rows.
lowest_combined_error_bb, lowest_combined_error_ab


# %%
# Show the data_ab rows at ridge == 0.014 and stretch == 0.5 (exact float comparison).
# NOTE(review): the summary text in this notebook lists (0.014, 0.5) as the
# "BB" optimum, yet this cell filters data_ab. Confirm which group was intended.
data_ab[(data_ab['ridge']==0.014)&(data_ab['stretch']==0.5)]

# %%
# Preparing heatmap data for the combined %RMSE Error for "BB" and "AB" filenames
# Ridge x stretch grids of the combined score for both groups.
# 'Combined %RMSE Error' must already be present on data_bb / data_ab; in this
# notebook find_lowest_combined_error adds it when called on each frame.
heatmap_data_combined_bb, heatmap_data_combined_ab = (
    prepare_heatmap_data(group_frame, 'Combined %RMSE Error')
    for group_frame in (data_bb, data_ab)
)

# Stand-alone figure: combined error for the "BB" files.
plt.figure(figsize=(8, 6))
ax = sns.heatmap(heatmap_data_combined_bb, cmap="viridis", cbar_kws={'label': 'Combined %RMSE Error'})
ax.set_title('BB: Combined %RMSE Error')
ax.set_xlabel('Stretch')
ax.set_ylabel('Ridge')
plt.show()

# %%

# Plotting heatmap for the combined %RMSE Error - "AB" filenames
# Stand-alone figure: combined error for the "AB" files.
plt.figure(figsize=(8, 6))
ax = sns.heatmap(heatmap_data_combined_ab, cmap="viridis", cbar_kws={'label': 'Combined %RMSE Error'})
ax.set_title('AB: Combined %RMSE Error')
ax.set_xlabel('Stretch')
ax.set_ylabel('Ridge')
plt.show()


# %% [markdown]
# To find a set of parameters that minimizes the error across all three metrics (Count, Length, and Intensity) simultaneously for each filename group ("BB" and "AB"), a heuristic or optimization approach can be used. A common and straightforward heuristic method is to calculate a combined error score for each parameter set and then choose the one with the lowest score. 
# 
# One way to define a combined error score could be to simply average the %RMSE for Count, Length, and Intensity, assuming equal importance of each metric. Alternatively, if certain metrics are more important than others, you could apply weights to each %RMSE before averaging.
# 
# Let's calculate a simple average combined error for each parameter set for both "BB" and "AB" filenames and identify the parameters with the lowest combined error.
# 
# By calculating a simple average of the %RMSE for Count, Length, and Intensity as a combined error score, we've identified the parameter sets that minimize the error across all three metrics for both "BB" and "AB" filenames:
# 
# ### "BB" Filenames
# - Ridge: 0.014
# - Stretch: 0.5
# - Combined %RMSE Error: 3.76%
# 
# ### "AB" Filenames
# - Ridge: 0.018
# - Stretch: 1.0
# - Combined %RMSE Error: 4.10%
# 
# These parameters represent the best compromise across all three metrics for minimizing error in the datasets with filenames containing "BB" and "AB", respectively. This heuristic approach provides a balanced solution, considering the performance across all metrics simultaneously. 

# %%
# data ab
print(f"Ridge AB: {lowest_combined_error_ab['ridge']}, Stretch AB: {lowest_combined_error_ab['stretch']}")
# Rows of data_ab sitting exactly at the best (ridge, stretch) pair found for AB.
ab_at_best = (
    (data_ab['ridge'] == lowest_combined_error_ab['ridge'])
    & (data_ab['stretch'] == lowest_combined_error_ab['stretch'])
)
data_ab[ab_at_best]

# %%
# data bb
print(f"Ridge BB: {lowest_combined_error_bb['ridge']}, Stretch BB: {lowest_combined_error_bb['stretch']}")
# Rows of data_bb sitting exactly at the best (ridge, stretch) pair found for BB.
bb_at_best = (
    (data_bb['ridge'] == lowest_combined_error_bb['ridge'])
    & (data_bb['stretch'] == lowest_combined_error_bb['stretch'])
)
data_bb[bb_at_best]

# %%


# %%



