# Genomic Selection Post-Training Analysis & Visualization
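This notebook analyzes calibrated predicted breeding values produced by the training pipeline. It reads `predicted_probabilities_with_labels.csv`, which is expected to contain an `ID` column, a `true_label` column, a `fold` column, and one prediction column per model (`LR`, `RF`, `GB`, `MLP`).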

## Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr

# Load the predictions table that every later cell works from.
df = pd.read_csv('predicted_probabilities_with_labels.csv')

# Column names of the per-model predictions analysed below.
models = ['LR','RF','GB','MLP']

# Fail fast with a readable message if a column used by the later cells
# ('ID' for rankings/averaging, 'true_label' and 'fold' for the correlation
# analysis, plus one column per model) is absent, instead of hitting a bare
# KeyError halfway through the notebook.
required_columns = ['ID', 'true_label', 'fold', *models]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f'Input CSV is missing required columns: {missing_columns}')

df.head()

## 1. Descriptive Statistics & Rank Comparison

In [None]:
# Descriptive statistics for predicted breeding values (one row per model)
stats = df[models].describe().T
display(stats)

# Top individuals for each model and rank comparison
TOP_N = 100  # how many of the highest-scoring rows to keep per model

# Rank every row once per model (1 = highest predicted value; tied values
# share the smallest rank). Doing this up front avoids re-ranking the whole
# column for every (model, other) pair inside the loop below.
model_ranks = {
    model: df[model].rank(method='min', ascending=False) for model in models
}

# rankings[model] holds that model's top rows with the columns
# ['ID', model, 'Rank_in_<model>', 'Rank_in_<other>' for every other model].
rankings = {}
for model in models:
    top100 = df.nlargest(TOP_N, model)[['ID', model]].copy()
    # Positional rank inside this model's own top list. Sized from the rows
    # actually returned, so a table with fewer than TOP_N rows no longer
    # raises a length-mismatch ValueError.
    top100['Rank_in_'+model] = np.arange(1, len(top100) + 1)
    for other in models:
        if other != model:
            # Look up where the same rows sit in the other model's ranking.
            top100['Rank_in_'+other] = model_ranks[other].loc[top100.index].astype(int)
    rankings[model] = top100

# Example output
rankings['LR'].head()

## 2. Breeding Value Histograms

In [None]:
# Histogram bin edges covering [0, 1] in steps of bin_width.
bin_width = 0.05
bins = np.arange(0, 1.01, bin_width)

# The tick marks drawn over each histogram get a random vertical position.
# A locally seeded generator keeps that jitter (and therefore the figure)
# identical on every Restart & Run All, without touching numpy's global state.
jitter_rng = np.random.default_rng(42)
jitter_max = 50  # ticks are scattered over y in [0, jitter_max)

# One panel per model on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.flatten()

for i, model in enumerate(models):
    ax = axes[i]
    sns.histplot(df[model], bins=bins, ax=ax, color='skyblue', edgecolor='black')
    # Overlay one '|' tick per row at its predicted value, coloured by
    # true_label; the y position is jitter only and carries no meaning.
    ax.scatter(df[model], jitter_rng.uniform(0, jitter_max, size=len(df)),
               c=df['true_label'], cmap='bwr', alpha=0.6, marker='|')
    ax.set_title(f'Histogram of Breeding Values - {model}')
    ax.set_xlabel('Predicted Breeding Value')
    ax.set_ylabel('Count')

plt.tight_layout()
plt.show()

## 3. Correlation & Performance Plot

In [None]:
# Compute correlations per fold
# Split the table by fold once, in order of first appearance, instead of
# re-filtering the whole frame for every (model, fold) pair. groupby also
# skips rows whose fold id is missing, which would otherwise yield an empty
# subset and make pearsonr raise.
fold_subsets = [subset for _, subset in df.groupby('fold', sort=False)]
n_folds = len(fold_subsets)

# One record per model: mean Pearson r between true_label and the model's
# prediction across folds, plus the standard error of that mean.
correlations = []
for model in models:
    fold_corrs = []
    for subset in fold_subsets:
        corr, _ = pearsonr(subset['true_label'], subset[model])
        fold_corrs.append(corr)
    # The sample standard deviation (ddof=1) is undefined for fewer than two
    # folds; report NaN explicitly rather than triggering a numpy warning.
    if len(fold_corrs) > 1:
        standard_error = np.std(fold_corrs, ddof=1) / np.sqrt(len(fold_corrs))
    else:
        standard_error = np.nan
    correlations.append({
        'Model': model,
        'MeanCorr': np.mean(fold_corrs),
        'SE': standard_error
    })

corr_df = pd.DataFrame(correlations)
display(corr_df)

# Bar chart with error bars (bar height = mean correlation, whisker = +/- SE)
plt.figure(figsize=(8, 6))
plt.bar(corr_df['Model'], corr_df['MeanCorr'], yerr=corr_df['SE'], capsize=5, color='lightgreen')
plt.ylabel('Pearson Correlation (True vs Predicted)')
# The fold count in the title comes from the data so it cannot drift from
# what was actually analysed.
plt.title(f'Average Correlation with Survival ({n_folds}-fold CV)')
plt.show()

## 4. Model Correlation Plot

In [None]:
# Collapse the table to one row per ID: the mean of each model's prediction
# over all rows that share that ID.
rows_by_individual = df.groupby('ID')
avg_preds = rows_by_individual[models].mean()

# Model-by-model Pearson correlation of those averaged predictions
# (Pearson is DataFrame.corr's default method).
corr_matrix = avg_preds.corr()

# Annotated heatmap with the colour scale pinned to [0, 1].
# NOTE(review): with vmin=0 any negative correlation would be drawn in the
# lowest colour of the scale -- confirm negatives are not expected here.
heatmap_options = dict(annot=True, cmap='coolwarm', vmin=0, vmax=1)
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Correlation Between Models (Average Breeding Values)')
plt.show()