In [6]:
import pandas as pd
import numpy as np

# ===============================================
# STAGE 1: ELIMINATING NEGATIVE VALUES
# ===============================================

# Location of the raw CoPolDB spreadsheet
address = r'/work/bavarian/hsafari2/Manuscript Code/CoPolDB.xlsx'

# Read the unfiltered dataset into memory
df_original = pd.read_excel(address)

print("STAGE 1: ELIMINATING NEGATIVE VALUES")
print(f"Initial dataset: {len(df_original)} entries")

# Rows in which at least one of the two reactivity ratios is below zero
has_negative_ratio = (df_original['r1'] < 0) | (df_original['r2'] < 0)
negative_entries = len(df_original[has_negative_ratio])
print(f"Entries with negative values: {negative_entries}")

# Keep only rows where both reactivity ratios compare >= 0.
# Rows with a missing r1 or r2 fail this comparison and are dropped as well.
both_non_negative = (df_original['r1'] >= 0) & (df_original['r2'] >= 0)
df_stage1 = df_original.loc[both_non_negative].copy()

# Summarise how much of the dataset this stage discarded
entries_removed = len(df_original) - len(df_stage1)
percentage_removed = (entries_removed / len(df_original)) * 100

print(f"Entries removed: {entries_removed} ({percentage_removed:.2f}%)")
print(f"Remaining entries: {len(df_stage1)}")

# Optional export of the Stage 1 result (disabled by default)
#df_stage1.to_excel('CopolDB_Stage1_NoNegatives.xlsx', index=False)



STAGE 1: ELIMINATING NEGATIVE VALUES
Initial dataset: 2991 entries
Entries with negative values: 296
Entries removed: 296 (9.90%)
Remaining entries: 2695


In [2]:
# ===============================================
# STAGE 2: REMOVING MULTI-VINYL MONOMERS
# ===============================================

from rdkit import Chem

def count_vinyl_groups(smiles):
    """Count non-aromatic carbon-carbon double bonds in a SMILES string.

    Every bond of type DOUBLE whose two end atoms are both carbon and both
    non-aromatic is counted. No check is made that the bond is terminal, so
    internal C=C bonds are counted as well.

    Parameters
    ----------
    smiles : str or missing value
        SMILES representation of the monomer.

    Returns
    -------
    int
        Number of matching double bonds. 0 is also returned when ``smiles``
        is missing (``pd.isna``), when RDKit cannot parse it, or when an
        error is raised while inspecting the molecule, so a return value of
        0 does not distinguish "no such bond" from "could not be analysed".
    """
    if pd.isna(smiles):
        return 0
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # RDKit signals an unparsable SMILES by returning None
            return 0

        return sum(
            1
            for bond in mol.GetBonds()
            if bond.GetBondType() == Chem.BondType.DOUBLE
            and all(
                atom.GetSymbol() == 'C' and not atom.GetIsAromatic()
                for atom in (bond.GetBeginAtom(), bond.GetEndAtom())
            )
        )
    except Exception:
        # Best-effort: a failure on one entry must not abort the whole pass.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return 0

print("STAGE 2: REMOVING MULTI-VINYL MONOMERS")
print(f"Starting entries: {len(df_stage1)} entries")

# Count the qualifying C=C bonds of each monomer, one value per row
vinyl_counts_a = df_stage1['SMILES_A'].map(count_vinyl_groups)
vinyl_counts_b = df_stage1['SMILES_B'].map(count_vinyl_groups)

# A row is kept only when BOTH monomers have exactly one counted bond
single_vinyl_mask = (vinyl_counts_a == 1) & (vinyl_counts_b == 1)

# Everything else is rejected. This includes rows where a count is 0,
# not only rows where a monomer has more than one counted bond.
multi_vinyl_entries = int((~single_vinyl_mask).sum())
print(f"Entries with multi-vinyl monomers: {multi_vinyl_entries}")

# Boolean filtering keeps the column schema even if no row passes
df_stage2 = df_stage1[single_vinyl_mask].reset_index(drop=True)

# Report results (guarded so an empty input does not divide by zero)
entries_removed = len(df_stage1) - len(df_stage2)
percentage_removed = (entries_removed / len(df_stage1)) * 100 if len(df_stage1) > 0 else 0.0

print(f"Entries removed: {entries_removed} ({percentage_removed:.2f}%)")
print(f"Remaining entries: {len(df_stage2)}")

# Optional: Save the filtered dataset
#df_stage2.to_excel('CopolDB_Stage2_SingleVinyl.xlsx', index=False)

STAGE 2: REMOVING MULTI-VINYL MONOMERS
Starting entries: 2695 entries
Entries with multi-vinyl monomers: 330
Entries removed: 330 (12.24%)
Remaining entries: 2365


In [3]:
# ===============================================
# STAGE 3: HANDLING REPEATED MONOMER PAIRS
# ===============================================

def calculate_normalized_variance(values):
    """
    Calculate the normalized variance (σ²/μ²) of a sequence of numbers.

    This is a dimensionless measure of how much the values vary relative to
    their average. For example:
    - If mean = 1.0 and variance = 0.04, normalized variance = 0.04/1.0² = 0.04
    - If mean = 10.0 and variance = 4.0, normalized variance = 4.0/10.0² = 0.04
    Both have the same relative variability despite different absolute scales.

    Parameters
    ----------
    values : sequence of numbers
        The repeated measurements.

    Returns
    -------
    float
        - 0.0 when fewer than two values are supplied (no variance defined).
        - 0.0 when the mean is zero and all values are identical (zero
          variance), since identical measurements show no spread.
        - ``inf`` when the mean is zero but the values differ, because the
          ratio is unbounded.
        - Otherwise the sample variance (ddof=1) divided by the squared mean.
    """
    if len(values) <= 1:
        return 0.0  # Can't calculate variance with a single value

    mean_val = np.mean(values)
    # Sample variance (ddof=1) for unbiased estimation
    variance = np.var(values, ddof=1)

    if mean_val == 0:
        # 0/0 case: identical zero measurements are perfectly consistent,
        # so report no spread instead of flagging them as infinitely noisy.
        if variance == 0:
            return 0.0
        return float('inf')  # Avoid division by zero

    return variance / (mean_val ** 2)

print("STAGE 3: HANDLING REPEATED MONOMER PAIRS")
print(f"Starting entries: {len(df_stage2)} entries")

# Step 1: bucket every row under its ordered (MonomerA, MonomerB) name pair.
# NOTE(review): the key is order-sensitive, so (A, B) and (B, A) end up in
# different buckets — confirm this is the intended notion of "same pair".
print("\nStep 1: Identifying unique monomer pairs...")

# Only these six fields are carried forward into the Stage 3 result
kept_fields = ('MonomerA', 'MonomerB', 'SMILES_A', 'SMILES_B', 'r1', 'r2')

pair_groups = {}
for _, record in df_stage2.iterrows():
    bucket = pair_groups.setdefault((record['MonomerA'], record['MonomerB']), [])
    bucket.append({field: record[field] for field in kept_fields})

total_unique_pairs = len(pair_groups)
print(f"Unique monomer pairs identified: {total_unique_pairs}")

# Step 2: tally how many pairs have 1, 2, 3, ... measurements
print("\nStep 2: Analyzing measurement patterns...")

measurement_distribution = {}
for group in pair_groups.values():
    group_size = len(group)
    measurement_distribution[group_size] = measurement_distribution.get(group_size, 0) + 1

# Every bucket holds at least one row, so "multiple" is simply "not single"
single_measurement_pairs = measurement_distribution.get(1, 0)
multiple_measurement_pairs = total_unique_pairs - single_measurement_pairs

print(f"Pairs with single measurements: {single_measurement_pairs}")
print(f"Pairs with multiple measurements: {multiple_measurement_pairs}")

# Histogram of bucket sizes, smallest size first
print(f"\nMeasurement count distribution:")
for count, n_pairs in sorted(measurement_distribution.items()):
    print(f"  {n_pairs} pairs have {count} measurement(s)")

# Step 3: for repeated pairs, average when consistent, drop when not
print(f"\nStep 3: Applying normalized variance filter (threshold = 0.2)...")

variance_threshold = 0.2
valid_single_pairs = []        # one measurement only: kept unchanged
valid_averaged_pairs = []      # repeated and consistent: replaced by the mean
excluded_high_variance = []    # repeated but inconsistent: dropped, with diagnostics

# Number of pairs (not rows) that went through the variance test
total_multiple_measurements_processed = 0

for pair_key, measurements in pair_groups.items():

    # Nothing to compare against: accept the lone measurement as-is
    if len(measurements) == 1:
        valid_single_pairs.append(measurements[0])
        continue

    total_multiple_measurements_processed += 1

    # Gather the repeated r1 / r2 values reported for this pair
    r1_values = [m['r1'] for m in measurements]
    r2_values = [m['r2'] for m in measurements]

    r1_norm_variance = calculate_normalized_variance(r1_values)
    r2_norm_variance = calculate_normalized_variance(r2_values)

    # Both ratios must be strictly below the threshold for the pair to pass
    within_threshold = (r1_norm_variance < variance_threshold
                        and r2_norm_variance < variance_threshold)

    if within_threshold:
        # First measurement supplies names/SMILES; r1 and r2 become the means
        valid_averaged_pairs.append({
            **measurements[0],
            'r1': np.mean(r1_values),
            'r2': np.mean(r2_values),
        })
    else:
        # Keep a diagnostic record of why the pair was discarded
        excluded_high_variance.append(dict(
            pair=pair_key,
            measurements=len(measurements),
            r1_variance=r1_norm_variance,
            r2_variance=r2_norm_variance,
        ))

# Step 4: single-measurement pairs first, averaged pairs after them
all_reliable_pairs = valid_single_pairs + valid_averaged_pairs
df_stage3 = pd.DataFrame(all_reliable_pairs).reset_index(drop=True)

# Summary figures for reporting
print(f"\nStep 4: Final results summary...")
print(f"Single measurement pairs (kept): {len(valid_single_pairs)}")
print(f"Multiple measurement pairs processed: {total_multiple_measurements_processed}")
print(f"Multiple measurement pairs that passed variance test: {len(valid_averaged_pairs)}")
print(f"Multiple measurement pairs excluded (high variance): {len(excluded_high_variance)}")

# Row-level shrinkage (rows in vs. rows out) and pair-level exclusion rate
total_entries_removed = len(df_stage2) - len(df_stage3)
percentage_removed = (total_entries_removed / len(df_stage2)) * 100
if total_multiple_measurements_processed > 0:
    pairs_excluded_percentage = (len(excluded_high_variance) / total_multiple_measurements_processed) * 100
else:
    pairs_excluded_percentage = 0

print(f"\nOverall Stage 3 impact:")
print(f"Total entries removed: {total_entries_removed} ({percentage_removed:.2f}%)")
print(f"Reliable unique pairs retained: {len(df_stage3)}")
print(f"Percentage of multiple-measurement pairs excluded: {pairs_excluded_percentage:.1f}%")

# Optional export of the Stage 3 result (disabled by default)
#df_stage3.to_excel('CopolDB_Stage3_ReliableUnique.xlsx', index=False)

STAGE 3: HANDLING REPEATED MONOMER PAIRS
Starting entries: 2365 entries

Step 1: Identifying unique monomer pairs...
Unique monomer pairs identified: 1551

Step 2: Analyzing measurement patterns...
Pairs with single measurements: 1175
Pairs with multiple measurements: 376

Measurement count distribution:
  1175 pairs have 1 measurement(s)
  229 pairs have 2 measurement(s)
  66 pairs have 3 measurement(s)
  28 pairs have 4 measurement(s)
  19 pairs have 5 measurement(s)
  10 pairs have 6 measurement(s)
  8 pairs have 7 measurement(s)
  3 pairs have 8 measurement(s)
  2 pairs have 9 measurement(s)
  4 pairs have 10 measurement(s)
  1 pairs have 12 measurement(s)
  2 pairs have 13 measurement(s)
  1 pairs have 14 measurement(s)
  1 pairs have 17 measurement(s)
  1 pairs have 20 measurement(s)
  1 pairs have 40 measurement(s)

Step 3: Applying normalized variance filter (threshold = 0.2)...

Step 4: Final results summary...
Single measurement pairs (kept): 1175
Multiple measurement pairs p

In [4]:
# ===============================================
# STAGE 4: DATA AUGMENTATION USING SYMMETRY
# ===============================================

print("STAGE 4: DATA AUGMENTATION USING SYMMETRY")
print(f"Starting with: {len(df_stage3)} reliable unique pairs")

# Show which columns are available before building the augmented rows
print(f"\nDataFrame columns: {list(df_stage3.columns)}")

# Receives each source row immediately followed by its mirrored copy
augmented_data = []

print("\nProcessing each monomer pair for symmetry augmentation...")
print("Format: Original -> Swapped")
print("-" * 100)

# For every (A, B) row with ratios (r1, r2), also emit (B, A) with (r2, r1).
# NOTE(review): if df_stage3 already holds both (A, B) and (B, A), or a row
# with A == B, the mirrored rows will duplicate existing ones — confirm.
for index, row in df_stage3.iterrows():
    name_a, name_b = row['MonomerA'], row['MonomerB']
    smiles_a, smiles_b = row['SMILES_A'], row['SMILES_B']
    ratio_1, ratio_2 = row['r1'], row['r2']

    # Source row, tagged so it can be told apart from its mirror
    augmented_data.append({
        'MonomerA': name_a,
        'MonomerB': name_b,
        'SMILES_A': smiles_a,
        'SMILES_B': smiles_b,
        'r1': ratio_1,
        'r2': ratio_2,
        'entry_type': 'original',
    })

    # Mirror row: names, SMILES and reactivity ratios all trade places
    augmented_data.append({
        'MonomerA': name_b,
        'MonomerB': name_a,
        'SMILES_A': smiles_b,
        'SMILES_B': smiles_a,
        'r1': ratio_2,
        'r2': ratio_1,
        'entry_type': 'swapped',
    })

    # Echo the first ten transformations so they can be eyeballed
    if index < 10:
        print(f"Row {index+1:3d}:")
        print(f"  Original:")
        print(f"    MonomerA: {name_a:25s} | MonomerB: {name_b:25s}")
        print(f"    SMILES_A: {smiles_a:30s} | SMILES_B: {smiles_b:30s}")
        print(f"    r1: {ratio_1:.4f} | r2: {ratio_2:.4f}")
        print(f"  Swapped:")
        print(f"    MonomerA: {name_b:25s} | MonomerB: {name_a:25s}")
        print(f"    SMILES_A: {smiles_b:30s} | SMILES_B: {smiles_a:30s}")
        print(f"    r1: {ratio_2:.4f} | r2: {ratio_1:.4f}")
        print(f"  {'='*80}")
        print()
    elif index == 10:
        print("  ... (showing first 10 transformations, continuing processing)")
        print()

# Assemble the augmented table
df_stage4 = pd.DataFrame(augmented_data).reset_index(drop=True)

# Size bookkeeping
original_count = len(df_stage3)
augmented_count = len(df_stage4)
augmentation_factor = augmented_count / original_count

print(f"\nStep 4: Augmentation Results Summary")
print(f"Original reliable pairs: {original_count}")
print(f"After symmetry augmentation: {augmented_count}")
print(f"Augmentation factor: {augmentation_factor:.1f}x")
print(f"New entries added: {augmented_count - original_count}")

print(f"\nStep 5: Verification checks...")

# Check 1: one mirror per source row means exactly twice as many rows
assert augmented_count == 2 * original_count, "Augmentation should exactly double the dataset!"
print(f"✓ Dataset size verification: {augmented_count} = 2 × {original_count}")

# Check 2: how many rows carry each entry_type tag
entry_type_counts = df_stage4['entry_type'].value_counts()
print(f"✓ Entry type distribution:")
for entry_type, count in entry_type_counts.items():
    print(f"  {entry_type}: {count} entries")

# Check 3: count missing cells across the six data columns
required_columns = ['MonomerA', 'MonomerB', 'SMILES_A', 'SMILES_B', 'r1', 'r2']
missing_data = df_stage4[required_columns].isnull().sum().sum()
print(f"✓ Data integrity check: {missing_data} missing values (should be 0)")

# Check 4: spot-check the first three source/mirror pairs field by field
print(f"\nStep 6: Detailed symmetry verification (first 3 pairs):")
for pair_number, start in enumerate(range(0, min(6, len(df_stage4)), 2), start=1):
    source_row = df_stage4.iloc[start]
    mirror_row = df_stage4.iloc[start + 1]

    print(f"\nPair {pair_number} Verification:")
    print(f"  Original entry:")
    print(f"    MonomerA: '{source_row['MonomerA']}' | MonomerB: '{source_row['MonomerB']}'")
    print(f"    SMILES_A: '{source_row['SMILES_A']}' | SMILES_B: '{source_row['SMILES_B']}'")
    print(f"    r1: {source_row['r1']:.4f} | r2: {source_row['r2']:.4f}")

    print(f"  Swapped entry:")
    print(f"    MonomerA: '{mirror_row['MonomerA']}' | MonomerB: '{mirror_row['MonomerB']}'")
    print(f"    SMILES_A: '{mirror_row['SMILES_A']}' | SMILES_B: '{mirror_row['SMILES_B']}'")
    print(f"    r1: {mirror_row['r1']:.4f} | r2: {mirror_row['r2']:.4f}")

    # Each field of the mirror must equal the opposite field of the source
    name_swap_check = (source_row['MonomerA'] == mirror_row['MonomerB']
                       and source_row['MonomerB'] == mirror_row['MonomerA'])
    smiles_swap_check = (source_row['SMILES_A'] == mirror_row['SMILES_B']
                         and source_row['SMILES_B'] == mirror_row['SMILES_A'])
    r_swap_check = (abs(source_row['r1'] - mirror_row['r2']) < 1e-10
                    and abs(source_row['r2'] - mirror_row['r1']) < 1e-10)

    print(f"  Verification results:")
    print(f"    Names swapped correctly: {name_swap_check}")
    print(f"    SMILES swapped correctly: {smiles_swap_check}")
    print(f"    Reactivity ratios swapped correctly: {r_swap_check}")
    print(f"    Overall symmetry verified: {name_swap_check and smiles_swap_check and r_swap_check}")

# The bookkeeping tag is not part of the dataset handed to later stages
df_stage4_final = df_stage4.drop(columns='entry_type')

print(f"\n{'='*80}")
print(f"STAGE 4 COMPLETE!")
print(f"{'='*80}")
print(f"Final augmented dataset: {len(df_stage4_final)} entries")
print(f"All 6 columns properly swapped:")
print(f"  - MonomerA ↔ MonomerB")
print(f"  - SMILES_A ↔ SMILES_B")
print(f"  - r1 ↔ r2")
print(f"Ready for feature extraction and modeling stages.")

# Optional export of the Stage 4 result (disabled by default)
#df_stage4_final.to_excel('CopolDB_Stage4_Augmented.xlsx', index=False)
#print(f"Augmented dataset saved to: CopolDB_Stage4_Augmented.xlsx")

STAGE 4: DATA AUGMENTATION USING SYMMETRY
Starting with: 1344 reliable unique pairs

DataFrame columns: ['MonomerA', 'MonomerB', 'SMILES_A', 'SMILES_B', 'r1', 'r2']

Processing each monomer pair for symmetry augmentation...
Format: Original -> Swapped
----------------------------------------------------------------------------------------------------
Row   1:
  Original:
    MonomerA: Methacrylic acid          | MonomerB: Vinylidene chloride      
    SMILES_A: CC(=C)C(=O)O                   | SMILES_B: C=C(Cl)Cl                     
    r1: 3.3680 | r2: 0.1540
  Swapped:
    MonomerA: Vinylidene chloride       | MonomerB: Methacrylic acid         
    SMILES_A: C=C(Cl)Cl                      | SMILES_B: CC(=C)C(=O)O                  
    r1: 0.1540 | r2: 3.3680

Row   2:
  Original:
    MonomerA: Methacrylic acid          | MonomerB: 2,3-Dichloro-1-propene   
    SMILES_A: CC(=C)C(=O)O                   | SMILES_B: C=C(CCl)Cl                    
    r1: 4.0000 | r2: 0.0000
  Swapped:


In [5]:
# ===============================================
# STAGE 5: REACTIVITY RATIO RANGE FILTERING
# ===============================================

print("STAGE 5: REACTIVITY RATIO RANGE FILTERING")
print(f"Starting with: {len(df_stage4_final)} augmented entries")

# Inclusive bounds that both r1 and r2 must satisfy to be kept
min_threshold = 0.01
max_threshold = 10.0

print(f"\nFiltering criteria:")
print(f"  Minimum reactivity ratio threshold: {min_threshold}")
print(f"  Maximum reactivity ratio threshold: {max_threshold}")
print(f"  Both r1 and r2 must be within [{min_threshold}, {max_threshold}]")

# Step 1: Analyze current data distribution
print(f"\nStep 1: Analyzing current reactivity ratio distribution...")

r1_stats = df_stage4_final['r1'].describe()
r2_stats = df_stage4_final['r2'].describe()

print(f"r1 statistics:")
print(f"  Min: {r1_stats['min']:.6f}, Max: {r1_stats['max']:.6f}")
print(f"  Mean: {r1_stats['mean']:.4f}, Median: {r1_stats['50%']:.4f}")

print(f"r2 statistics:")
print(f"  Min: {r2_stats['min']:.6f}, Max: {r2_stats['max']:.6f}")
print(f"  Mean: {r2_stats['mean']:.4f}, Median: {r2_stats['50%']:.4f}")

# Step 2: Identify problematic entries
print(f"\nStep 2: Identifying entries outside acceptable range...")

# Per-column, per-bound violation counts
r1_too_low = (df_stage4_final['r1'] < min_threshold).sum()
r1_too_high = (df_stage4_final['r1'] > max_threshold).sum()
r2_too_low = (df_stage4_final['r2'] < min_threshold).sum()
r2_too_high = (df_stage4_final['r2'] > max_threshold).sum()

print(f"Problematic entries analysis:")
print(f"  r1 < {min_threshold}: {r1_too_low} entries")
print(f"  r1 > {max_threshold}: {r1_too_high} entries")
print(f"  r2 < {min_threshold}: {r2_too_low} entries")
print(f"  r2 > {max_threshold}: {r2_too_high} entries")

# Rows where EITHER r1 OR r2 is outside the range
extreme_mask = ((df_stage4_final['r1'] < min_threshold) |
                (df_stage4_final['r1'] > max_threshold) |
                (df_stage4_final['r2'] < min_threshold) |
                (df_stage4_final['r2'] > max_threshold))

extreme_entries = extreme_mask.sum()
print(f"  Total entries with extreme values: {extreme_entries}")

# Step 3: Apply filtering
print(f"\nStep 3: Applying range filtering...")

# Keep rows where BOTH r1 AND r2 are within the inclusive range
valid_mask = ((df_stage4_final['r1'] >= min_threshold) &
              (df_stage4_final['r1'] <= max_threshold) &
              (df_stage4_final['r2'] >= min_threshold) &
              (df_stage4_final['r2'] <= max_threshold))

df_stage5 = df_stage4_final[valid_mask].reset_index(drop=True)

# Step 4: Report filtering results
entries_before = len(df_stage4_final)
entries_after = len(df_stage5)
entries_removed = entries_before - entries_after
# Guarded so an empty input does not raise ZeroDivisionError
removal_percentage = (entries_removed / entries_before) * 100 if entries_before > 0 else 0.0

print(f"\nStep 4: Filtering results summary...")
print(f"Entries before filtering: {entries_before}")
print(f"Entries after filtering: {entries_after}")
print(f"Entries removed: {entries_removed} ({removal_percentage:.2f}%)")

# Step 5: Verify logarithmic transformation feasibility
print(f"\nStep 5: Verification for logarithmic transformation...")

min_r1 = df_stage5['r1'].min()
min_r2 = df_stage5['r2'].min()
max_r1 = df_stage5['r1'].max()
max_r2 = df_stage5['r2'].max()

print(f"Final reactivity ratio ranges:")
print(f"  r1: [{min_r1:.4f}, {max_r1:.4f}]")
print(f"  r2: [{min_r2:.4f}, {max_r2:.4f}]")

# np.log does not raise on zero or negative input (it yields -inf / nan with
# a warning), so positivity is checked explicitly; otherwise this test could
# never report a failure.
try:
    if not (df_stage5[['r1', 'r2']] > 0).all().all():
        raise ValueError("non-positive or missing reactivity ratios remain after filtering")
    log_r1_test = np.log(df_stage5['r1'])
    log_r2_test = np.log(df_stage5['r2'])
    print(f"✓ Logarithmic transformation feasible")
    print(f"  log(r1) range: [{log_r1_test.min():.4f}, {log_r1_test.max():.4f}]")
    print(f"  log(r2) range: [{log_r2_test.min():.4f}, {log_r2_test.max():.4f}]")
except Exception as e:
    print(f"✗ Logarithmic transformation failed: {e}")

# Step 6: Final dataset statistics
print(f"\nStep 6: Final dataset characteristics...")
print(f"Final dataset size: {len(df_stage5)} entries")
print(f"Reactivity ratio distribution after filtering:")

final_r1_stats = df_stage5['r1'].describe()
final_r2_stats = df_stage5['r2'].describe()

print(f"  r1 - Mean: {final_r1_stats['mean']:.4f}, Std: {final_r1_stats['std']:.4f}")
print(f"  r2 - Mean: {final_r2_stats['mean']:.4f}, Std: {final_r2_stats['std']:.4f}")

print(f"\n{'='*80}")
print(f"STAGE 5 COMPLETE!")
print(f"{'='*80}")
print(f"Final curated dataset ready for feature extraction:")
print(f"  - {len(df_stage5)} high-quality entries")
print(f"  - Reactivity ratios in practical range [{min_threshold}, {max_threshold}]")
print(f"  - Compatible with logarithmic transformation")
print(f"  - Suitable for machine learning model training")

# Save the final curated dataset. The confirmation message is built from the
# same path that is written, so the two cannot disagree.
output_path = 'Dataset.xlsx'
df_stage5.to_excel(output_path, index=False)
print(f"Final dataset saved to: {output_path}")

STAGE 5: REACTIVITY RATIO RANGE FILTERING
Starting with: 2688 augmented entries

Filtering criteria:
  Minimum reactivity ratio threshold: 0.01
  Maximum reactivity ratio threshold: 10.0
  Both r1 and r2 must be within [0.01, 10.0]

Step 1: Analyzing current reactivity ratio distribution...
r1 statistics:
  Min: 0.000000, Max: 830.000000
  Mean: 2.1310, Median: 0.4700
r2 statistics:
  Min: 0.000000, Max: 830.000000
  Mean: 2.1310, Median: 0.4700

Step 2: Identifying entries outside acceptable range...
Problematic entries analysis:
  r1 < 0.01: 124 entries
  r1 > 10.0: 92 entries
  r2 < 0.01: 124 entries
  r2 > 10.0: 92 entries
  Total entries with extreme values: 384

Step 3: Applying range filtering...

Step 4: Filtering results summary...
Entries before filtering: 2688
Entries after filtering: 2304
Entries removed: 384 (14.29%)

Step 5: Verification for logarithmic transformation...
Final reactivity ratio ranges:
  r1: [0.0100, 10.0000]
  r2: [0.0100, 10.0000]
✓ Logarithmic transform