# Generating validation rocks according to expert specifications

For my validation data, I generate 10 rocks for each rock type, with the mineral composition of each rock proposed by a geologist.

The code pipeline:
1. Loads individual spectra from files
2. Uses a predefined list of specified minerals for each rock
3. Combines all the spectra into a NumPy array
4. Saves each rock's spectra as a separate NPY file
5. Prints a summary of the mineral composition for each rock

specified_minerals = ['Quartz', 'Annite', 'Eastonite', 'Phlogopite', 'Muscovite',
 'Margarite', 'Anorthite', 'Orthoclase', 'Sanidine', 'Albite']

In [None]:
import numpy as np
import os
from pathlib import Path
import random
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy import stats, interpolate

def read_spectrum(file_path):
    """
    Parse a comma-separated spectrum file into two NumPy arrays.

    Every line is expected to hold exactly two numeric fields in the form
    ``wavelength,intensity``. Lines that do not parse that way (wrong number
    of fields or non-numeric text) are silently skipped.

    Returns a ``(wavelength, intensity)`` tuple of arrays. If the file
    cannot be read, the error is printed and two empty arrays are returned.
    """
    xs = []
    ys = []
    try:
        with open(file_path, 'r') as handle:
            for raw_line in handle:
                fields = raw_line.strip().split(',')
                try:
                    # Unpacking enforces "exactly two fields"; float() enforces
                    # "numeric". Either failure surfaces as ValueError.
                    x_val, y_val = (float(field) for field in fields)
                except ValueError:
                    continue
                xs.append(x_val)
                ys.append(y_val)
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return np.array([]), np.array([])
    return np.array(xs), np.array(ys)

def interpolate_spectrum(wavelength, intensity, common_wavelengths):
    """
    Resample a spectrum onto ``common_wavelengths`` by linear interpolation.

    Points of ``common_wavelengths`` that fall outside the measured
    wavelength range are filled with 0. Returns ``None`` when fewer than two
    samples are available or when the interpolation itself fails (the error
    is printed in that case).
    """
    # A linear interpolant needs at least two support points.
    if len(wavelength) < 2:
        return None

    try:
        interpolator = interpolate.interp1d(
            wavelength,
            intensity,
            kind='linear',
            bounds_error=False,
            fill_value=0,
        )
        resampled = interpolator(common_wavelengths)
    except Exception as e:
        print(f"Error during interpolation: {e}")
        return None
    return resampled

def find_common_wavelength_range(spectra_dict, num_points=1000):
    """
    Build a uniform wavelength grid spanning every supplied spectrum.

    The grid covers the *union* of the individual wavelength ranges: it runs
    from the smallest to the largest wavelength seen in any spectrum, not
    just the region where all spectra overlap.

    Args:
        spectra_dict: Despite the name, an iterable of
            ``(wavelengths, intensities)`` pairs. Only the wavelengths are
            inspected; pairs with an empty wavelength array are ignored.
        num_points: Number of evenly spaced grid points to generate.
            Defaults to 1000.

    Returns:
        A 1-D array of ``num_points`` evenly spaced wavelengths, or ``None``
        when no spectrum contributed any wavelength.
    """
    min_wavelength = float('inf')
    max_wavelength = float('-inf')

    # Track the overall extremes across all non-empty spectra.
    for wavelengths, _ in spectra_dict:
        if len(wavelengths) > 0:
            min_wavelength = min(min_wavelength, np.min(wavelengths))
            max_wavelength = max(max_wavelength, np.max(wavelengths))

    # Sentinels untouched means nothing usable was supplied.
    if min_wavelength == float('inf') or max_wavelength == float('-inf'):
        return None

    return np.linspace(min_wavelength, max_wavelength, num_points)

def generate_synthetic_sample_pca(real_data, n_samples=1, min_components=2):
    """
    Draw synthetic spectra from a PCA + per-component KDE model.

    ``real_data`` (a 2-D array, one spectrum per row) is standardised and
    projected with PCA. A separate 1-D Gaussian KDE is fitted to each
    principal component and ``n_samples`` values are drawn from it. The
    draws are mapped back through the PCA and the scaler, and negative
    intensities are clipped to 0.

    The number of components is ``min(rows - 1, 10)``, raised to
    ``min_components`` if it would otherwise be smaller.
    """
    n_rows, _ = real_data.shape
    n_components = max(min(n_rows - 1, 10), min_components)

    scaler = StandardScaler()
    reducer = PCA(n_components=n_components)
    projected = reducer.fit_transform(scaler.fit_transform(real_data))

    # One independent KDE per component; each resample() call yields a
    # (1, n_samples) array, so row 0 holds the draws for that component.
    per_component_draws = [
        stats.gaussian_kde(projected[:, axis]).resample(n_samples)[0]
        for axis in range(n_components)
    ]
    draws = np.array(per_component_draws).T  # -> (n_samples, n_components)

    restored = scaler.inverse_transform(reducer.inverse_transform(draws))
    return np.maximum(restored, 0)

def generate_synthetic_sample_small(real_data, n_samples=1, n_segments=10, noise_level=0.05):
    """
    Generate synthetic spectra by adding segment-wise Gaussian noise.

    Only the first row of ``real_data`` is used as the template. The
    template is cut into roughly ``n_segments`` consecutive segments and
    each segment receives zero-mean Gaussian noise whose standard deviation
    is ``noise_level`` times the segment's own standard deviation. For a
    flat segment the magnitude of its mean is used instead. Negative
    results are clipped to 0.

    Args:
        real_data: Sequence/array of spectra; ``real_data[0]`` is the template.
        n_samples: Number of synthetic spectra to produce.
        n_segments: Target number of segments the template is divided into.
        noise_level: Noise standard deviation as a fraction of the segment
            spread.

    Returns:
        Array with one synthetic spectrum per row.

    Raises:
        ValueError: If ``n_segments`` is smaller than 1.
    """
    if n_segments < 1:
        raise ValueError("n_segments must be at least 1")

    # Work in float so the noise is not truncated for integer input.
    base_sample = np.asarray(real_data[0], dtype=float)
    # Spectra shorter than n_segments would give a step of 0, which range()
    # rejects; fall back to one point per segment.
    segment_size = max(1, len(base_sample) // n_segments)

    synthetic_samples = []
    for _ in range(n_samples):
        synthetic = np.zeros_like(base_sample)

        for start in range(0, len(base_sample), segment_size):
            segment = base_sample[start:start + segment_size]
            spread = np.std(segment)
            # abs() keeps the scale non-negative for flat segments with a
            # negative mean; np.random.normal rejects a negative scale.
            reference = spread if spread > 0 else abs(np.mean(segment))
            noise_scale = noise_level * reference
            synthetic[start:start + segment_size] = (
                segment + np.random.normal(0, noise_scale, len(segment))
            )

        synthetic_samples.append(np.maximum(synthetic, 0))

    return np.array(synthetic_samples)

def generate_synthetic_samples(real_data, n_samples, min_samples_for_pca=3):
    """
    Pick a generator based on how many real spectra are available.

    With at least ``min_samples_for_pca`` rows the PCA-based generator is
    used; with fewer rows the small-dataset noise generator is used instead.
    """
    enough_for_pca = len(real_data) >= min_samples_for_pca
    generator = (
        generate_synthetic_sample_pca
        if enough_for_pca
        else generate_synthetic_sample_small
    )
    return generator(real_data, n_samples)

def load_and_preprocess_spectra(folder_path, specified_minerals):
    """
    Index the usable spectrum files per mineral and derive a shared grid.

    Scans ``folder_path`` for ``.txt`` files whose name starts with
    ``<mineral>__`` for one of ``specified_minerals``. Files that yield at
    least one data point are recorded under their mineral.

    Returns ``(common_wavelengths, spectra_by_mineral)`` where the second
    item maps each requested mineral to a list of matching filenames
    (possibly empty). Raises ``ValueError`` if no wavelength grid could be
    determined from the loaded spectra.
    """
    spectra_by_mineral = {mineral: [] for mineral in specified_minerals}
    loaded_spectra = []

    for entry in os.listdir(folder_path):
        if not entry.endswith('.txt'):
            continue

        # The mineral name is everything before the first double underscore.
        mineral_name = entry.split('__')[0]
        if mineral_name not in specified_minerals:
            continue

        wavelength, intensity = read_spectrum(Path(folder_path) / entry)
        if len(wavelength) == 0:
            continue

        loaded_spectra.append((wavelength, intensity))
        spectra_by_mineral[mineral_name].append(entry)

    common_wavelengths = find_common_wavelength_range(loaded_spectra)
    if common_wavelengths is None:
        raise ValueError("Could not determine common wavelength range")

    return common_wavelengths, spectra_by_mineral

def load_mineral_spectrum(filename, folder_path, common_wavelengths, generate_synthetic=True):
    """
    Read one spectrum file and resample it onto ``common_wavelengths``.

    When ``generate_synthetic`` is true the resampled spectrum is passed
    through ``generate_synthetic_samples`` as a single-row dataset and the
    one resulting synthetic spectrum is returned; otherwise the resampled
    spectrum itself is returned. Returns ``None`` if the file has no data
    or the interpolation fails.
    """
    wavelength, intensity = read_spectrum(Path(folder_path) / filename)
    if len(wavelength) == 0:
        return None

    resampled = interpolate_spectrum(wavelength, intensity, common_wavelengths)
    if resampled is None:
        return None

    if not generate_synthetic:
        return resampled

    # The generators expect a 2-D (samples, features) array.
    single_row = resampled.reshape(1, -1)
    return generate_synthetic_samples(single_row, 1)[0]

def _format_composition_rows(mineral_composition, spectrum_files):
    """Return one 'row<TAB>mineral<TAB>filename' string per spectrum, in order."""
    return [
        f"{idx}\t{mineral}\t{filename}"
        for idx, (mineral, filename) in enumerate(zip(mineral_composition, spectrum_files))
    ]


def generate_validation_rocks(
    selected_minerals,
    spectra_dir,
    num_rocks=10,
    spectra_per_rock=10,
    specified_mineral_count=8,
    rruff_dir=None,
    output_dir="validation_rocks",
    generate_synthetic=True
):
    """
    Generate validation rocks from expert-specified mineral lists.

    For every entry of ``selected_minerals`` one spectrum is picked at
    random for each listed mineral. If fewer than ``spectra_per_rock``
    spectra were obtained, the rock is padded with randomly chosen spectra
    from ``rruff_dir`` (labelled ``RRUFF_<mineral>``). Each rock is written
    to ``output_dir`` as ``rock_XX_wavelengths.npy``,
    ``rock_XX_intensities.npy`` and ``rock_XX_composition.txt``, where
    ``XX`` is the zero-padded ``rock index + 1``, and a summary is printed.

    Args:
        selected_minerals: Mapping of integer rock index to the list of
            mineral names for that rock (repeats allowed).
        spectra_dir: Directory holding the ``<mineral>__*.txt`` spectra.
        num_rocks: Currently unused; kept for backward compatibility. The
            number of rocks is the number of entries in ``selected_minerals``.
        spectra_per_rock: Target number of spectra per rock.
        specified_mineral_count: Currently unused; kept for backward
            compatibility.
        rruff_dir: Directory used for padding spectra; defaults to
            ``spectra_dir``.
        output_dir: Destination directory, created (with parents) if missing.
        generate_synthetic: Passed through to ``load_mineral_spectrum``.
    """
    if rruff_dir is None:
        rruff_dir = spectra_dir

    output_path = Path(output_dir)
    # parents=True so a nested output directory does not fail.
    output_path.mkdir(parents=True, exist_ok=True)

    # Load and preprocess all spectra
    print("Loading and preprocessing spectra...")
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # warnings below come out in the same order on every run.
    all_minerals = list(dict.fromkeys(
        mineral for sequence in selected_minerals.values() for mineral in sequence
    ))
    common_wavelengths, spectra_by_mineral = load_and_preprocess_spectra(spectra_dir, all_minerals)

    # Verify available spectra
    for mineral, spectra in spectra_by_mineral.items():
        if not spectra:
            print(f"Warning: No spectra found for {mineral}")

    # List the padding candidates once, before anything is written. The
    # listing does not change per rock, and doing it up front keeps the
    # generated *_composition.txt files out of the candidate pool when
    # output_dir and rruff_dir are the same directory.
    rruff_files = [f for f in os.listdir(rruff_dir) if f.endswith('.txt')]
    if not rruff_files:
        print(f"Warning: No .txt spectra found in {rruff_dir}; rocks cannot be padded")

    for rock_idx, minerals in selected_minerals.items():
        rock_spectra = []
        mineral_composition = []
        spectrum_files = []  # Filenames, parallel to rock_spectra

        # Expert-specified minerals first, in the order given.
        for mineral in minerals:
            candidates = spectra_by_mineral[mineral]
            if not candidates:
                continue

            spectrum_file = random.choice(candidates)
            spectrum = load_mineral_spectrum(
                spectrum_file,
                spectra_dir,
                common_wavelengths,
                generate_synthetic=generate_synthetic
            )
            if spectrum is not None:
                rock_spectra.append(spectrum)
                mineral_composition.append(mineral)
                spectrum_files.append(spectrum_file)

        # Pad with random RRUFF spectra; skipped when there is nothing to
        # choose from (random.choice would raise on an empty list).
        rruff_count = spectra_per_rock - len(rock_spectra) if rruff_files else 0
        for _ in range(rruff_count):
            spectrum_file = random.choice(rruff_files)
            mineral_name = spectrum_file.split('__')[0]

            spectrum = load_mineral_spectrum(
                spectrum_file,
                rruff_dir,
                common_wavelengths,
                generate_synthetic=generate_synthetic
            )
            if spectrum is not None:
                rock_spectra.append(spectrum)
                mineral_composition.append(f"RRUFF_{mineral_name}")
                spectrum_files.append(spectrum_file)

        if not rock_spectra:
            print(f"\nWarning: Could not generate Rock {rock_idx+1} - no valid spectra")
            continue

        # Save spectra data
        output_base = output_path / f"rock_{rock_idx+1:02d}"
        np.save(f"{output_base}_wavelengths.npy", common_wavelengths)
        np.save(f"{output_base}_intensities.npy", np.array(rock_spectra))

        # Row i of the table describes row i of the intensities array.
        rows = _format_composition_rows(mineral_composition, spectrum_files)
        with open(f"{output_base}_composition.txt", 'w', encoding="utf-8") as f:
            f.write("Row\tMineral\tFilename\n")
            f.writelines(f"{row}\n" for row in rows)

        # Print summary
        print(f"\nRock {rock_idx+1} Composition:")
        print("-" * 50)
        print("Row\tMineral\tFilename")
        print("-" * 50)
        for row in rows:
            print(row)
        print("-" * 50)

        # Also print mineral counts for overview
        print("\nMineral Counts:")
        for mineral in sorted(set(mineral_composition)):
            count = mineral_composition.count(mineral)
            print(f"{mineral}: {count} spectra")

In [10]:
if __name__ == "__main__":

    # The same directory serves both as the source of the expert-specified
    # mineral spectra and as the pool used for padding.
    rruff_unoriented_dir = "/home/iyeszin/Desktop/my git/minerals-rocks-relationship/all about rruff/excellent_unoriented"

    # Expert-proposed mineral lists, keyed by 0-based rock index
    # (see rock-samples.csv). Ten rocks per rock type.
    selected_minerals = {
        # Rocks 0-9: Granite
        0: ['Albite', 'Anorthite', 'Quartz', 'Quartz', 'Annite', 'Muscovite', 'Quartz', 'Albite', 'Annite', 'Orthoclase'],
        1: ['Albite', 'Quartz', 'Annite', 'Annite', 'Muscovite', 'Orthoclase', 'Orthoclase', 'Quartz', 'Quartz', 'Anorthite'],
        2: ['Margarite', 'Quartz', 'Quartz', 'Margarite', 'Orthoclase', 'Margarite', 'Anorthite', 'Quartz', 'Annite', 'Quartz'],
        3: ['Orthoclase', 'Muscovite', 'Annite', 'Orthoclase', 'Albite', 'Anorthite', 'Muscovite', 'Annite', 'Quartz', 'Phlogopite'],
        4: ['Annite', 'Phlogopite', 'Sanidine', 'Albite', 'Sanidine', 'Quartz', 'Annite', 'Sanidine', 'Sanidine', 'Sanidine'],
        5: ['Annite', 'Quartz', 'Quartz', 'Annite', 'Albite', 'Anorthite', 'Orthoclase', 'Albite', 'Annite', 'Muscovite'],
        6: ['Quartz', 'Annite', 'Muscovite', 'Quartz', 'Anorthite', 'Orthoclase', 'Quartz', 'Albite', 'Quartz', 'Annite'],
        7: ['Orthoclase', 'Eastonite', 'Anorthite', 'Albite', 'Eastonite', 'Phlogopite', 'Quartz', 'Quartz', 'Eastonite', 'Eastonite'],
        8: ['Anorthite', 'Quartz', 'Staurolite', 'Garnet', 'Orthoclase', 'Phlogopite', 'Muscovite', 'Albite', 'Quartz', 'Annite'],
        9: ['Garnet', 'Annite', 'Quartz', 'Anorthite', 'Albite', 'Orthoclase', 'Muscovite', 'Garnet', 'Quartz', 'Annite'],

        # Rocks 10-19: Sandstone
        10: ['Quartz', 'Calcite', 'Orthoclase', 'Albite', 'Anorthite', 'Pyrite', 'Annite', 'Phlogopite', 'Muscovite', 'Tourmaline'],
        11: ['Albite', 'Orthoclase', 'Anorthite', 'Tourmaline', 'Rutile', 'Muscovite', 'Quartz', 'Calcite', 'Annite', 'Anorthite'],
        12: ['Sanidine', 'Albite', 'Sanidine', 'Quartz', 'Calcite', 'Quartz', 'Albite', 'Anorthite', 'Margarite', 'Annite'],
        13: ['Pyrite', 'Orthoclase', 'Quartz', 'Calcite', 'Quartz', 'Albite', 'Anorthite', 'Tourmaline', 'Rutile', 'Annite'],
        14: ['Albite', 'Anorthite', 'Anorthite', 'Phlogopite', 'Muscovite', 'Annite', 'Quartz', 'Orthoclase', 'Quartz', 'Quartz'],
        15: ['Orthoclase', 'Albite', 'Anorthite', 'Annite', 'Muscovite', 'Quartz', 'Quartz', 'Quartz', 'Tourmaline', 'Annite'],
        16: ['Quartz', 'Anorthite', 'Orthoclase', 'Orthoclase', 'Phlogopite', 'Epidote', 'Quartz', 'Omphacite', 'Quartz', 'Annite'],
        17: ['Albite', 'Phlogopite', 'Orthoclase', 'Albite', 'Muscovite', 'Annite', 'Calcite', 'Quartz', 'Albite', 'Pyrite'],
        18: ['Eastonite', 'Albite', 'Quartz', 'Quartz', 'Albite', 'Eastonite', 'Muscovite', 'Quartz', 'Calcite', 'Annite'],
        19: ['Quartz', 'Muscovite', 'Tourmaline', 'Quartz', 'Calcite', 'Albite', 'Tourmaline', 'Annite', 'Muscovite', 'Orthoclase'],

        # Rocks 20-29: Limestone
        20: ['Calcite', 'Quartz', 'Calcite', 'Calcite', 'Calcite', 'Anorthite', 'Albite', 'Calcite', 'Calcite', 'Calcite'],
        21: ['Andalusite', 'Calcite', 'Calcite', 'Albite', 'Orthoclase', 'Calcite', 'Calcite', 'Kyanite', 'Pyrite', 'Calcite'],
        22: ['Quartz', 'Calcite', 'Calcite', 'Calcite', 'Calcite', 'Calcite', 'Calcite', 'Quartz', 'Calcite', 'Calcite'],
        23: ['Dolomite', 'Calcite', 'Calcite', 'Epidote', 'Rhodochrosite', 'Calcite', 'Calcite', 'Calcite', 'Calcite', 'Calcite'],
        24: ['Calcite', 'Calcite', 'Calcite', 'Calcite', 'Calcite', 'Calcite', 'Calcite', 'Calcite', 'Calcite', 'Calcite'],
        25: ['Quartz', 'Calcite', 'Quartz', 'Albite', 'Calcite', 'Pyrite', 'Quartz', 'Calcite', 'Calcite', 'Calcite'],
        26: ['Calcite', 'Quartz', 'Calcite', 'Calcite', 'Calcite', 'Calcite', 'Calcite', 'Quartz', 'Calcite', 'Quartz'],
        27: ['Sanidine', 'Quartz', 'Sanidine', 'Calcite', 'Calcite', 'Sanidine', 'Calcite', 'Calcite', 'Calcite', 'Calcite'],
        28: ['Calcite', 'Calcite', 'Calcite', 'Albite', 'Calcite', 'Calcite', 'Calcite', 'Calcite', 'Calcite', 'Calcite'],
        29: ['Calcite', 'Calcite', 'Calcite', 'Calcite', 'Calcite', 'Calcite', 'Calcite', 'Calcite', 'Calcite', 'Calcite'],
    }

    generate_validation_rocks(
        selected_minerals,
        spectra_dir=rruff_unoriented_dir,
        num_rocks=1,
        spectra_per_rock=10,
        specified_mineral_count=10,
        rruff_dir=rruff_unoriented_dir,
        output_dir="validation_rocks",
        generate_synthetic=True
    )

Loading and preprocessing spectra...

Rock 1 Composition:
--------------------------------------------------
Row	Mineral	Filename
--------------------------------------------------
0	Albite	Albite__R040068__Raman__532__0__unoriented__Raman_Data_Processed__15181.txt
1	Anorthite	Anorthite__R060193__Raman__532__0__unoriented__Raman_Data_Processed__33260.txt
2	Quartz	Quartz__R150074__Raman__780__0__unoriented__Raman_Data_Processed__38603.txt
3	Quartz	Quartz__R110108__Raman__780__0__unoriented__Raman_Data_Processed__36253.txt
4	Annite	Annite__R060211__Raman__532__0__unoriented__Raman_Data_Processed__39791.txt
5	Muscovite	Muscovite__R050198__Raman__532__0__unoriented__Raman_Data_Processed__30430.txt
6	Quartz	Quartz__X080015__Raman__532__0__unoriented__Raman_Data_Processed__31562.txt
7	Albite	Albite__R060054__Raman__780__0__unoriented__Raman_Data_Processed__29474.txt
8	Annite	Annite__R060211__Raman__532__0__unoriented__Raman_Data_Processed__39791.txt
9	Orthoclase	Orthoclase__R050367__Raman__5