In [3]:
# ---- Clean start ----
# Drop a stale `pandas` binding left over in the kernel, if there is one.
# Only NameError is expected here (the name is simply not bound); catching
# just that keeps KeyboardInterrupt and genuine errors visible.
try:
    del pandas
except NameError:
    pass

In [4]:
# ---- Imports ----
import os
import pandas as pd
import glob
from datetime import datetime

In [5]:
# ---- Set up folders ----
# input_folder: where inputs are looked up; point it at another path if needed.
# output_folder: destination for everything written by this notebook; it is
# created here when missing and left alone when it already exists.
input_folder = '.'
output_folder = './processed/'
os.makedirs(name=output_folder, exist_ok=True)

In [6]:
# ---- Log file path ----
# The log sits inside the output folder and holds one processed filename
# per line.
log_file_name = 'processing_log.txt'
log_file_path = os.path.join(output_folder, log_file_name)

In [7]:
# ---- Serial-to-Well Name Mapping ----
# Maps the serial token parsed from an input filename (the second
# '-'-separated piece) to a well name. Keys must be strings, because they are
# compared against text split out of the filename. Serials missing from this
# mapping are grouped under 'UNKNOWN_WELL' by the processing cell.
serial_to_well = {
    '841515': 'Sample Data',
    # Add more serial numbers and well names as needed
}

In [8]:
# ---- Dry run mode ----
# When True, the processing cell still reads and cleans every file in memory
# but skips all of its writes: cleaned CSVs, log entries and master sheets.
dry_run = False  # Set to True to preview files without modifying anything

In [9]:
# ---- Track processed files to avoid duplicates ----
# Start from an empty set and fill it from the log when one exists: every
# non-blank line, with surrounding whitespace removed, is one processed file.
processed_files = set()
if os.path.exists(log_file_path):
    with open(log_file_path, 'r') as log_handle:
        for raw_line in log_handle:
            entry = raw_line.strip()
            if entry:
                processed_files.add(entry)

In [11]:
# ---- Stats for dashboard ----
# Counters and the error list that the processing loop fills in and the run
# summary reports. errors collects (filename, message) pairs.
processed_count = skipped_count = 0
errors = list()

In [12]:
# ---- Per-well master dataframes cache ----
# Keyed by well name; each value is that well's combined DataFrame. Starts
# empty and is populated by the processing cell.
master_dataframes = dict()

In [None]:
# ---- Process files ----
# For every input CSV not yet listed in the log: load it, clean it, tag it with
# its source file, save a cleaned copy and queue it for its well's master sheet.
# Master sheets are then rebuilt once per well, and only after a master sheet
# is safely on disk are its files recorded as processed.

# No visible earlier cell defines `filepaths`, so a fresh kernel would stop
# here with a NameError. Discover the CSVs in `input_folder` when the name is
# missing; a list supplied by another cell is used as-is.
try:
    filepaths
except NameError:
    filepaths = sorted(glob.glob(os.path.join(input_folder, '*.csv')))

# Frames and filenames accepted during this run, grouped by well name.
# Collecting them first lets each master sheet be built with a single concat
# and keeps files that failed part-way out of the master sheets entirely.
new_frames = {}
new_filenames = {}

for filepath in filepaths:
    # Resolved before the try block so the error handler can always name the file.
    filename = os.path.basename(filepath)

    if filename in processed_files:
        print(f"⏭️ Skipping already processed file: {filename}")
        skipped_count += 1
        continue

    try:
        print(f"\n🧪 Processing: {filename}")
        df = pd.read_csv(filepath)

        # Drop empty rows and columns
        df = df.dropna(axis=0, how='all').dropna(axis=1, how='all')

        # Clean headers; copy so the column assignments below write to an
        # independent frame rather than a slice of the filtered one.
        df.columns = df.columns.str.strip()
        df = df.loc[:, ~df.columns.str.contains('^Unnamed')].copy()

        # Convert wt% to ppm for known elements
        for element in ['Ca', 'K', 'Fe']:
            if element in df.columns and df[element].max() < 1:
                df[element + '_ppm'] = df[element] * 10000

        # Replace zero in common ratio denominators
        for col in ['Al', 'Ti']:
            if col in df.columns:
                df[col] = df[col].replace(0, 0.0001)

        # Trim extreme outliers by clipping to the 1st-99th percentile range
        for col in df.select_dtypes(include=['float64', 'int64']).columns:
            q_low = df[col].quantile(0.01)
            q_hi = df[col].quantile(0.99)
            df[col] = df[col].clip(lower=q_low, upper=q_hi)

        # Extract well name from serial. The extension is removed first so a
        # two-part name such as 'run-841515.csv' yields '841515', not
        # '841515.csv'. A name without '-' raises IndexError and is reported
        # through the handler below.
        serial = os.path.splitext(filename)[0].split('-')[1]
        well_name = serial_to_well.get(serial, 'UNKNOWN_WELL')

        # Add source file label
        df['source_file'] = filename

        if not dry_run:
            # Save cleaned individual file
            output_file = os.path.join(output_folder, f'processed_{filename}')
            df.to_csv(output_file, index=False)
            print(f"✅ Saved cleaned file to: {output_file}")

        # Queue and count the file only after every step above succeeded, so a
        # file is never reported as both processed and errored.
        new_frames.setdefault(well_name, []).append(df)
        new_filenames.setdefault(well_name, []).append(filename)
        processed_count += 1

    except Exception as e:
        print(f"❌ Error processing {filename}: {e}")
        errors.append((filename, str(e)))

# ---- Save updated master files ----
for well_name, frames in new_frames.items():
    master_path = os.path.join(output_folder, f'master_{well_name}.csv')

    # Start from the cached master if this kernel already holds one, otherwise
    # from the master sheet on disk (if any), then append this run's frames.
    if well_name in master_dataframes:
        frames = [master_dataframes[well_name]] + frames
    elif os.path.exists(master_path):
        frames = [pd.read_csv(master_path)] + frames
    master_dataframes[well_name] = pd.concat(frames, ignore_index=True)

    if not dry_run:
        master_dataframes[well_name].to_csv(master_path, index=False)
        print(f"📁 Updated master sheet for: {well_name}")

        # Log the files only now that their rows are in the saved master
        # sheet; if the write above fails they stay unlogged and are retried
        # on the next run.
        with open(log_file_path, 'a') as log:
            log.writelines(name + '\n' for name in new_filenames[well_name])

        # Keep the in-memory set in step with the log so re-running this cell
        # in the same kernel does not append the same files again.
        processed_files.update(new_filenames[well_name])

In [12]:
# ---- Per-well master dataframes cache ----
# Rebinds the cache to a fresh empty dict, dropping whatever was accumulated
# under this name.
# NOTE(review): this repeats the cache-initialisation cell that precedes the
# processing cell — confirm the second reset is intentional.
master_dataframes = dict()

In [None]:
# ---- QA/QC Summary Dashboard ----
# Header, then the four run totals, then one line per recorded error.
print("\n🧾 Run Summary")
print("-" * 30)
print(
    f"Total files found: {len(filepaths)}",
    f"Files processed: {processed_count}",
    f"Files skipped: {skipped_count}",
    f"Files errored: {len(errors)}",
    sep="\n",
)
# Iterating an empty list prints nothing, so no separate emptiness check is needed.
for fname, msg in errors:
    print(f"❌ {fname}: {msg}")