In [1]:
from tensorflow import keras
from tensorflow.keras import la

ImportError: cannot import name 'la' from 'tensorflow.keras' (/opt/anaconda3/envs/comp_9814/lib/python3.10/site-packages/keras/_tf_keras/keras/__init__.py)

读compoundA.txt(神经模型中意思是First input)，substrate.txt(神经模型中意思是Second input)，biomass.txt（神经模型中意思是Output），数据都是一列，2000行，分析提供的数据并执行任何必要的预处理。
预处理期间的一些任务可能包括下面所示的任务(a) 确定输入和输出变量的变化范围。(b) 绘制每个变量以观察生物过程的整体行为。(c) 如果检测到异常值，相应地修正数据。例如，因为我们处理的是以克为单位的变量，所以任何值都不应小于零。一个简单的修正方法是用零值替换这些值。(d) 将数据分成两个子集：训练集和验证集。

In [1]:
# preprocessing
"""
Preprocessing script for three 1-column text files:
 - compoundA.txt (First input)
 - substrate.txt (Second input)
 - biomass.txt (Output)

Assumptions:
 - Files are plain text, one numeric value per line (2000 rows expected).
 - Units are grams, so negative values are invalid and will be replaced with 0.

What this script does:
 1. Reads the three files into a pandas DataFrame.
 2. Checks lengths and reports missing/extra rows.
 3. Computes and prints summary statistics (min, max, mean, median, std).
 4. Plots time-series, histograms, and boxplots for each variable and saves PNGs.
 5. Detects negative values and replaces them with zero.
 6. Detects outliers using the IQR method and (optionally) caps them or replaces with median.
 7. Splits data into train/validation sets (default 80/20) with random_state for reproducibility.
 8. Saves processed DataFrames and split files to disk.

Usage:
 1) Put this script in the same folder as compoundA.txt, substrate.txt, biomass.txt
 2) `pip install pandas matplotlib scikit-learn` if you don't have them
 3) Run: `python preprocess_neuro_data.py`

Outputs saved to ./preprocessing_out/:
 - summary.txt            : text summary of statistics and steps
 - plots/*.png            : timeseries, histogram, boxplot for each variable
 - processed_full.csv     : combined dataframe after cleaning
 - train.csv / val.csv    : the train/validation split

You can change options in the SETTINGS section below.
"""

import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# ---------------- SETTINGS ----------------
FILE_COMPOUND = "compoundARaw.txt"
FILE_SUBSTRATE = "substrateRaw.txt"
FILE_BIOMASS = "biomassRaw.txt"
OUT_DIR = "preprocessing_out"
PLOTS_DIR = os.path.join(OUT_DIR, "plots")
TRAIN_SIZE = 0.8
RANDOM_STATE = 42
APPLY_OUTLIER_CAP = True  # If True, cap outliers using IQR method; otherwise only report
OUTLIER_CAP_METHOD = "median"  # "median" or "clip"
# ------------------------------------------

os.makedirs(PLOTS_DIR, exist_ok=True)

log_lines = []

def log(s):
    print(s)
    log_lines.append(s)


def read_1col_file(path):
    """Read a single-column numeric file into a pandas Series."""
    try:
        s = pd.read_csv(path, header=None, squeeze=True)
    except Exception as e:
        raise RuntimeError(f"Failed to read {path}: {e}")
    # Ensure numeric
    s = pd.to_numeric(s.iloc[:,0] if s.ndim>1 else s, errors='coerce')
    return s

# 1. Load
log("== Loading files ==")
for f in [FILE_COMPOUND, FILE_SUBSTRATE, FILE_BIOMASS]:
    if not os.path.exists(f):
        log(f"ERROR: required file '{f}' not found in current directory ({os.getcwd()}).")
        log("Place the files in the working directory and re-run this script.")
        with open(os.path.join(OUT_DIR, 'summary.txt'), 'w') as fh:
            fh.write('\n'.join(log_lines))
        sys.exit(1)

s_compound = read_1col_file(FILE_COMPOUND)
s_substrate = read_1col_file(FILE_SUBSTRATE)
s_biomass = read_1col_file(FILE_BIOMASS)

log(f"Loaded: {FILE_COMPOUND} ({len(s_compound)} rows), {FILE_SUBSTRATE} ({len(s_substrate)} rows), {FILE_BIOMASS} ({len(s_biomass)} rows)")

# 2. Align lengths
N = max(len(s_compound), len(s_substrate), len(s_biomass))
log(f"Maximum row count across files = {N}")

# If sizes differ, we'll align by index: shorter series will get NaNs appended
s_compound = s_compound.reset_index(drop=True).reindex(range(N))
s_substrate = s_substrate.reset_index(drop=True).reindex(range(N))
s_biomass = s_biomass.reset_index(drop=True).reindex(range(N))

# Combine into DataFrame
df = pd.DataFrame({
    'compoundA': s_compound,
    'substrate': s_substrate,
    'biomass': s_biomass
})

log("\n== Initial summary (including NaNs) ==")
log(df.describe(include='all').to_string())

# 3. Determine ranges and missing values
ranges = {}
for col in df.columns:
    col_nonnull = df[col].dropna()
    ranges[col] = {
        'count': int(col_nonnull.count()),
        'min': float(col_nonnull.min()) if not col_nonnull.empty else np.nan,
        'max': float(col_nonnull.max()) if not col_nonnull.empty else np.nan,
        'mean': float(col_nonnull.mean()) if not col_nonnull.empty else np.nan,
        'median': float(col_nonnull.median()) if not col_nonnull.empty else np.nan,
        'std': float(col_nonnull.std()) if not col_nonnull.empty else np.nan,
        'n_nan': int(df[col].isna().sum())
    }

log('\nRanges and counts:')
for k,v in ranges.items():
    log(f"{k}: count={v['count']}, n_nan={v['n_nan']}, min={v['min']}, max={v['max']}, mean={v['mean']}, median={v['median']}, std={v['std']}")

# 4. Plot each variable: time-series, histogram, boxplot
log('\n== Generating plots ==')
for col in df.columns:
    col_data = df[col]
    idx = np.arange(len(col_data))

    # Timeseries (line)
    plt.figure(figsize=(10,3))
    plt.plot(idx, col_data.values)
    plt.title(f"{col} - timeseries")
    plt.xlabel('index')
    plt.ylabel(col)
    plt.tight_layout()
    out = os.path.join(PLOTS_DIR, f"{col}_timeseries.png")
    plt.savefig(out)
    plt.close()
    log(f"Saved {out}")

    # Histogram
    plt.figure(figsize=(6,4))
    plt.hist(col_data.dropna().values, bins=50)
    plt.title(f"{col} - histogram")
    plt.xlabel(col)
    plt.ylabel('count')
    plt.tight_layout()
    out = os.path.join(PLOTS_DIR, f"{col}_hist.png")
    plt.savefig(out)
    plt.close()

    # Boxplot
    plt.figure(figsize=(4,3))
    plt.boxplot(col_data.dropna().values, vert=False)
    plt.title(f"{col} - boxplot")
    plt.tight_layout()
    out = os.path.join(PLOTS_DIR, f"{col}_boxplot.png")
    plt.savefig(out)
    plt.close()

log('Plots generated in ' + PLOTS_DIR)

# 5. Fix physically impossible negatives (replace with 0)
log('\n== Fixing negative values (replace with 0) ==')
neg_counts = {}
for col in df.columns:
    neg = (df[col] < 0).sum()
    neg_counts[col] = int(neg)
    if neg>0:
        log(f"{col}: found {neg} negative values -> replacing with 0")
        df.loc[df[col] < 0, col] = 0

if all(v==0 for v in neg_counts.values()):
    log('No negative values found.')

# 6. Outlier detection with IQR
log('\n== Outlier detection (IQR method) ==')
outlier_info = {}
for col in df.columns:
    s = df[col].dropna()
    Q1 = s.quantile(0.25)
    Q3 = s.quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    mask_out = (s < lower) | (s > upper)
    n_out = int(mask_out.sum())
    outlier_info[col] = dict(Q1=float(Q1), Q3=float(Q3), IQR=float(IQR), lower=float(lower), upper=float(upper), n_out=n_out)
    log(f"{col}: Q1={Q1}, Q3={Q3}, IQR={IQR}, lower={lower}, upper={upper}, n_out={n_out}")

# Optionally cap or replace outliers
if APPLY_OUTLIER_CAP:
    log('\nApplying outlier correction...')
    for col in df.columns:
        info = outlier_info[col]
        lower = info['lower']
        upper = info['upper']
        if OUTLIER_CAP_METHOD == 'median':
            median = df[col].median()
            n_replaced = ((df[col] < lower) | (df[col] > upper)).sum()
            df.loc[(df[col] < lower) | (df[col] > upper), col] = median
            log(f"{col}: replaced {int(n_replaced)} outliers with median={median}")
        elif OUTLIER_CAP_METHOD == 'clip':
            before = df[col].copy()
            df[col] = df[col].clip(lower=lower, upper=upper)
            n_changed = (before != df[col]).sum()
            log(f"{col}: clipped {int(n_changed)} values to [{lower}, {upper}]")
        else:
            log(f"Unknown OUTLIER_CAP_METHOD='{OUTLIER_CAP_METHOD}' - skipping outlier correction")

# 7. Final summary after cleaning
log('\n== Final summary after cleaning ==')
for col in df.columns:
    col_nonnull = df[col].dropna()
    log(f"{col}: count={int(col_nonnull.count())}, min={float(col_nonnull.min())}, max={float(col_nonnull.max())}, mean={float(col_nonnull.mean())}, median={float(col_nonnull.median())}, std={float(col_nonnull.std())}")

# 8. Split into train/validation
log('\n== Splitting into train and validation sets ==')
# We'll split rows — ensure we drop rows with any NaNs first or optionally impute
n_rows_before = len(df)
rows_with_nan = df.isna().any(axis=1).sum()
log(f"Rows total = {n_rows_before}, rows with any NaN = {rows_with_nan}")

# Drop rows with NaNs for simplicity (report to user)
df_clean = df.dropna().reset_index(drop=True)
log(f"After dropping NaN rows: {len(df_clean)} rows remain")

X = df_clean[['compoundA', 'substrate']]
y = df_clean['biomass']

X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=TRAIN_SIZE, random_state=RANDOM_STATE)
train = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
val = pd.concat([X_val, y_val], axis=1).reset_index(drop=True)
log(f"Train rows: {len(train)}, Validation rows: {len(val)}")

# 9. Save outputs
os.makedirs(OUT_DIR, exist_ok=True)
processed_csv = os.path.join(OUT_DIR, 'processed_full.csv')
train_csv = os.path.join(OUT_DIR, 'train.csv')
val_csv = os.path.join(OUT_DIR, 'val.csv')

df.to_csv(processed_csv, index=False)
train.to_csv(train_csv, index=False)
val.to_csv(val_csv, index=False)

log(f"Saved processed data to {processed_csv}, {train_csv}, {val_csv}")

# Save textual summary
with open(os.path.join(OUT_DIR, 'summary.txt'), 'w') as fh:
    fh.write('\n'.join(log_lines))

log('\nPreprocessing complete. Review the plots and summary.txt in the preprocessing_out folder.')

# Quick tips for the user on next steps
log('\nNext steps suggestions:')
log('- Inspect the timeseries plots to understand temporal patterns or sensor drift.')
log('- If you believe NaNs represent sensor dropout rather than missing rows, consider interpolation instead of dropping.')
log('- For modeling, consider scaling features (StandardScaler or MinMaxScaler) depending on the chosen algorithm.')
log('- If you want different outlier handling (e.g., winsorize, robust scaling), change APPLY_OUTLIER_CAP/OUTLIER_CAP_METHOD at the top and re-run.')


== Loading files ==


RuntimeError: Failed to read compoundARaw.txt: read_csv() got an unexpected keyword argument 'squeeze'