In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# Load every cleaned CSV from the data directory.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the row order of the downstream feature table reproducible across runs
# and machines. Entries that are not regular ".csv" files (hidden OS files,
# sub-directories, ...) are skipped so pd.read_csv only sees parseable input.
CLEANED_DATA_DIR = "data/cleaned_data"

df_list = []        # one DataFrame per file, same order as filename_list
filename_list = []  # file names (without directory), sorted
for filename in sorted(os.listdir(CLEANED_DATA_DIR)):
    path = os.path.join(CLEANED_DATA_DIR, filename)
    if not (os.path.isfile(path) and filename.lower().endswith(".csv")):
        continue
    df = pd.read_csv(path)
    filename_list.append(filename)
    df_list.append(df)
print(filename_list)

# Extract Feature: Peaks & Troughs

In [None]:
from scipy.signal import find_peaks
import matplotlib.pyplot as plt

def plot_peaks_troughs(df, peaks=None, troughs=None):
    """Plot resistance against depth and mark peak and trough samples.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "depth" and "resistance".
    peaks : sequence of int, optional
        Positional row indices (used with ``.iloc``) drawn as red dots.
        ``None`` (the default) draws no peak markers.
    troughs : sequence of int, optional
        Positional row indices (used with ``.iloc``) drawn as blue dots.
        ``None`` (the default) draws no trough markers.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # None sentinels replace the former mutable ``[]`` defaults. ``is None`` is
    # used rather than truthiness because callers may pass NumPy index arrays,
    # whose truth value is ambiguous.
    if peaks is None:
        peaks = []
    if troughs is None:
        troughs = []

    plt.figure(figsize=(4, 3))
    # Resistance curve
    plt.plot(df["depth"], df["resistance"])
    # Peak markers (red)
    plt.plot(df["depth"].iloc[peaks], df["resistance"].iloc[peaks], 'ro', label="Peaks")
    # Trough markers (blue)
    plt.plot(df["depth"].iloc[troughs], df["resistance"].iloc[troughs], 'bo', label="Troughs")
    plt.xlabel("Depth")
    plt.ylabel("Resistance")
    plt.title("Depth vs Resistance with Peaks")
    plt.legend()
    plt.show()

def remove_peak_if_close_to_end(peaks, df):
    """Keep only the peaks located in the first 60 % of the samples.

    Parameters
    ----------
    peaks : iterable of int
        Sample indices of detected peaks.
    df : mapping with a "depth" entry
        Only ``len(df['depth'])`` is used, to derive the cutoff index.

    Returns
    -------
    list of int
        Peaks strictly below ``0.6 * len(df['depth'])``, as built-in ints,
        in their original order.
    """
    cutoff = len(df['depth']) * 0.6
    return [int(idx) for idx in peaks if idx < cutoff]

def make_peak_bool(peaks):
    """Return 1 if ``peaks`` contains at least one element, otherwise 0."""
    return 1 if len(peaks) > 0 else 0

def get_yield_force(df):
    """Locate peaks and troughs of the resistance curve.

    Peaks are detected with ``find_peaks(distance=100, width=20)`` and then
    filtered by ``remove_peak_if_close_to_end``. Troughs are the peaks of the
    negated signal, detected with ``distance=200, width=20``.

    Returns
    -------
    tuple
        ``(peaks, troughs)``: ``peaks`` is whatever
        ``remove_peak_if_close_to_end`` returns, ``troughs`` is the index
        array returned by ``find_peaks``.
    """
    resistance = df["resistance"]

    # Troughs: local maxima of the sign-flipped signal.
    trough_idx, _ = find_peaks(-resistance, distance=200, width=20)

    # Peaks. ``distance`` is the minimum spacing, in samples, between
    # neighbouring peaks. Settings tried previously:
    #   find_peaks(resistance, height=0, distance=10, width=5)
    #   find_peaks(resistance, distance=len(resistance) * 0.5, width=30)
    peak_idx, _ = find_peaks(resistance, distance=100, width=20)
    peak_idx = remove_peak_if_close_to_end(peak_idx, df)

    # For a visual check: plot_peaks_troughs(df, peak_idx, trough_idx)
    return peak_idx, trough_idx


# Choose Features

In [None]:

def extract_simple_features(df):
    """Summarise one depth/resistance curve as a single-row feature table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "depth" and "resistance".

    Returns
    -------
    pandas.DataFrame
        One row with the columns, in this order:
        ``overall_slope`` (max resistance divided by max depth),
        ``overall_depth`` (max depth) and
        ``overall_resistance`` (max resistance).
    """
    res = df["resistance"]
    dep = df["depth"]
    max_res = res.max()
    max_dep = dep.max()
    # The former ``find_peaks(res, distance=30)`` call was dropped: its result
    # was only consumed by features that are currently disabled (res_mean,
    # peaks_value, num_peaks, heterogeneity), so it was wasted work per file.
    # Re-introduce it together with those features if they are re-enabled.
    return pd.DataFrame([{
        "overall_slope": max_res / max_dep,
        "overall_depth": max_dep,
        "overall_resistance": max_res,
    }])

# Collect one single-row feature frame per input file. Once concatenated the
# result has shape (n, m): n = number of dataframes, m = number of extracted
# features.
representation_list = []
for i, df in enumerate(df_list):
    print(filename_list[i])  # show which file is being processed
    representation_list.append(extract_simple_features(df))
print(f"example: {representation_list[0]}")


# Correlation Matrix

In [None]:
# Stack the per-file one-row frames into a single table with a fresh 0..n-1
# index (row-wise concatenation is pd.concat's default), then print the
# pairwise correlation between the feature columns.
representation_df = pd.concat(representation_list, ignore_index=True)
corr_matrix = representation_df.corr()
print(corr_matrix)

# Visualize Extracted Feature Histograms

In [None]:
# representation_df  = pd.concat(representation_list, axis=0, ignore_index=True)
# # print(representation_df)
def plot_feature_dist(representation_df):
    """Show one density-normalised, 60-bin histogram per column.

    Parameters
    ----------
    representation_df : pandas.DataFrame
        Feature table; every column gets its own 4x3 figure.
    """
    for feature_name in representation_df.columns:
        values = representation_df[feature_name]
        plt.figure(figsize=(4, 3))
        plt.hist(values, bins=60, density=True)
        plt.title(f"Extracted Feature {feature_name} Global Histogram")
        plt.show()


# Distributions of the features before scaling.
plot_feature_dist(representation_df)

# Scale Features

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np

def transform_features(df):
    """Log-transform selected feature columns, then standardise every column.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table. The columns "overall_slope", "peaks" and "peaks_value"
        are transformed only if present; all columns must be numeric because
        every column is passed to ``StandardScaler``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns; the input is not modified.
    """
    df = df.copy()  # work on a copy so the caller's frame stays untouched

    # Natural log (the code never applied a cube root) of the slope.
    if "overall_slope" in df.columns:
        df["overall_slope"] = np.log(df["overall_slope"])
    # +1 offset keeps a value of 0 finite (log(1) == 0).
    if "peaks" in df.columns:
        df["peaks"] = np.log(df["peaks"] + 1)
    # Same +1 offset as "peaks": a plain log would map a peaks_value of 0 to
    # -inf, and StandardScaler refuses infinite input.
    if "peaks_value" in df.columns:
        df["peaks_value"] = np.log(df["peaks_value"] + 1)
    # Transforms tried earlier and currently disabled:
    # if "yield_force" in df.columns: df['yield_force'] = df["yield_force"]
    # if "skew" in df.columns: df["skew"] = df["skew"] ** 1.1
    # if "kurtosis" in df.columns: df["kurtosis"] = np.sqrt(df["kurtosis"])

    # Standardise: the scaler is fitted on, and applied to, this same table.
    scaler = StandardScaler()
    df[df.columns] = scaler.fit_transform(df[df.columns])
    return df

# Transform and standardise the feature table, preview its first five rows,
# then draw the per-feature histograms again on the scaled values.
scaled_representations = transform_features(representation_df)
print(scaled_representations.head(5))
plot_feature_dist(scaled_representations)


# Save Representation

In [None]:
# Persist the scaled feature table; index=False leaves the row index out of the file.
scaled_representations.to_csv("data/features.csv", index=False)