# New dataset with one hour shift and the new 32 features

In [1]:
import numpy as np
import pandas as pd
from scipy import stats, fft

def load_shifted_data(file_path):
    """Load shifted dataset and return a DataFrame."""
    try:
        df = pd.read_csv(file_path)
        print(f"Loaded data from {file_path}, shape: {df.shape}")
        return df
    except FileNotFoundError:
        print(f"Error: File {file_path} not found.")
        return None
    except Exception as e:
        print(f"Error loading {file_path}: {str(e)}")
        return None

def calculate_all_features(activity_windows):
    """Calculate 32 features for each activity window."""
    feature_names = [
        'Minimum', 'Maximum', 'Mean', 'RMS', 'STD',
        'MeanSTD6h', 'STDMean6h', 'STDSD', 'RMSSD', 'Mode',
        'Q10', 'Q90', 'Q25', 'Q50', 'Q75',
        'Skewness', 'Kurtosis',
        *[f'Autocorr{i}' for i in range(1, 12)],
        *[f'h{i}' for i in range(1, 5)]
    ]
    features = []

    for window in activity_windows:
        try:
            window = np.array(window, dtype=float)
            if len(window) == 0 or np.all(np.isnan(window)):
                print(f"Warning: Empty or invalid window, returning NaN features.")
                features.append({f: np.nan for f in feature_names})
                continue

            feature_dict = {}
            # Time-Domain Features
            feature_dict['Minimum'] = np.min(window)
            feature_dict['Maximum'] = np.max(window)
            feature_dict['Mean'] = np.mean(window)
            feature_dict['RMS'] = np.sqrt(np.mean(np.square(window)))
            feature_dict['STD'] = np.std(window, ddof=1) if len(window) > 1 else np.nan

            # 6-hour windows
            six_h_windows = [window[i*6:(i+1)*6] for i in range(4) if len(window[i*6:(i+1)*6]) > 0]
            stds_6h = [np.std(w, ddof=1) if len(w) > 1 else np.nan for w in six_h_windows]
            means_6h = [np.mean(w) if len(w) > 0 else np.nan for w in six_h_windows]
            feature_dict['MeanSTD6h'] = np.nanmean(stds_6h) if stds_6h else np.nan
            feature_dict['STDMean6h'] = np.std(means_6h, ddof=1) if len(means_6h) > 1 else np.nan

            # Successive differences
            diffs = np.diff(window)
            feature_dict['STDSD'] = np.std(diffs, ddof=1) if len(diffs) > 1 else np.nan
            feature_dict['RMSSD'] = np.sqrt(np.mean(np.square(diffs))) if len(diffs) > 0 else np.nan

            # Distribution features
            feature_dict['Mode'] = stats.mode(window, keepdims=True)[0][0] if len(window) > 0 else np.nan
            feature_dict['Q10'] = np.percentile(window, 10) if len(window) > 0 else np.nan
            feature_dict['Q90'] = np.percentile(window, 90) if len(window) > 0 else np.nan
            feature_dict['Q25'] = np.percentile(window, 25) if len(window) > 0 else np.nan
            feature_dict['Q50'] = np.percentile(window, 50) if len(window) > 0 else np.nan
            feature_dict['Q75'] = np.percentile(window, 75) if len(window) > 0 else np.nan
            feature_dict['Skewness'] = stats.skew(window) if len(window) > 2 else np.nan
            feature_dict['Kurtosis'] = stats.kurtosis(window) if len(window) > 3 else np.nan

            # Autocorrelation (lags 1 to 11)
            for lag in range(1, 12):
                if len(window) > lag and not np.all(np.isnan(window[:-lag])) and not np.all(np.isnan(window[lag:])):
                    corr = np.corrcoef(window[:-lag], window[lag:])[0, 1]
                else:
                    corr = np.nan
                feature_dict[f'Autocorr{lag}'] = corr

            # Frequency-Domain Features
            fft_result = np.abs(fft.fft(window)) if len(window) > 0 else np.array([])
            harmonics = fft_result[1:5] if len(fft_result) > 4 else [np.nan] * 4
            for i, h in enumerate(harmonics, 1):
                feature_dict[f'h{i}'] = h

            features.append(feature_dict)

        except Exception as e:
            print(f"Error processing window: {e}")
            features.append({f: np.nan for f in feature_names})

    return pd.DataFrame(features)

def combine_features(shifted_df, features_df):
    """Combine features with original metadata and labels."""
    shifted_df_cleaned = shifted_df.drop(columns=['activity_window'])
    features_columns = ['cow', 'start_time', 'end_time', 'duration_hours']
    label_columns = [col for col in shifted_df_cleaned.columns if col not in features_columns]

    final_df = pd.concat([
        shifted_df_cleaned[features_columns].reset_index(drop=True),
        features_df.reset_index(drop=True),
        shifted_df_cleaned[label_columns].reset_index(drop=True)
    ], axis=1)
    return final_df

def save_output(df, output_path):
    """Save DataFrame to CSV."""
    try:
        df.to_csv(output_path, index=False)
        print(f"CSV file saved to {output_path}")
    except Exception as e:
        print(f"Error saving to {output_path}: {str(e)}")

# List of input files (output from Transformation.ipynb)
input_files = [
    r"C:/Users/lamia/Desktop/datasets/dataset1_knn.csv",
  
]

# Process each input file
for file in input_files:
    print(f"\n📊 Processing file: {file}")
    shifted_df = load_shifted_data(file)
    if shifted_df is not None:
        # Ensure activity_window is a list of lists
        shifted_df['activity_window'] = shifted_df['activity_window'].apply(eval) if shifted_df['activity_window'].dtype == 'object' else shifted_df['activity_window']
        features_df = calculate_all_features(shifted_df['activity_window'])
        final_df = combine_features(shifted_df, features_df)
        # output_path = f"32features_1hour_shift_{file.split('_')[-1]}"
        output_path = r"C:/Users/lamia/Desktop/datasets/32features_dataset1.csv"
        save_output(final_df, output_path)
        print(f"📊 Final dataset shape: {final_df.shape}")
        print("\n🔍 First 3 rows of the final dataset:")
        display(final_df.head(3))


  from pandas.core import (



📊 Processing file: C:/Users/lamia/Desktop/datasets/dataset1_knn.csv
Loaded data from C:/Users/lamia/Desktop/datasets/dataset1_knn.csv, shape: (105813, 15)


KeyError: 'activity_window'

In [None]:
from scipy import stats, fft

# 1. Define the feature calculation function (unchanged)
def calculate_features(window):
    """Calculate all 32 features for a 24-hour window"""
    features = {}
    window = np.array(window)

    # Time-Domain Features (1-17)
    features['Minimum'] = np.min(window)
    features['Maximum'] = np.max(window)
    features['Mean'] = np.mean(window)
    features['RMS'] = np.sqrt(np.mean(np.square(window)))
    features['STD'] = np.std(window)

    # 6-hour window statistics
    six_h_windows = [window[i*6:(i+1)*6] for i in range(4)]
    stds_6h = [np.std(w) for w in six_h_windows]
    means_6h = [np.mean(w) for w in six_h_windows]
    features['MeanSTD6h'] = np.mean(stds_6h)
    features['STDMean6h'] = np.std(means_6h)

    # Successive differences
    diffs = np.diff(window)
    features['RMSSD'] = np.sqrt(np.mean(np.square(diffs)))

    # Quantiles and distribution shape
    try:
        features['Mode'] = stats.mode(window, keepdims=True)[0][0]
    except:
        features['Mode'] = window[0]

    for p in [10, 25, 50, 75, 90]:
        features[f'Q{p}'] = np.percentile(window, p)
    features['Skewness'] = stats.skew(window)
    features['Kurtosis'] = stats.kurtosis(window)

    # Autocorrelations (18-28)
    for lag in range(1, 12):
        if len(window) > lag:
            corr = np.corrcoef(window[:-lag], window[lag:])[0,1]
        else:
            corr = np.nan
        features[f'Autocorr{lag}'] = corr

    # Frequency-Domain Features (29-32)
    fft_values = np.abs(fft.fft(window))[1:5]  # Harmonics 1-4
    for i, h in enumerate(fft_values, 1):
        features[f'h{i}'] = h

    return features

# 2. Create 1-hour shifted windows (updated to exclude unwanted columns)
def create_shifted_windows(df, window_size=24, shift=1):
    """Generate overlapping 24-hour windows with 1-hour shift"""
    shifted_data = []

    # Define condition columns (excluding management_changes and OK)
    condition_columns = ['oestrus', 'calving', 'lameness', 'mastitis', 'LPS',
                        'acidosis', 'other_disease', 'accidents', 'disturbance',
                        'mixing']

    # Group by cow only (not date) for continuous windows
    for cow_id, group in df.sort_values(['cow', 'date', 'hour']).groupby('cow'):
        activity = group['ACTIVITY_LEVEL'].values
        dates = group['date'].values
        hours = group['hour'].values

        # Generate all possible windows
        for i in range(0, len(activity) - window_size + 1, shift):
            window = activity[i:i + window_size]
            conditions = group.iloc[i + window_size - 1][condition_columns].to_dict()

            shifted_data.append({
                'cow': cow_id,
                'start_date': dates[i],
                'start_hour': hours[i],
                'activity_window': window.tolist(),
                **conditions
            })

    return pd.DataFrame(shifted_data)

# 3. Process your data
print("Creating 1-hour shifted windows (excluding management_changes and OK)...")
shifted_df = create_shifted_windows(df)

print("Calculating features for each window...")
feature_data = [calculate_features(w) for w in shifted_df['activity_window']]
features_df = pd.DataFrame(feature_data)

# 4. Combine into final dataset
final_df = pd.concat([
    shifted_df.drop(columns=['activity_window']),  # Keep only metadata and conditions
    features_df
], axis=1)

# 5. Save the results
output_path = "cow_activity_features_1hour_shift_clean.csv"
final_df.to_csv(output_path, index=False)

print(f"\nProcessing complete! Results saved to {output_path}")
print(f"Final dataset shape: {final_df.shape}")
print("\nFirst 3 rows of the final dataset:")
display(final_df.head(3))