Column cleaning before processing

In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta
from scipy import stats, fft

# 1. Cleaning Function
def clean_data(df, columns_to_drop):
    """
    Returns a new DataFrame without the requested columns.

    Every requested name is lower-cased before it is matched against the
    frame's columns; names that do not exist in the frame are skipped
    without raising. The requested names are printed exactly as passed in.
    """
    print(f"Dropping columns: {columns_to_drop}")
    targets = [name.lower() for name in columns_to_drop]
    trimmed = df.drop(columns=targets, errors='ignore')
    return trimmed

# 2. Feature Calculation Function
def calculate_features(window):
    """
    Calculates 31 summary features for one window of activity data.

    The window is expected to hold 24 values: the 6-hour statistics slice it
    at the fixed offsets 0-6, 6-12, 12-18 and 18-24, and four FFT magnitudes
    are only produced when at least five samples are available.

    Args:
        window: Sequence (list or array) of numeric activity values.

    Returns:
        dict mapping feature name to value. For a 24-sample window the keys
        are Minimum, Maximum, Mean, RMS, STD, MeanSTD6h, STDMean6h, RMSSD,
        Mode, Q10, Q25, Q50, Q75, Q90, Skewness, Kurtosis,
        Autocorr1..Autocorr11 and h1..h4. Autocorrelations are NaN when a
        segment is constant (zero variance) or the window is too short.
    """
    features = {}
    window = np.array(window)

    # Basic level and spread statistics.
    features['Minimum'] = np.min(window)
    features['Maximum'] = np.max(window)
    features['Mean'] = np.mean(window)
    features['RMS'] = np.sqrt(np.mean(np.square(window)))
    features['STD'] = np.std(window)

    # Statistics over four consecutive blocks of six samples.
    six_h_windows = [window[i*6:(i+1)*6] for i in range(4)]
    stds_6h = [np.std(w) for w in six_h_windows]
    means_6h = [np.mean(w) for w in six_h_windows]
    features['MeanSTD6h'] = np.mean(stds_6h)
    features['STDMean6h'] = np.std(means_6h)

    # Root mean square of successive differences.
    diffs = np.diff(window)
    features['RMSSD'] = np.sqrt(np.mean(np.square(diffs)))

    try:
        features['Mode'] = stats.mode(window, keepdims=True)[0][0]
    except TypeError:
        # SciPy releases that predate the `keepdims` argument reject the call
        # above. Compute the mode directly instead of substituting an
        # arbitrary sample: np.unique returns sorted values, so argmax picks
        # the smallest of the most frequent values.
        values, counts = np.unique(window, return_counts=True)
        features['Mode'] = values[np.argmax(counts)]

    for p in [10, 25, 50, 75, 90]:
        features[f'Q{p}'] = np.percentile(window, p)
    features['Skewness'] = stats.skew(window)
    features['Kurtosis'] = stats.kurtosis(window)

    # Lagged autocorrelation. A constant segment has zero variance, so its
    # correlation is undefined and np.corrcoef yields NaN; only the
    # accompanying divide/invalid RuntimeWarning is silenced here, the NaN
    # value itself is kept.
    with np.errstate(divide='ignore', invalid='ignore'):
        for lag in range(1, 12):
            if len(window) > lag:
                corr = np.corrcoef(window[:-lag], window[lag:])[0, 1]
            else:
                corr = np.nan
            features[f'Autocorr{lag}'] = corr

    # Magnitudes of the first four non-DC FFT components.
    fft_values = np.abs(fft.fft(window))[1:5]
    for i, h in enumerate(fft_values, 1):
        features[f'h{i}'] = h
    return features

# 3. Window Creation Function
def create_shifted_windows(df, window_size=24, shift=1, label_columns=None):
    """
    Generates overlapping fixed-length activity windows for every cow.

    Rows are ordered by cow, date and hour. Within each cow, a window of
    `window_size` consecutive rows is taken every `shift` rows. Rows are
    taken as they come: gaps in the hourly sequence are not detected, so
    consecutive rows are assumed to be consecutive hours.

    Args:
        df: DataFrame with 'cow', 'date', 'hour' and 'activity_level' columns.
        window_size: Number of rows per window (default 24).
        shift: Step in rows between the starts of successive windows.
        label_columns: Names of columns to carry over; names missing from
            `df` are ignored. None (the default) carries over nothing.

    Returns:
        DataFrame with one row per window: 'cow', 'start_date' and
        'start_hour' (taken from the window's first row), 'activity_window'
        (list of activity values) and each valid label column, whose value
        is taken from the window's LAST row. An empty DataFrame is returned
        when no window could be built.
    """
    # A None default replaces the former shared mutable `[]` default.
    requested_labels = label_columns if label_columns is not None else []
    valid_labels = [col for col in requested_labels if col in df.columns]
    shifted_data = []

    for cow_id, group in df.sort_values(['cow', 'date', 'hour']).groupby('cow'):
        if 'activity_level' not in group.columns:
            print(f"Skipping cow {cow_id}: 'activity_level' column not found.")
            continue
        activity = group['activity_level'].values
        dates = group['date'].values
        hours = group['hour'].values
        # Extract each label column once per cow so the inner loop indexes
        # plain arrays instead of materialising a full row per window.
        label_values = {col: group[col].values for col in valid_labels}

        for start in range(0, len(activity) - window_size + 1, shift):
            end = start + window_size - 1
            record = {
                'cow': cow_id,
                'start_date': dates[start],
                'start_hour': hours[start],
                'activity_window': activity[start:end + 1].tolist(),
            }
            # Labels describe the final row of the window.
            for col, values in label_values.items():
                record[col] = values[end]
            shifted_data.append(record)

    if not shifted_data:
        print("No windows created. Please check your data and window size.")
        return pd.DataFrame()
    return pd.DataFrame(shifted_data)

# 4. Main Processing Pipeline
def process_single_dataset(input_path, output_path, label_columns, columns_to_drop):
    """
    Loads, cleans, processes, and saves a single dataset.

    Args:
        input_path: Path of the CSV file to read. It must contain a 'date'
            column (column names are stripped and lower-cased after loading).
        output_path: Path of the CSV file to write.
        label_columns: Label column names to carry into the windowed output.
        columns_to_drop: Column names removed before windowing.

    Returns:
        The final DataFrame (window context columns followed by the computed
        features). An empty DataFrame is returned when the input file is
        missing or no window could be created. The frame is still returned
        when writing the CSV fails.
    """
    print(f"\nProcessing file: {input_path}")
    try:
        input_df = pd.read_csv(input_path)
    except FileNotFoundError:
        print(f"Error: The file at {input_path} was not found. Skipping.")
        return pd.DataFrame()

    input_df.columns = input_df.columns.str.strip().str.lower()
    input_df['date'] = pd.to_datetime(input_df['date'])

    # Step 1: Clean the data by dropping columns
    cleaned_df = clean_data(input_df.copy(), columns_to_drop)

    # Step 2: Create shifted windows and extract features
    shifted_df = create_shifted_windows(cleaned_df, label_columns=label_columns)
    if shifted_df.empty:
        return pd.DataFrame()
    feature_data = [calculate_features(w) for w in shifted_df['activity_window']]
    features_df = pd.DataFrame(feature_data)
    final_df = pd.concat([
        shifted_df.drop(columns=['activity_window']),
        features_df
    ], axis=1)

    # Step 3: Save. Only the write itself is guarded, so a problem while
    # previewing the result can no longer be reported as a save failure.
    # The handler stays broad on purpose: one failed write must not abort a
    # batch run over several datasets.
    try:
        final_df.to_csv(output_path, index=False)
    except Exception as e:
        print(f"Error saving file {output_path}: {e}")
        return final_df

    print(f"\nProcessing complete! Results saved to {output_path}")
    print(f"Final dataset shape: {final_df.shape}")
    # Display the first few rows of the final DataFrame
    print("\nFirst 3 rows of the final dataset:")
    try:
        # `display` is only injected by IPython/Jupyter.
        display(final_df.head(3))  # noqa: F821
    except NameError:
        # Plain Python interpreter: fall back to a text preview.
        print(final_df.head(3))
    return final_df

# 5. Main Script to run the pipeline on multiple files
def process_multiple_datasets(dataset_info):
    """
    Runs the single-dataset pipeline once for each entry of `dataset_info`.

    Every entry must provide the keys 'input_path', 'output_path', 'labels'
    and 'columns_to_drop'. Whatever the pipeline returns is discarded.
    """
    for spec in dataset_info:
        source = spec['input_path']
        target = spec['output_path']
        labels = spec['labels']
        drops = spec['columns_to_drop']
        process_single_dataset(source, target, labels, drops)

if __name__ == '__main__':
    # One entry per dataset: the CSV to read, the CSV to write, the label
    # columns to carry over and the columns to drop before windowing.
    datasets_to_process = [
        dict(
            input_path=r'C:\Users\lamia\Desktop\datasets\Dataset1_knn.csv',
            output_path=r'C:\Users\lamia\Desktop\datasets\Dataset1_shifted.csv',
            labels=['oestrus', 'calving', 'mastitis', 'lameness', 'other_disease', 'ok'],
            columns_to_drop=['label', 'physio_label'],
        ),
        dict(
            input_path=r'C:\Users\lamia\Desktop\datasets\Dataset2_knn.csv',
            output_path=r'C:\Users\lamia\Desktop\datasets\Dataset2_shifted.csv',
            labels=['oestrus', 'mastitis', 'lameness', 'other_disease', 'ok'],
            columns_to_drop=['label', 'physio_label', 'calving'],
        ),
        dict(
            input_path=r'C:\Users\lamia\Desktop\datasets\Dataset3_knn.csv',
            output_path=r'C:\Users\lamia\Desktop\datasets\Dataset3_shifted.csv',
            labels=['oestrus', 'ok'],
            columns_to_drop=['calving', 'mastitis', 'lameness', 'other_disease', 'label', 'physio_label'],
        ),
        dict(
            input_path=r'C:\Users\lamia\Desktop\datasets\Dataset4_knn.csv',
            output_path=r'C:\Users\lamia\Desktop\datasets\Dataset4_shifted.csv',
            labels=['oestrus', 'calving', 'mastitis', 'lameness', 'other_disease', 'ok'],
            columns_to_drop=['label', 'physio_label'],
        ),
    ]

    # Process every configured dataset in order.
    process_multiple_datasets(datasets_to_process)

  from pandas.core import (



Processing file: C:\Users\lamia\Desktop\datasets\Dataset1_knn.csv
Dropping columns: ['label', 'physio_label']

Processing complete! Results saved to C:\Users\lamia\Desktop\datasets\Dataset1_shifted.csv
Final dataset shape: (105169, 40)

First 3 rows of the final dataset:


Unnamed: 0,cow,start_date,start_hour,oestrus,calving,mastitis,lameness,other_disease,ok,Minimum,...,Autocorr6,Autocorr7,Autocorr8,Autocorr9,Autocorr10,Autocorr11,h1,h2,h3,h4
0,6601,2018-10-26,0,0.0,0.0,0.0,0.0,0.0,1,-823.60059,...,-0.456088,-0.286661,-0.234662,-0.290353,-0.041981,-0.105106,6286.812878,4934.141774,1644.078862,2406.338222
1,6601,2018-10-26,1,0.0,0.0,0.0,0.0,0.0,1,-823.60059,...,-0.483625,-0.324913,-0.261958,-0.297718,-0.125599,-0.223447,6812.403881,4948.146531,1363.256216,1880.158503
2,6601,2018-10-26,2,0.0,0.0,0.0,0.0,0.0,1,-818.0154,...,-0.396365,-0.326404,-0.24885,-0.164231,-0.0513,-0.08488,5526.173671,4322.02379,2680.149744,2822.645361



Processing file: C:\Users\lamia\Desktop\datasets\Dataset2_knn.csv
Dropping columns: ['label', 'physio_label', 'calving']

Processing complete! Results saved to C:\Users\lamia\Desktop\datasets\Dataset2_shifted.csv
Final dataset shape: (37347, 39)

First 3 rows of the final dataset:


Unnamed: 0,cow,start_date,start_hour,oestrus,mastitis,lameness,other_disease,ok,Minimum,Maximum,...,Autocorr6,Autocorr7,Autocorr8,Autocorr9,Autocorr10,Autocorr11,h1,h2,h3,h4
0,151,2015-03-02,1,0.0,0.0,0.0,0.0,1,-821.62874,1471.43776,...,-0.165465,0.022262,-0.129806,-0.342292,0.006002,-0.134922,9054.881839,2749.134118,5402.793031,5328.034727
1,151,2015-03-02,2,0.0,0.0,0.0,0.0,1,-821.62874,1471.43776,...,-0.018238,-0.019012,-0.313347,-0.355844,0.141066,-0.281861,9200.190073,2875.733548,5292.373492,5310.124693
2,151,2015-03-02,3,0.0,0.0,0.0,0.0,1,-821.62874,1471.43776,...,0.141426,-0.045229,-0.289267,-0.329412,0.007241,-0.226099,9203.629436,2877.617696,5292.140348,5313.078588



Processing file: C:\Users\lamia\Desktop\datasets\Dataset3_knn.csv
Dropping columns: ['calving', 'mastitis', 'lameness', 'other_disease', 'label', 'physio_label']

Processing complete! Results saved to C:\Users\lamia\Desktop\datasets\Dataset3_shifted.csv
Final dataset shape: (25495, 36)

First 3 rows of the final dataset:


Unnamed: 0,cow,start_date,start_hour,oestrus,ok,Minimum,Maximum,Mean,RMS,STD,...,Autocorr6,Autocorr7,Autocorr8,Autocorr9,Autocorr10,Autocorr11,h1,h2,h3,h4
0,1565,2013-10-01,1,0.0,1,-828.0,1412.17404,-193.86986,734.876162,708.842332,...,0.151619,0.190692,-0.008341,0.15677,0.056961,-0.192075,2226.686343,4369.358019,6621.015109,1946.468836
1,1565,2013-10-01,2,0.0,1,-828.0,1412.17404,-207.241797,744.775351,715.360861,...,0.072481,0.206264,0.009336,0.281243,0.085952,-0.101545,2341.385745,4677.507679,6809.627392,1984.248211
2,1565,2013-10-01,3,0.0,1,-828.0,1412.17404,-203.343851,740.675355,712.215739,...,-0.014553,0.130539,0.207801,0.282496,0.248442,-0.060675,2327.52103,4586.928892,6823.122923,1895.824696



Processing file: C:\Users\lamia\Desktop\datasets\Dataset4_knn.csv
Dropping columns: ['label', 'physio_label']


  c /= stddev[:, None]
  c /= stddev[None, :]



Processing complete! Results saved to C:\Users\lamia\Desktop\datasets\Dataset4_shifted.csv
Final dataset shape: (178296, 40)

First 3 rows of the final dataset:


Unnamed: 0,cow,start_date,start_hour,oestrus,calving,mastitis,lameness,other_disease,ok,Minimum,...,Autocorr6,Autocorr7,Autocorr8,Autocorr9,Autocorr10,Autocorr11,h1,h2,h3,h4
0,35687,2014-12-01,1,0.0,0.0,0.0,0.0,0.0,1,-827.99977,...,-0.081995,-0.257041,-0.539488,-0.422127,-0.256344,-0.107826,8478.89102,6577.72112,296.440552,5024.953049
1,35687,2014-12-01,2,0.0,0.0,0.0,0.0,0.0,1,-827.99977,...,-0.159033,-0.330226,-0.662678,-0.30131,-0.254065,-0.231518,8594.183964,6455.79081,413.889124,4985.175283
2,35687,2014-12-01,3,0.0,0.0,0.0,0.0,0.0,1,-827.99977,...,-0.160975,-0.377209,-0.629322,-0.383544,-0.11653,-0.162232,8645.428641,6424.431125,463.39749,5021.595465


In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta
from scipy import stats, fft

# 1. Cleaning Function
def clean_data(df, columns_to_drop):
    """
    Removes the listed columns and returns the resulting DataFrame.

    The listed names are lower-cased before matching. A name with no
    matching column is ignored rather than treated as an error. The names
    are printed as they were passed in.
    """
    print(f"Dropping columns: {columns_to_drop}")
    lowered_names = [requested.lower() for requested in columns_to_drop]
    return df.drop(columns=lowered_names, errors='ignore')

# 2. Feature Calculation Function
def calculate_features(window):
    """
    Calculates 31 summary features for one window of activity data.

    The window is expected to hold 24 values: the 6-hour statistics slice it
    at the fixed offsets 0-6, 6-12, 12-18 and 18-24, and four FFT magnitudes
    are only produced when at least five samples are available.

    Args:
        window: Sequence (list or array) of numeric activity values.

    Returns:
        dict mapping feature name to value. For a 24-sample window the keys
        are Minimum, Maximum, Mean, RMS, STD, MeanSTD6h, STDMean6h, RMSSD,
        Mode, Q10, Q25, Q50, Q75, Q90, Skewness, Kurtosis,
        Autocorr1..Autocorr11 and h1..h4. Autocorrelations are NaN when a
        segment is constant (zero variance) or the window is too short.
    """
    features = {}
    window = np.array(window)

    # Basic level and spread statistics.
    features['Minimum'] = np.min(window)
    features['Maximum'] = np.max(window)
    features['Mean'] = np.mean(window)
    features['RMS'] = np.sqrt(np.mean(np.square(window)))
    features['STD'] = np.std(window)

    # Statistics over four consecutive blocks of six samples.
    six_h_windows = [window[i*6:(i+1)*6] for i in range(4)]
    stds_6h = [np.std(w) for w in six_h_windows]
    means_6h = [np.mean(w) for w in six_h_windows]
    features['MeanSTD6h'] = np.mean(stds_6h)
    features['STDMean6h'] = np.std(means_6h)

    # Root mean square of successive differences.
    diffs = np.diff(window)
    features['RMSSD'] = np.sqrt(np.mean(np.square(diffs)))

    try:
        features['Mode'] = stats.mode(window, keepdims=True)[0][0]
    except TypeError:
        # SciPy releases that predate the `keepdims` argument reject the call
        # above. Compute the mode directly instead of substituting an
        # arbitrary sample: np.unique returns sorted values, so argmax picks
        # the smallest of the most frequent values.
        values, counts = np.unique(window, return_counts=True)
        features['Mode'] = values[np.argmax(counts)]

    for p in [10, 25, 50, 75, 90]:
        features[f'Q{p}'] = np.percentile(window, p)
    features['Skewness'] = stats.skew(window)
    features['Kurtosis'] = stats.kurtosis(window)

    # Lagged autocorrelation. A constant segment has zero variance, so its
    # correlation is undefined and np.corrcoef yields NaN; only the
    # accompanying divide/invalid RuntimeWarning is silenced here, the NaN
    # value itself is kept.
    with np.errstate(divide='ignore', invalid='ignore'):
        for lag in range(1, 12):
            if len(window) > lag:
                corr = np.corrcoef(window[:-lag], window[lag:])[0, 1]
            else:
                corr = np.nan
            features[f'Autocorr{lag}'] = corr

    # Magnitudes of the first four non-DC FFT components.
    fft_values = np.abs(fft.fft(window))[1:5]
    for i, h in enumerate(fft_values, 1):
        features[f'h{i}'] = h
    return features

# 3. Window Creation Function
def create_shifted_windows(df, window_size=24, shift=1):
    """
    Builds overlapping activity windows per cow, keeping the full last row.

    The frame is sorted by cow, date and hour and re-indexed. For each cow a
    window of `window_size` consecutive rows is taken every `shift` rows.
    Each output row holds every column of the window's final input row plus
    an 'activity_window' column with the window's activity values as a list.
    An empty DataFrame is returned when no window could be built.
    """
    ordered = df.sort_values(['cow', 'date', 'hour']).reset_index(drop=True)
    records = []

    for cow_id, cow_rows in ordered.groupby('cow'):
        if 'activity_level' not in cow_rows.columns:
            print(f"Skipping cow {cow_id}: 'activity_level' column not found.")
            continue

        levels = cow_rows['activity_level'].values
        last_start = len(levels) - window_size

        for start in range(0, last_start + 1, shift):
            stop = start + window_size
            # Start from the last row of the window, then attach the values.
            record = cow_rows.iloc[stop - 1].to_dict()
            record['activity_window'] = levels[start:stop].tolist()
            records.append(record)

    if len(records) == 0:
        print("No windows created. Please check your data and window size.")
        return pd.DataFrame()

    return pd.DataFrame(records)

# 4. Main Processing Pipeline
def process_single_dataset(input_path, output_path, label_columns, columns_to_drop):
    """
    Runs the full pipeline on one CSV file and writes the result to disk.

    The file is read, its column names are stripped and lower-cased, 'date'
    is parsed, the requested columns are dropped, windows are built and the
    features of every window are appended to the columns of the window's
    last row. `label_columns` is accepted but not used by this function.

    Returns the combined DataFrame, or an empty DataFrame when the input
    file is missing or no window was created.
    """
    print(f"\nProcessing file: {input_path}")
    try:
        raw_df = pd.read_csv(input_path)
    except FileNotFoundError:
        print(f"Error: The file at {input_path} was not found. Skipping.")
        return pd.DataFrame()

    # Normalise headers, then parse the date column.
    raw_df.columns = raw_df.columns.str.strip().str.lower()
    raw_df['date'] = pd.to_datetime(raw_df['date'])

    # Drop unwanted columns from a copy of the loaded frame.
    trimmed_df = clean_data(raw_df.copy(), columns_to_drop)

    # One row per window, carrying every column of the window's last row.
    windows_df = create_shifted_windows(trimmed_df)
    if windows_df.empty:
        return pd.DataFrame()

    # Features per window, in the same row order as the windows.
    features_df = pd.DataFrame(
        list(map(calculate_features, windows_df['activity_window']))
    )

    # The raw window list is only an intermediate; replace it by the features.
    context_df = windows_df.drop(columns=['activity_window'])
    final_df = pd.concat([context_df, features_df], axis=1)

    try:
        final_df.to_csv(output_path, index=False)
        print(f"\nProcessing complete! Results saved to {output_path}")
        print(f"Final dataset shape: {final_df.shape}")
        print("\nFirst 3 rows of the final dataset:")
        preview = final_df.head(3)
        print(preview.to_string())
    except Exception as exc:
        print(f"Error saving file {output_path}: {exc}")
    return final_df

# 5. Main Script to run the pipeline on multiple files
def process_multiple_datasets(dataset_info):
    """
    Processes each configured dataset in turn.

    `dataset_info` is an iterable of mappings with the keys 'input_path',
    'output_path', 'labels' and 'columns_to_drop'. The values are handed to
    the single-dataset pipeline in that order, and its return value is
    ignored.
    """
    for entry in dataset_info:
        in_path, out_path = entry['input_path'], entry['output_path']
        label_names = entry['labels']
        unwanted = entry['columns_to_drop']
        process_single_dataset(in_path, out_path, label_names, unwanted)

if __name__ == '__main__':
    # One entry per dataset: the CSV to read, the CSV to write, the label
    # names handed to the pipeline and the columns to drop before windowing.
    datasets_to_process = [
        dict(
            input_path=r'C:\Users\lamia\Desktop\datasets\Dataset1_knn.csv',
            output_path=r'C:\Users\lamia\Desktop\datasets\Dataset1_shift.csv',
            labels=['oestrus', 'calving', 'mastitis', 'lameness', 'other_disease', 'ok'],
            columns_to_drop=['label', 'physio_label'],
        ),
        dict(
            input_path=r'C:\Users\lamia\Desktop\datasets\Dataset2_knn.csv',
            output_path=r'C:\Users\lamia\Desktop\datasets\Dataset2_shift.csv',
            labels=['oestrus', 'mastitis', 'lameness', 'other_disease', 'ok'],
            columns_to_drop=['label', 'physio_label', 'calving'],
        ),
        dict(
            input_path=r'C:\Users\lamia\Desktop\datasets\Dataset3_knn.csv',
            output_path=r'C:\Users\lamia\Desktop\datasets\Dataset3_shift.csv',
            labels=['oestrus', 'ok'],
            columns_to_drop=['calving', 'mastitis', 'lameness', 'other_disease', 'label', 'physio_label'],
        ),
        dict(
            input_path=r'C:\Users\lamia\Desktop\datasets\Dataset4_knn.csv',
            output_path=r'C:\Users\lamia\Desktop\datasets\Dataset4_shift.csv',
            labels=['oestrus', 'calving', 'mastitis', 'lameness', 'other_disease', 'ok'],
            columns_to_drop=['label', 'physio_label'],
        ),
    ]

    # Process every configured dataset in order.
    process_multiple_datasets(datasets_to_process)

  from pandas.core import (



Processing file: C:\Users\lamia\Desktop\datasets\Dataset1_knn.csv
Dropping columns: ['label', 'physio_label']

Processing complete! Results saved to C:\Users\lamia\Desktop\datasets\Dataset1_shift.csv
Final dataset shape: (105169, 44)

First 3 rows of the final dataset:
    cow       date  hour  in_alleys      rest       eat  activity_level  oestrus  calving  lameness  mastitis  other_disease  ok    Minimum     Maximum       Mean         RMS         STD   MeanSTD6h   STDMean6h       RMSSD       Mode         Q10        Q25        Q50         Q75         Q90  Skewness  Kurtosis  Autocorr1  Autocorr2  Autocorr3  Autocorr4  Autocorr5  Autocorr6  Autocorr7  Autocorr8  Autocorr9  Autocorr10  Autocorr11           h1           h2           h3           h4
0  6601 2018-10-26    23     35.875  3564.119     0.000      -814.00737      0.0      0.0       0.0       0.0            0.0   1 -823.60059  1124.20995  25.020170  714.387775  713.949497  522.056078  463.501708  922.730447 -823.60059 -815.977

  c /= stddev[:, None]
  c /= stddev[None, :]



Processing complete! Results saved to C:\Users\lamia\Desktop\datasets\Dataset4_shift.csv
Final dataset shape: (178296, 44)

First 3 rows of the final dataset:
     cow       date  hour  in_alleys      rest       eat  activity_level  oestrus  calving  lameness  mastitis  other_disease  ok    Minimum     Maximum       Mean         RMS         STD   MeanSTD6h   STDMean6h       RMSSD       Mode         Q10         Q25         Q50        Q75          Q90  Skewness  Kurtosis  Autocorr1  Autocorr2  Autocorr3  Autocorr4  Autocorr5  Autocorr6  Autocorr7  Autocorr8  Autocorr9  Autocorr10  Autocorr11           h1           h2          h3           h4
0  35687 2014-12-01    24    692.013   690.594  2115.623       840.44712      0.0      0.0       0.0       0.0            0.0   1 -827.99977  1487.30062  -3.057357  840.047072  840.041508  587.915639  507.296295  810.957159 -827.99977 -822.352726 -768.219140 -281.168225  642.11166  1371.245356  0.566935 -1.187437   0.539592   0.370940   0.187888   0