Data loading and preprocessing without augmentation

In [3]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import os

def load_data_from_folders(base_dir, activity_folders):
    column_names = ['time', 'avg_rss12', 'var_rss12', 'avg_rss13', 'var_rss13', 'avg_rss23', 'var_rss23']
    all_data = pd.DataFrame()
    label_encoder = LabelEncoder()
    scaler = StandardScaler()
    
    # Load data from each folder
    for activity in activity_folders:
        folder_path = os.path.join(base_dir, activity)
        files = [f for f in os.listdir(folder_path) if not os.path.isdir(os.path.join(folder_path, f))]
        activity_data = pd.DataFrame()
        
        for file in files:
            file_path = os.path.join(folder_path, file)
            try:
                # Ensure data does not include any non-numeric values or unexpected strings
                temp_df = pd.read_csv(file_path, comment='#', header=None, names=column_names, on_bad_lines='skip')
                temp_df = temp_df.dropna()  # Drop any rows with NaN values
                temp_df = temp_df[~temp_df.applymap(lambda x: isinstance(x, str)).any(axis=1)]  # Remove any rows that contain strings
                temp_df['label'] = activity
                activity_data = pd.concat([activity_data, temp_df], ignore_index=True)
            except pd.errors.ParserError as e:
                print(f"Skipping file {file} due to errors: {e}")
        
        all_data = pd.concat([all_data, activity_data], ignore_index=True)

    # Encode labels
    all_data['label'] = label_encoder.fit_transform(all_data['label'])
    
    # Check for non-numeric values
    feature_columns = ['avg_rss12', 'var_rss12', 'avg_rss13', 'var_rss13', 'avg_rss23', 'var_rss23']
    all_data[feature_columns] = all_data[feature_columns].apply(pd.to_numeric, errors='coerce')  # Coerce any remaining non-numeric entries to NaN, and then drop them
    all_data = all_data.dropna(subset=feature_columns)
    
    # Normalize features
    all_data[feature_columns] = scaler.fit_transform(all_data[feature_columns])
    
    return all_data, label_encoder.classes_




# Create sequences from the data
def create_sequences(data, seq_length):
    sequences = []
    labels = []
    feature_columns = ['avg_rss12', 'var_rss12', 'avg_rss13', 'var_rss13', 'avg_rss23', 'var_rss23']
    for i in range(len(data) - seq_length):
        seq = data[feature_columns].iloc[i:i + seq_length].values
        label = data['label'].iloc[i + seq_length - 1]
        sequences.append(seq)
        labels.append(label)
    return np.array(sequences), np.array(labels)

# Parameters
base_dir = '/Users/habib/Desktop/DL/data/Activity-Recognition-system-based-on-Multisensor-data-fusion-(AReM)'
activity_folders = ['bending1', 'bending2', 'cycling', 'lying', 'sitting', 'standing', 'walking']
seq_length = 50

# Load and preprocess data
data, classes = load_data_from_folders(base_dir, activity_folders)

# Generate sequences
sequences, labels = create_sequences(data, seq_length)

# Split data and convert to tensors
X_train_val, X_test, y_train_val, y_test = train_test_split(sequences, labels, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 20% of the entire dataset for validation

X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
X_val = torch.tensor(X_val, dtype=torch.float32)
y_val = torch.tensor(y_val, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.long)

# Create DataLoaders
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=64, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=64, shuffle=False)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=64, shuffle=False)

print(train_loader)


  temp_df = temp_df[~temp_df.applymap(lambda x: isinstance(x, str)).any(axis=1)]  # Remove any rows that contain strings
  temp_df = temp_df[~temp_df.applymap(lambda x: isinstance(x, str)).any(axis=1)]  # Remove any rows that contain strings
  temp_df = temp_df[~temp_df.applymap(lambda x: isinstance(x, str)).any(axis=1)]  # Remove any rows that contain strings
  temp_df = temp_df[~temp_df.applymap(lambda x: isinstance(x, str)).any(axis=1)]  # Remove any rows that contain strings
  temp_df = temp_df[~temp_df.applymap(lambda x: isinstance(x, str)).any(axis=1)]  # Remove any rows that contain strings
  temp_df = temp_df[~temp_df.applymap(lambda x: isinstance(x, str)).any(axis=1)]  # Remove any rows that contain strings
  temp_df = temp_df[~temp_df.applymap(lambda x: isinstance(x, str)).any(axis=1)]  # Remove any rows that contain strings
  temp_df = temp_df[~temp_df.applymap(lambda x: isinstance(x, str)).any(axis=1)]  # Remove any rows that contain strings
  temp_df = temp_df[~temp_df.app

<torch.utils.data.dataloader.DataLoader object at 0x1079df490>


In [4]:
print("x/y_train shape ", X_train.shape, y_train.shape)
print("x/y_validation shape ", X_val.shape, y_val.shape)
print("x/y_validation shape ", X_test.shape, y_test.shape)

x/y_train shape  torch.Size([25023, 50, 6]) torch.Size([25023])
x/y_validation shape  torch.Size([8342, 50, 6]) torch.Size([8342])
x/y_validation shape  torch.Size([8342, 50, 6]) torch.Size([8342])


In [7]:
def time_shift(sample, max_shift):
    """
    Perform time-based augmentation by randomly shifting the time axis of the input sample.
    
    Args:
    - sample (ndarray): Input sample to be augmented.
    - max_shift (int): Maximum number of time steps to shift the sample.
    
    Returns:
    - augmented_sample (ndarray): Augmented sample with the time shift applied.
    """
    shift = np.random.randint(-max_shift, max_shift + 1)
    if shift >= 0:
        augmented_sample = np.pad(sample[:-shift], ((shift, 0), (0, 0)), mode='constant')
    else:
        augmented_sample = np.pad(sample[-shift:], ((0, -shift), (0, 0)), mode='constant')
    return augmented_sample


In [8]:
def add_noise(sample, noise_level):
    """
    Add Gaussian noise to the input sample.
    
    Args:
    - sample (ndarray): Input sample to which noise will be added.
    - noise_level (float): Standard deviation of the Gaussian noise.
    
    Returns:
    - noisy_sample (ndarray): Sample with noise added.
    """
    noise = np.random.normal(scale=noise_level, size=sample.shape)
    noisy_sample = sample + noise
    return noisy_sample


In [9]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import os

def load_data_from_folders(base_dir, activity_folders):
    column_names = ['time', 'avg_rss12', 'var_rss12', 'avg_rss13', 'var_rss13', 'avg_rss23', 'var_rss23']
    all_data = pd.DataFrame()
    label_encoder = LabelEncoder()
    scaler = StandardScaler()
    
    # Load data from each folder
    for activity in activity_folders:
        folder_path = os.path.join(base_dir, activity)
        files = [f for f in os.listdir(folder_path) if not os.path.isdir(os.path.join(folder_path, f))]
        activity_data = pd.DataFrame()
        
        for file in files:
            file_path = os.path.join(folder_path, file)
            try:
                # Ensure data does not include any non-numeric values or unexpected strings
                temp_df = pd.read_csv(file_path, comment='#', header=None, names=column_names, on_bad_lines='skip')
                temp_df = temp_df.dropna()  # Drop any rows with NaN values
                temp_df = temp_df[~temp_df.applymap(lambda x: isinstance(x, str)).any(axis=1)]  # Remove any rows that contain strings
                temp_df['label'] = activity
                activity_data = pd.concat([activity_data, temp_df], ignore_index=True)
            except pd.errors.ParserError as e:
                print(f"Skipping file {file} due to errors: {e}")
        
        all_data = pd.concat([all_data, activity_data], ignore_index=True)

    # Encode labels
    all_data['label'] = label_encoder.fit_transform(all_data['label'])
    
    # Check for non-numeric values
    feature_columns = ['avg_rss12', 'var_rss12', 'avg_rss13', 'var_rss13', 'avg_rss23', 'var_rss23']
    all_data[feature_columns] = all_data[feature_columns].apply(pd.to_numeric, errors='coerce')  # Coerce any remaining non-numeric entries to NaN, and then drop them
    all_data = all_data.dropna(subset=feature_columns)
    
    # Normalize features
    all_data[feature_columns] = scaler.fit_transform(all_data[feature_columns])
    
    return all_data, label_encoder.classes_

def create_sequences(data, seq_length):
    sequences = []
    labels = []
    feature_columns = ['avg_rss12', 'var_rss12', 'avg_rss13', 'var_rss13', 'avg_rss23', 'var_rss23']
    for i in range(len(data) - seq_length):
        seq = data[feature_columns].iloc[i:i + seq_length].values
        label = data['label'].iloc[i + seq_length - 1]
        sequences.append(seq)
        labels.append(label)
    return np.array(sequences), np.array(labels)

def augment_data(samples, target_length):
    augmented_samples = []
    for sample in samples:
        augmented_sample = sample.copy()  # Make a copy to avoid modifying the original data
        augmented_sample = time_shift(augmented_sample, max_shift=5)  # Example: Maximum shift of 5 time steps
        augmented_sample = add_noise(augmented_sample, noise_level=0.1)  # Example: Add Gaussian noise with standard deviation 0.1
        
        # Pad or crop the sample to the target length
        if augmented_sample.shape[0] < target_length:
            pad_width = ((0, target_length - augmented_sample.shape[0]), (0, 0))
            augmented_sample = np.pad(augmented_sample, pad_width, mode='constant', constant_values=0)
        elif augmented_sample.shape[0] > target_length:
            augmented_sample = augmented_sample[:target_length]
        
        augmented_samples.append(augmented_sample)
    
    return np.array(augmented_samples)

# Parameters
base_dir = '/Users/habib/Desktop/University/Spring-2024/CS73389H/Project/activity+recognition+system+based+on+multisensor+data+fusion+arem'
activity_folders = ['bending1', 'bending2', 'cycling', 'lying', 'sitting', 'standing', 'walking']
seq_length = 50
target_length = 50  # Length to which augmented samples will be padded or cropped

# Load and preprocess data
data, classes = load_data_from_folders(base_dir, activity_folders)

# Generate sequences
sequences, labels = create_sequences(data, seq_length)

# Split data into train, val, and test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(sequences, labels, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 20% of the entire dataset for validation

# Augment training data
X_train_augmented = augment_data(X_train, target_length=target_length)
X_train = np.concatenate([X_train, X_train_augmented], axis=0)
y_train = np.concatenate([y_train, y_train], axis=0)

# Convert data to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
X_val = torch.tensor(X_val, dtype=torch.float32)
y_val = torch.tensor(y_val, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.long)

# Create DataLoaders
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=64, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=64, shuffle=False)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=64, shuffle=False)


  temp_df = temp_df[~temp_df.applymap(lambda x: isinstance(x, str)).any(axis=1)]  # Remove any rows that contain strings
  temp_df = temp_df[~temp_df.applymap(lambda x: isinstance(x, str)).any(axis=1)]  # Remove any rows that contain strings
  temp_df = temp_df[~temp_df.applymap(lambda x: isinstance(x, str)).any(axis=1)]  # Remove any rows that contain strings
  temp_df = temp_df[~temp_df.applymap(lambda x: isinstance(x, str)).any(axis=1)]  # Remove any rows that contain strings
  temp_df = temp_df[~temp_df.applymap(lambda x: isinstance(x, str)).any(axis=1)]  # Remove any rows that contain strings
  temp_df = temp_df[~temp_df.applymap(lambda x: isinstance(x, str)).any(axis=1)]  # Remove any rows that contain strings
  temp_df = temp_df[~temp_df.applymap(lambda x: isinstance(x, str)).any(axis=1)]  # Remove any rows that contain strings
  temp_df = temp_df[~temp_df.applymap(lambda x: isinstance(x, str)).any(axis=1)]  # Remove any rows that contain strings
  temp_df = temp_df[~temp_df.app

In [10]:
print("x/y_train shape ", X_train.shape, y_train.shape)
print("x/y_validation shape ", X_val.shape, y_val.shape)
print("x/y_validation shape ", X_test.shape, y_test.shape)

x/y_train shape  torch.Size([50046, 50, 6]) torch.Size([50046])
x/y_validation shape  torch.Size([8342, 50, 6]) torch.Size([8342])
x/y_validation shape  torch.Size([8342, 50, 6]) torch.Size([8342])
