# Final Project - Group M

Team Members:
1. Akshay Augustine Sheby - 5123774
2. Krishnapriya Krishnan Santhadevi - 5123779
3. Megha Eldho - 5123773
4. Ranjitha Umesh - 5123734

# Data preprocessing 

1. Import libraries:
The required libraries are imported at the beginning of the code.

2. Data Loading:
   - An empty list called `dataframes` is created to store DataFrames.
   - The variable `data_folder` is set to the path of the data folder.
   - A loop iterates over each folder in the `data_folder`.
   - Within the loop, it checks if the current path is a directory using `os.path.isdir()`.
   - If the current path is a directory, another loop iterates over the files in that folder.

3. Data Processing:
   - It checks if the file is a `.csv` file starting with "Clipped" using `file_name.startswith('Clipped')` and `file_name.endswith('.csv')`.
   - If it meets the condition, the CSV file is read using `pd.read_csv()` and stored in a DataFrame called `dataframe`.
   - The corresponding `.stepMixed` file path is obtained by removing "Clipped" from the CSV file name and appending ".stepMixed" to it; the ".csv" extension is kept (e.g. `ClippedX.csv` → `X.csv.stepMixed`).
   - The `.stepMixed` file is read using `pd.read_csv()` with `header=None` to treat the first row as data, and the resulting DataFrame is stored in `stepmix_data`.
   - Column names ("start" and "end") are assigned to `stepmix_data` using `stepmix_data.columns`.
   - Two new columns named 'start_step_labels' and 'end_step_labels' are added to the `dataframe` DataFrame, initially filled with NaN values.
   - Iterating over each row in `stepmix_data`, it retrieves the start and end step values.
   - The corresponding rows in `dataframe` are marked with 1 for 'start_step_labels' and 'end_step_labels'.
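   - Additionally, for every offset from 1 to `k`, the rows that many positions before and after each start and end step are also marked with 1, so each step boundary is labelled over a window of `2k + 1` rows in 'start_step_labels' and 'end_step_labels' respectively.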

4. Data Aggregation:
   - The modified `dataframe` is appended to the `dataframes` list.
   - After processing all the files in the folders, the DataFrames in `dataframes` are concatenated into a single DataFrame called `training_dataset` using `pd.concat()`.
   - Column names are modified to remove spaces using `training_dataset.columns.str.replace(" ", "")`.
   - The 'Activity' column is dropped from `training_dataset` using `training_dataset.drop()`.
   - Missing values in `training_dataset` are filled with 0 using `training_dataset.fillna(0)`.
   - The index of `training_dataset` is reset using `training_dataset.reset_index(drop=True)`.

5. Data Export:
   - The `training_dataset` DataFrame is saved to a CSV file called "Kaggle competition dataset/data_new_+10_-10.csv" using `training_dataset.to_csv()`.

In [1]:
#Import required libraries
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision.transforms import ToTensor
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt

def _mark_windows(n_rows, centers, half_width):
    """Build a label column with 1 in a window around every step boundary.

    Parameters
    ----------
    n_rows : int
        Number of rows in the recording the labels belong to.
    centers : iterable of numbers
        Row positions of the step boundaries (start or end samples).
    half_width : int
        Number of extra rows labelled on each side of every boundary.

    Returns
    -------
    numpy.ndarray
        Float array of length ``n_rows``: 1 inside a window, NaN elsewhere.

    Windows are clipped to ``[0, n_rows)``. Assigning through
    ``DataFrame.loc`` with a label that does not exist (e.g. ``-1`` or
    ``n_rows``) appends a new row instead of being ignored, which would
    inject an all-NaN (later all-zero) fake sample into the recording.
    """
    labels = np.full(n_rows, np.nan)
    for center in centers:
        lo = max(int(center) - half_width, 0)
        hi = min(int(center) + half_width + 1, n_rows)
        if lo < hi:  # Skip boundaries that lie completely outside the recording
            labels[lo:hi] = 1
    return labels


dataframes = []  # One labelled DataFrame per recording
data_folder = 'Kaggle competition dataset/data'  # Path to the data folder

k = 1  # Number of neighbouring rows labelled on each side of a step boundary

# Loop over each entry in the data folder
for folder_name in os.listdir(data_folder):
    folder_path = os.path.join(data_folder, folder_name)  # Full path of the current entry
    print("Accessing ----> ", folder_path)  # Log every entry, including ones that are skipped

    if not os.path.isdir(folder_path):  # Plain files (.DS_Store, testdata.csv, ...) hold no recordings
        continue

    # Iterate over the files in the current folder
    for file_name in os.listdir(folder_path):

        # Only the clipped sensor recordings ("Clipped*.csv") are used for training
        if not (file_name.startswith('Clipped') and file_name.endswith('.csv')):
            continue

        csv_path = os.path.join(folder_path, file_name)  # Full path of the CSV file
        dataframe = pd.read_csv(csv_path)  # Sensor samples, indexed 0..n-1

        # The annotation file sits next to the recording and is named like the CSV
        # without its "Clipped" prefix, plus ".stepMixed". Only the file-name prefix
        # is stripped, so a "Clipped" in a directory name cannot corrupt the path.
        sm_file_name = os.path.join(folder_path, file_name[len('Clipped'):] + ".stepMixed")
        stepmix_data = pd.read_csv(sm_file_name, header=None)  # No header row: first row is data
        stepmix_data.columns = ["start", "end"]  # Row positions of each step's start and end

        # Label every start/end boundary and its k neighbours on each side with 1;
        # all other rows stay NaN here and become 0 in the fillna() below.
        n_rows = len(dataframe)
        dataframe["start_step_labels"] = _mark_windows(n_rows, stepmix_data["start"].dropna(), k)
        dataframe["end_step_labels"] = _mark_windows(n_rows, stepmix_data["end"].dropna(), k)

        dataframes.append(dataframe)  # Keep the labelled recording

if not dataframes:
    # pd.concat([]) would raise a generic "No objects to concatenate" error
    raise ValueError(f"No 'Clipped*.csv' recordings found under {data_folder!r}")

training_dataset = pd.concat(dataframes)  # Stack all recordings into a single DataFrame
training_dataset.columns = training_dataset.columns.str.replace(" ", "")  # Remove spaces from column names
training_dataset = training_dataset.drop(["Activity"], axis=1)  # Drop the 'Activity' column
training_dataset = training_dataset.fillna(0)  # Unlabelled rows (and any other missing values) become 0
training_dataset = training_dataset.reset_index(drop=True)  # Replace per-recording indices with one running index

# NOTE(review): the file name suggests a +/-10 window while k is 1 here - confirm which is intended.
training_dataset.to_csv("Kaggle competition dataset/data_new_+10_-10.csv")  # Save the training dataset to a CSV file
print(training_dataset)  # Print the training_dataset DataFrame

Accessing ---->  Kaggle competition dataset/data/.DS_Store
Accessing ---->  Kaggle competition dataset/data/person_8
Accessing ---->  Kaggle competition dataset/data/person_12
Accessing ---->  Kaggle competition dataset/data/perosn_2
Accessing ---->  Kaggle competition dataset/data/person_5
Accessing ---->  Kaggle competition dataset/data/testdata.csv
Accessing ---->  Kaggle competition dataset/data/person_3
Accessing ---->  Kaggle competition dataset/data/person_4
Accessing ---->  Kaggle competition dataset/data/person_11
Accessing ---->  Kaggle competition dataset/data/perosn_7
Accessing ---->  Kaggle competition dataset/data/person_10
        AccelX_5  AccelY_5  AccelZ_5   GyroX_5   GyroY_5   GyroZ_5   
0       1.370639  3.077730 -9.138201  0.026021 -0.025069  0.026772  \
1       1.380689  3.039416 -9.200333  0.038649 -0.038450  0.035676   
2       1.378264  2.981465 -9.305405  0.043459 -0.038100  0.031424   
3       1.423814  2.944719 -9.343213  0.042548 -0.028578  0.029073   
4   