<a href="https://colab.research.google.com/github/jamesduol/interpretable-ml-book/blob/master/HHAR_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import glob
import numpy as np
import matplotlib.pyplot as plt # import the matplotlib library
import os

from scipy import stats  # Add this import statement
import tensorflow as tf
import seaborn as sns

from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, LSTM, Dense, Flatten
from tensorflow.keras.utils import to_categorical
import pickle
import matplotlib.pyplot as plt

from pylab import rcParams
from sklearn import metrics
from sklearn.model_selection import train_test_split

%matplotlib inline
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
rcParams['figure.figsize'] = 14, 8
RANDOM_SEED = 42
plt.rc('text', usetex=True)
plt.rc('font', family='serif')
plt.rcParams['text.usetex'] = False

In [8]:
!wget https://archive.ics.uci.edu/static/public/344/heterogeneity+activity+recognition.zip

--2024-09-07 10:13:11--  https://archive.ics.uci.edu/static/public/344/heterogeneity+activity+recognition.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘heterogeneity+activity+recognition.zip’

heterogeneity+activ     [             <=>    ] 784.01M  26.7MB/s    in 33s     

2024-09-07 10:13:44 (23.8 MB/s) - ‘heterogeneity+activity+recognition.zip’ saved [822098071]



In [9]:
%%capture
!unzip heterogeneity+activity+recognition.zip -O hhar_exp.zip
!unzip hhar_exp.zip

In [11]:
!unzip -l heterogeneity+activity+recognition.zip

Archive:  heterogeneity+activity+recognition.zip
  Length      Date    Time    Name
---------  ---------- -----   ----
777127275  2023-05-22 15:21   Activity recognition exp.zip
 44970540  2023-05-22 15:21   Still exp.zip
---------                     -------
822097815                     2 files


In [12]:
!unzip 'heterogeneity+activity+recognition.zip'

Archive:  heterogeneity+activity+recognition.zip
 extracting: Activity recognition exp.zip  
 extracting: Still exp.zip           


In [13]:
!unzip 'Activity recognition exp.zip'

Archive:  Activity recognition exp.zip
   creating: Activity recognition exp/
  inflating: Activity recognition exp/.DS_Store  
   creating: __MACOSX/
   creating: __MACOSX/Activity recognition exp/
  inflating: __MACOSX/Activity recognition exp/._.DS_Store  
  inflating: Activity recognition exp/Phones_accelerometer.csv  
  inflating: Activity recognition exp/Phones_gyroscope.csv  
  inflating: Activity recognition exp/readme.txt  
  inflating: __MACOSX/Activity recognition exp/._readme.txt  
  inflating: Activity recognition exp/Watch_accelerometer.csv  
  inflating: Activity recognition exp/Watch_gyroscope.csv  


In [2]:
# Read each CSV file into a separate DataFrame
df1 = pd.read_csv('Activity recognition exp/Phones_accelerometer.csv')
df2 = pd.read_csv('Activity recognition exp/Phones_gyroscope.csv')
df3 = pd.read_csv('Activity recognition exp/Watch_accelerometer.csv')
df4 = pd.read_csv('Activity recognition exp/Watch_gyroscope.csv')

# Concatenate the DataFrames
df = pd.concat([df1, df2, df3, df4], axis=0) # Concatenate DataFrames vertically

df.head()

Unnamed: 0,Index,Arrival_Time,Creation_Time,x,y,z,User,Model,Device,gt
0,0,1424696633908,1424696631913248572,-5.958191,0.688065,8.135345,a,nexus4,nexus4_1,stand
1,1,1424696633909,1424696631918283972,-5.95224,0.670212,8.136536,a,nexus4,nexus4_1,stand
2,2,1424696633918,1424696631923288855,-5.995087,0.653549,8.204376,a,nexus4,nexus4_1,stand
3,3,1424696633919,1424696631928385290,-5.942718,0.676163,8.128204,a,nexus4,nexus4_1,stand
4,4,1424696633929,1424696631933420691,-5.991516,0.641647,8.135345,a,nexus4,nexus4_1,stand


In [3]:
activities_map = {
     "0": "other",
     "1": "lying",
     "2": "sitting",
     "3": "standing",
     "4": "walking",
     "5": "running",
     "6": "cycling",
     "7": "Nordic walking",
     "9": "watching TV",
    "10": "computer work",
    "11": "car driving",
    "12": "ascending stairs",
    "13": "descending stairs",
    "16": "vacuum cleaning",
    "17": "ironing",
    "18": "folding laundry",
    "19": "house cleaning",
    "20": "playing soccer",
    "24": "rope jumping"
}

In [None]:
print("Starting")

# Define the column names for the accelerometer and gyroscope data
column_names = ['timestamp', 'x', 'y', 'z', 'label']

# Initialize an empty DataFrame
df = pd.DataFrame(columns=column_names)

# List to hold data frames for concatenation
data_frames = []

# Load CSV files from 'Activity recognition exp/' directory
for file in glob.glob(os.path.join('Activity recognition exp/', '*.csv')):
    # Extract the subject/device info from the file name
    file_name = os.path.basename(file).split('.')[0]
    print(f"Processing: {file_name}")

    # Read the CSV file
    dft = pd.read_csv(file, sep=",")

    # Add a column to identify the device/subject based on the file name
    dft['device'] = file_name

    # Append the DataFrame to the list
    data_frames.append(dft)

# Concatenate all the data frames into one DataFrame
df = pd.concat(data_frames, ignore_index=True)

print("Finished")
print(df.head())  # Display the first few rows of the concatenated DataFrame

Starting
Processing: Watch_accelerometer
Processing: Phones_gyroscope
Processing: Watch_gyroscope
Processing: Phones_accelerometer


In [None]:
# Define the directory where the unzipped files are located
directory = 'Activity recognition exp/'

# Define column names for the accelerometer and gyroscope data
column_names = ['timestamp', 'x', 'y', 'z', 'label']
df = pd.DataFrame(columns=column_names)

# Iterate through each relevant CSV file in the 'Activity recognition exp/' directory
for file in glob.glob(os.path.join(directory, '*.csv')):
    file_name = os.path.basename(file).split('.')[0]  # Get the base name without extension
    print(f"Processing: {file_name}")

    # Read the CSV file
    dft = pd.read_csv(file, sep=",")

    # Drop columns that are completely empty (all-NA)
    dft = dft.dropna(axis=1, how='all')

    # Add a column to track the file or device source
    dft['source'] = file_name

    # Append the data to the main DataFrame
    df = pd.concat([df, dft], ignore_index=True)

# Display the combined DataFrame
print(df.head())

In [None]:
# Check if the columns 'index' and 'Unnamed: 0' exist before attempting to drop them
columns_to_drop = ['index', 'Unnamed: 0']
existing_columns_to_drop = [col for col in columns_to_drop if col in df.columns]

if existing_columns_to_drop:
    df.drop(labels=existing_columns_to_drop, axis='columns', inplace=True)

# Optionally convert 'timestamp' to datetime if needed
# df['timestamp'] = pd.to_datetime(df['timestamp'])

df.head()

In [None]:
# Summary statistics
print(df.describe())

# Check for missing values
print(df.isnull().sum())

# Handle missing values (if any)
df = df.dropna()

In [None]:
# Check column names in the DataFrame
print(df.columns)

In [None]:
# Replace 'label' with the actual column name if it is different
correct_label_column = 'label'  # Change this to the correct column name if different

# Visualize the distribution of the labels
plt.figure(figsize=(12, 6))
sns.countplot(x=correct_label_column, data=df)
plt.title('Distribution of Activity Labels')
plt.xlabel('Activity Label')
plt.ylabel('Count')
plt.show()