<a href="https://colab.research.google.com/github/lilyzhizhou/Datathon-5/blob/main/Datathon_5_LSTM_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# This only works when the notebook runs on Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Read the recordings CSV from the mounted Drive folder and, in the same step,
# leave out subject9 (the authors found that subject's distribution differs
# from the others).
data = (
    pd.read_csv('/content/drive/MyDrive/Machine Learning /Datathon #5/mhealth.csv')
    .loc[lambda frame: frame['subject'] != 'subject9']
)

# Quick sanity check: remaining size and the first few rows.
print(data.shape)
data.head()

(982273, 14)


Unnamed: 0,alx,aly,alz,glx,gly,glz,arx,ary,arz,grx,gry,grz,Activity,subject
0,2.1849,-9.6967,0.63077,0.1039,-0.84053,-0.68762,-8.6499,-4.5781,0.18776,-0.44902,-1.0103,0.034483,0,subject1
1,2.3876,-9.508,0.68389,0.085343,-0.83865,-0.68369,-8.6275,-4.3198,0.023595,-0.44902,-1.0103,0.034483,0,subject1
2,2.4086,-9.5674,0.68113,0.085343,-0.83865,-0.68369,-8.5055,-4.2772,0.27572,-0.44902,-1.0103,0.034483,0,subject1
3,2.1814,-9.4301,0.55031,0.085343,-0.83865,-0.68369,-8.6279,-4.3163,0.36752,-0.45686,-1.0082,0.025862,0,subject1
4,2.4173,-9.3889,0.71098,0.085343,-0.83865,-0.68369,-8.7008,-4.1459,0.40729,-0.45686,-1.0082,0.025862,0,subject1


In [None]:
# Show the dtype of every column of `data`.
data.dtypes

alx         float64
aly         float64
alz         float64
glx         float64
gly         float64
glz         float64
arx         float64
ary         float64
arz         float64
grx         float64
gry         float64
grz         float64
Activity      int64
subject      object
dtype: object

Split into Training and Testing


---



In [4]:
from sklearn.model_selection import train_test_split

# Build one DataFrame per subject, so the split below assigns whole subjects
# (never individual rows) to either the training or the test side.
grouped = data.groupby('subject')
grouped_data = [subject_frame for _subject, subject_frame in grouped]

# Hold out 20% of the per-subject frames for testing (fixed seed for repeatability).
train_groups, test_groups = train_test_split(
    grouped_data,
    test_size=0.2,
    random_state=42,
)

# Stitch the per-subject frames back into one frame per split.
train_df, test_df = pd.concat(train_groups), pd.concat(test_groups)

# Report the size of each split.
train_df_shape, test_df_shape = train_df.shape, test_df.shape

train_df_shape, test_df_shape

((753408, 14), (228865, 14))

Normalization

---



In [5]:
from sklearn.preprocessing import StandardScaler

# Columns to standardise: every float64 column of the training frame.
numerical_columns = list(train_df.select_dtypes(include=['float64']).columns)

# Learn the scaling statistics from the training rows only ...
scaler = StandardScaler().fit(train_df[numerical_columns])

# ... then apply that same fitted scaler to both splits.
train_df[numerical_columns] = scaler.transform(train_df[numerical_columns])
test_df[numerical_columns] = scaler.transform(test_df[numerical_columns])

# Sanity check: the shape is unchanged and the values are now standardised.
print(train_df.shape)
train_df.head()

(753408, 14)


Unnamed: 0,alx,aly,alz,glx,gly,glz,arx,ary,arz,grx,gry,grz,Activity,subject
0,0.214821,-0.022355,0.309939,0.244241,-0.668159,-1.036175,-1.098917,0.23333,-0.594683,-0.335191,-1.01634,-0.471094,0,subject1
1,0.267516,0.023418,0.319902,0.207517,-0.662733,-1.028975,-1.094306,0.278958,-0.636633,-0.335191,-1.01634,-0.471094,0,subject1
2,0.272975,0.00901,0.319384,0.207517,-0.662733,-1.028975,-1.069191,0.286484,-0.572206,-0.335191,-1.01634,-0.471094,0,subject1
3,0.213911,0.042315,0.294847,0.207517,-0.662733,-1.028975,-1.094388,0.279577,-0.548748,-0.349689,-1.012564,-0.486675,0,subject1
4,0.275237,0.052309,0.324983,0.207517,-0.662733,-1.028975,-1.109395,0.309677,-0.538585,-0.349689,-1.012564,-0.486675,0,subject1


Sequence Creation

---



In [6]:
sensor_columns = ['alx','aly','alz','glx','gly','glz','arx','ary','arz','grx','gry','grz']

def create_sequences(data, sequence_length=100, feature_columns=None,
                     label_column='Activity', boundary_columns=None):
    """Cut a frame into non-overlapping, fixed-length windows.

    Windows are taken by row position (the frame's index labels are ignored),
    start every ``sequence_length`` rows, and a trailing remainder shorter than
    ``sequence_length`` is dropped. Each window is labelled with the
    ``label_column`` value of its FIRST row.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows in recording order; must contain the feature and label columns.
    sequence_length : int, default 100
        Number of consecutive rows per window. Must be >= 1.
    feature_columns : list of str, optional
        Columns copied into each window. Defaults to the module-level
        ``sensor_columns``.
    label_column : str, default 'Activity'
        Column the window label is read from.
    boundary_columns : str or list of str, optional
        When given, the frame is first cut into contiguous runs in which all of
        these columns stay constant, and windows are taken inside each run, so
        no window spans a change in any of them (e.g. ``['subject',
        'Activity']``). When ``None`` (default) the frame is windowed as one
        run, so a window may span such a change and is still labelled by its
        first row.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` of shape ``(n_windows, sequence_length, n_features)`` and ``y`` of
        shape ``(n_windows,)``. With zero windows the arrays are empty but keep
        that rank, i.e. ``X.shape == (0, sequence_length, n_features)``.

    Raises
    ------
    ValueError
        If ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError(
            f"sequence_length must be a positive integer, got {sequence_length!r}"
        )

    if feature_columns is None:
        feature_columns = sensor_columns
    feature_columns = list(feature_columns)

    # Convert once up front; slicing numpy arrays is positional by definition
    # and avoids re-selecting the columns for every window.
    features = data[feature_columns].to_numpy()
    labels = data[label_column].to_numpy()
    n_rows = len(data)

    # changed[k] is True when row k opens a new run. Without boundary columns
    # the whole frame is a single run that starts at row 0.
    changed = np.zeros(n_rows, dtype=bool)
    changed[:1] = True
    if boundary_columns is not None:
        if isinstance(boundary_columns, str):
            boundary_columns = [boundary_columns]
        keys = data[list(boundary_columns)].to_numpy()
        changed[1:] = (keys[1:] != keys[:-1]).any(axis=1)

    run_starts = np.flatnonzero(changed)
    run_ends = np.append(run_starts[1:], n_rows)

    sequences = []
    output = []
    for run_start, run_end in zip(run_starts.tolist(), run_ends.tolist()):
        for start in range(run_start, run_end - sequence_length + 1, sequence_length):
            sequences.append(features[start:start + sequence_length])
            output.append(labels[start])

    if not sequences:
        # np.array([]) would be rank 1; keep the documented rank so callers can
        # rely on X.shape[1:] even when no full window fits.
        empty_X = np.empty((0, sequence_length, len(feature_columns)), dtype=features.dtype)
        empty_y = np.empty((0,), dtype=labels.dtype)
        return empty_X, empty_y

    return np.array(sequences), np.array(output)

# Window both splits with the same 100-row sequence length.
# NOTE(review): each split is a concatenation of per-subject frames and is
# windowed as a whole, so a window can span a change of subject or activity
# while being labelled from its first row only -- confirm this is acceptable.
(X_train, y_train), (X_test, y_test) = (
    create_sequences(split_frame, sequence_length=100)
    for split_frame in (train_df, test_df)
)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((7534, 100, 12), (7534,), (2288, 100, 12), (2288,))

In [16]:
# Tally how many training sequences carry each label.
# np.unique returns (sorted unique labels, matching counts).
train_label_counts = np.unique(y_train, return_counts=True)

# Report one line per label.
print("Training Label Counts:")
for label, count in zip(*train_label_counts):
    print(f'Label {label}: {count} sequences')

Training Label Counts:
Label 0: 5458 sequences
Label 1: 184 sequences
Label 2: 185 sequences
Label 3: 182 sequences
Label 4: 185 sequences
Label 5: 183 sequences
Label 6: 177 sequences
Label 7: 185 sequences
Label 8: 179 sequences
Label 9: 186 sequences
Label 10: 185 sequences
Label 11: 184 sequences
Label 12: 61 sequences


Addressing Class Imbalance

---



In [9]:
# Create separate list for each activity level
from sklearn.utils import resample

# Grouping the original training data by 'subject'
grouped_by_subject = train_df.groupby('subject')

# Bucket every subject's frame under the 'Activity' value found in that
# subject's LAST row. One pass over the subjects replaces thirteen copy-pasted
# comprehensions that each fetched every group twice.
# NOTE(review): the bucket key is the last row of each subject, not the
# per-row activity, so a subject whose recording contains several activities
# lands entirely in one bucket. Confirm this is the intended notion of "class".
_groups_by_class = {class_label: [] for class_label in range(13)}
for _subject in grouped_by_subject.groups:
    _subject_frame = grouped_by_subject.get_group(_subject)
    _last_activity = _subject_frame['Activity'].iloc[-1]
    # Labels outside 0..12 are ignored, as they matched none of the class lists before.
    if _last_activity in _groups_by_class:
        _groups_by_class[_last_activity].append(_subject_frame)

# Keep the individual list names that later cells refer to.
(
    groups_class_0, groups_class_1, groups_class_2, groups_class_3,
    groups_class_4, groups_class_5, groups_class_6, groups_class_7,
    groups_class_8, groups_class_9, groups_class_10, groups_class_11,
    groups_class_12,
) = (_groups_by_class[class_label] for class_label in range(13))

In [10]:
# Overwrite the 'subject' column of the first frame in groups_class_0 with the
# value -1, then display that frame.
# NOTE(review): this modifies the frame stored in groups_class_0 in place, so
# any later cell that uses groups_class_0 sees subject == -1 for this frame.
# It looks like scratch exploration -- confirm whether it should stay.
groups_class_0[0]['subject'] = -1
groups_class_0[0]

Unnamed: 0,alx,aly,alz,glx,gly,glz,arx,ary,arz,grx,gry,grz,Activity,subject
0,0.214821,-0.022355,0.309939,0.244241,-0.668159,-1.036175,-1.098917,0.233330,-0.594683,-0.335191,-1.016340,-0.471094,0,-1
1,0.267516,0.023418,0.319902,0.207517,-0.662733,-1.028975,-1.094306,0.278958,-0.636633,-0.335191,-1.016340,-0.471094,0,-1
2,0.272975,0.009010,0.319384,0.207517,-0.662733,-1.028975,-1.069191,0.286484,-0.572206,-0.335191,-1.016340,-0.471094,0,-1
3,0.213911,0.042315,0.294847,0.207517,-0.662733,-1.028975,-1.094388,0.279577,-0.548748,-0.349689,-1.012564,-0.486675,0,-1
4,0.275237,0.052309,0.324983,0.207517,-0.662733,-1.028975,-1.109395,0.309677,-0.538585,-0.349689,-1.012564,-0.486675,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161275,0.204786,0.077730,0.620472,-1.720030,-0.998486,1.843020,-0.154401,1.856727,1.194358,-0.715894,1.779081,0.689663,0,-1
161276,0.186849,0.007966,0.611788,-1.749418,-1.041811,1.810614,-0.241316,1.938427,1.003294,-0.715894,1.779081,0.689663,0,-1
161277,0.101684,0.070793,0.640954,-1.749418,-1.041811,1.810614,0.030072,1.216091,1.281113,-0.715894,1.779081,0.689663,0,-1
161278,0.138417,0.065577,0.607812,-1.749418,-1.041811,1.810614,0.468682,0.890345,0.922162,-0.715894,1.779081,0.689663,0,-1


In [11]:
# For every subject, record (Activity of the subject's last row, number of rows).
subject_summaries = [
    (subject_frame['Activity'].iloc[-1], len(subject_frame))
    for subject_frame in (
        grouped_by_subject.get_group(subject)
        for subject in grouped_by_subject.groups
    )
]

# Rows per class (classes assumed to be 0..12): add up the row counts of the
# subjects whose last-row Activity equals that class.
class_counts = {
    f'Class_{class_label}': sum(
        n_rows for last_label, n_rows in subject_summaries if last_label == class_label
    )
    for class_label in range(13)
}

# Report the tally, one line per class.
for class_label, count in class_counts.items():
    print(f'{class_label}: {count} observations')

Class_0: 753408 observations
Class_1: 0 observations
Class_2: 0 observations
Class_3: 0 observations
Class_4: 0 observations
Class_5: 0 observations
Class_6: 0 observations
Class_7: 0 observations
Class_8: 0 observations
Class_9: 0 observations
Class_10: 0 observations
Class_11: 0 observations
Class_12: 0 observations


In [None]:
# Importing tqdm for progress bar visualization in loops
from tqdm import tqdm

# Oversampling the minority groups
#
# Every groups_class_* list holds DataFrames, so the oversampling target below
# is the NUMBER OF FRAMES in groups_class_0, not a number of rows.
n_majority_groups = len(groups_class_0)

# Sampling with replacement needs at least one frame to draw from. Fail early
# with a message that names the empty list instead of an opaque error raised
# from inside resample().
for _list_name, _minority_groups in (
    ('groups_class_1', groups_class_1),
    ('groups_class_2', groups_class_2),
):
    if not _minority_groups:
        raise ValueError(
            f"{_list_name} is empty, so there is nothing to oversample; "
            "check how the per-class group lists were built."
        )

# Draw frames (with replacement) from each minority list until it has as many
# frames as the majority list.
oversampled_groups_class_1 = resample(groups_class_1, replace=True, n_samples=n_majority_groups, random_state=42)
oversampled_groups_class_2 = resample(groups_class_2, replace=True, n_samples=n_majority_groups, random_state=42)

# Combining the oversampled minority class groups with the majority class group,
# giving the same number of frames for classes 0, 1 and 2.
balanced_groups = oversampled_groups_class_1 + oversampled_groups_class_2 + groups_class_0

# Give every frame in the balanced list its own identifier.
# - enumerate(tqdm(...)) rather than tqdm(enumerate(...)) lets the bar know the
#   total and show real progress.
# - assign() returns a new frame, so frames that resampling repeated (the same
#   object appears several times in the list) are never modified in place.
# NOTE(review): the identifier is written to a new 'patient_nbr' column, while
# the training data is grouped by 'subject' -- confirm which column later cells
# expect.
balanced_groups_copy = []
for i, group in enumerate(tqdm(balanced_groups)):
    balanced_groups_copy.append(group.assign(patient_nbr=i))