In [1]:
!pip install numpy pandas scikit-learn
!pip install tensorflow

Collecting scikit-learn
  Downloading scikit_learn-1.4.0-1-cp39-cp39-macosx_12_0_arm64.whl (10.7 MB)
[K     |████████████████████████████████| 10.7 MB 2.9 MB/s eta 0:00:01
Collecting joblib>=1.2.0
  Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
[K     |████████████████████████████████| 302 kB 34.7 MB/s eta 0:00:01
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.2.0-py3-none-any.whl (15 kB)
Collecting scipy>=1.6.0
  Downloading scipy-1.12.0-cp39-cp39-macosx_12_0_arm64.whl (31.4 MB)
[K     |████████████████████████████████| 31.4 MB 35.7 MB/s eta 0:00:01
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.3.2 scikit-learn-1.4.0 scipy-1.12.0 threadpoolctl-3.2.0
You should consider upgrading via the '/Users/max/devel/uni/og/Squatify/.venv/bin/python3 -m pip install --upgrade pip' command.[0m
Collecting tensorflow
  Downloading tensorflow-2.15.0-cp39-cp39-macosx_12_0_arm64.whl (2.1 kB)
Collecting tensorflow-ma

In [5]:
# Data Loading and Preprocessing
import pandas as pd
import glob

# Path to your folder
path = '../../data/training/labled_data'

# Find all CSV files in the folder
all_files = glob.glob(path + "/Tracking_video*_labled.csv")

# List to store the dataframes
li = []

# Read each CSV file and add it to the list
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

# Combine all dataframes in the list into a single dataframe
data = pd.concat(li, axis=0, ignore_index=True)

In [6]:
# Model Training
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer


data = data[['Left hip_y', 'Right hip_y', 'Label']]

# Features
X = data.drop('Label', axis=1)
# Labels
y = data['Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use mean imputation for missing values
imputer = SimpleImputer(strategy='mean')

# Fit on the training data and transform both training and test data
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)
# Initialize the model
model = RandomForestClassifier()

# Train the model
model.fit(X_train, y_train)

In [9]:
# Train multiple models and compare the accuracies

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

data = data[['Left hip_y', 'Right hip_y', 'Label']]

# Features
X = data.drop('Label', axis=1)
# Labels
y = data['Label']

# Calculate the difference between the current and previous row for 'Left hip_y' and 'Right hip_y'
X['Left hip_y_diff'] = X['Left hip_y'].diff()
X['Right hip_y_diff'] = X['Right hip_y'].diff()

# Drop rows with missing values in X and the corresponding rows in y
na_indices = X.dropna().index
X = X.loc[na_indices]
y = y.loc[na_indices]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize the scaler
scaler = MinMaxScaler()

# Fit on the training data and transform both training and test data
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Use mean imputation for missing values
imputer = SimpleImputer(strategy='mean')

# Fit on the training data and transform both training and test data
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Initialize the models
models = [
    RandomForestClassifier(),
    LogisticRegression(),
    SVC(),
    DecisionTreeClassifier(),
    GradientBoostingClassifier(),
    KNeighborsClassifier(),
    MLPClassifier(max_iter=1000)
]

# Train and evaluate each model
for model in models:
    model_name = model.__class__.__name__
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")

RandomForestClassifier Accuracy: 0.8627849243962988
LogisticRegression Accuracy: 0.7466711803204694
SVC Accuracy: 0.8099751749040849
DecisionTreeClassifier Accuracy: 0.811216429699842
GradientBoostingClassifier Accuracy: 0.8469871360866622
KNeighborsClassifier Accuracy: 0.8737305348679756
MLPClassifier Accuracy: 0.8245317084179643


In [10]:
import joblib
import os

# Directory where you want to save the models
save_dir = "../../data/models"

# Create the directory if it does not exist
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# Train and evaluate each model
for model in models:
    model_name = model.__class__.__name__

    # Save the model
    joblib.dump(model, os.path.join(save_dir, f"{model_name}.pkl"))

In [11]:
# Model Evaluation
from sklearn.metrics import accuracy_score, classification_report
import joblib
import os

# Directory where the models are saved
save_dir = "../../data/models"

# Load the RandomForestClassifier model from the save_dir
model_path = os.path.join(save_dir, "RandomForestClassifier.pkl")
model = joblib.load(model_path)

# Predict the labels for the test set
y_pred = model.predict(X_test)

# Print the accuracy and the classification report
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8627849243962988
              precision    recall  f1-score   support

   Ascending       0.87      0.83      0.85      2382
  Descending       0.86      0.88      0.87      2999
       Pause       0.86      0.87      0.87      3481

    accuracy                           0.86      8862
   macro avg       0.86      0.86      0.86      8862
weighted avg       0.86      0.86      0.86      8862



In [53]:
import joblib
import os

# Directory where you want to save the model
save_dir = "../../data/models"

# Create the directory if it does not exist
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# Save the model to the specified directory
joblib.dump(model, os.path.join(save_dir, "RandomForest3onlyhips.pkl"))

['RandomForest3onlyhips.pkl']

In [10]:
# Model Loading and Usage
import joblib
import os

# Directory where the models are saved
save_dir = "../../data/models"

# Load the model
model_path = os.path.join(save_dir, "DecisionTreeClassifier.pkl")
loaded_model = joblib.load(model_path)

# Use the loaded model to make predictions
# For example, predict the label for the first instance in the test set
single_prediction = loaded_model.predict([X_test.iloc[0]])
print("Prediction for the first instance in the test set:", single_prediction)

AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

In [57]:
import pandas as pd
import os
import joblib
from sklearn.impute import SimpleImputer

# Load the model
loaded_model = joblib.load('../../data/models/RandomForestClassifier.pkl')

# Load the fitted imputer
imputer = joblib.load('../../data/models/imputer.pkl')

# Path to the tracking data
tracking_data_path = '../../data/test/tracking_data'

# Path to save the new data with predictions
new_data_path = '../../data/test/predictions'

# Create the new data directory if it doesn't exist
if not os.path.exists(new_data_path):
    os.makedirs(new_data_path)

# Iterate over all the tracking data files in the path
for filename in os.listdir(tracking_data_path):
    if filename.endswith('.csv'):
        # Load new data
        new_data = pd.read_csv(os.path.join(tracking_data_path, filename))

        # If the new data includes the 'Label' column, drop it
        if 'Label' in new_data.columns:
            new_data = new_data.drop('Label', axis=1)

        # Select only the 'Left hip_y' and 'Right hip_y' columns
        new_data = new_data[['Left hip_y', 'Right hip_y']]

        # Transform the new data using the fitted imputer
        new_data = imputer.transform(new_data)

        # Use the loaded model to make predictions
        predictions = loaded_model.predict(new_data)

        # Add the predictions back into the 'Label' column
        new_data['Label'] = predictions

        # Save the DataFrame with the new labels back to a CSV file in the new data path
        new_data.to_csv(os.path.join(new_data_path, filename), index=False)



In [68]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Assuming 'data' is already loaded as in your example

# Encoding categorical labels to integers
label_encoder = LabelEncoder()
data['Label'] = label_encoder.fit_transform(data['Label'])

# Features and labels
X = data.drop('Label', axis=1).values
y = data['Label'].values

# Imputation for missing values
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Reshape X to be 3D [samples, time steps, features]
X = X.reshape((X.shape[0], 1, X.shape[1]))

# One-hot encode the labels
y = to_categorical(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(X.shape[1], X.shape[2])))
model.add(Dense(y.shape[1], activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=1000, batch_size=1, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy}')


Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
 5426/35549 [===>..........................] - ETA: 15s - loss: 0.9563 - accuracy: 0.5433

KeyboardInterrupt: 