# Classification Model

In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import os

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score

### Load The Data
Load the cleaned boxing data CSV files (`cleaned_boxing_data_*.csv`) from `../data/cleaned`.

In [None]:
def load_cleaned_data(data_dir):
    """Read every cleaned boxing CSV found in ``data_dir``.

    Files are matched with the glob pattern ``cleaned_boxing_data_*.csv``.
    Each loaded frame gets a ``movement_type`` column filled with the
    third-from-last ``_``-separated token of the matched path.

    Args:
        data_dir: Directory to search for cleaned CSV files.

    Returns:
        A dict mapping each file's base name to its DataFrame. The dict is
        empty when nothing matches the pattern.
    """
    pattern = os.path.join(data_dir, "cleaned_boxing_data_*.csv")
    loaded = {}
    for csv_path in glob(pattern):
        frame = pd.read_csv(csv_path)
        # The label is taken positionally: three tokens from the end of the path.
        frame['movement_type'] = csv_path.split("_")[-3]
        loaded[os.path.basename(csv_path)] = frame
    return loaded

In [None]:
# Relative path of the directory holding the cleaned CSV files.
data_dir = '../data/cleaned'
dfs = load_cleaned_data(data_dir)

# Quick look at every loaded file: schema, first rows, summary statistics.
for file_name, df in dfs.items():
    print(f"File: {file_name}")
    # DataFrame.info() writes its report itself and returns None, so it is
    # called directly; wrapping it in print() would add a stray "None" line.
    df.info()
    print(df.head())
    print(df.describe())

## Distribution of Movement Types

In [None]:
# Accumulate, across all loaded files, how many rows carry each movement label.
movement_counts = pd.Series(dtype=int)
for file_name, df in dfs.items():
    # fill_value=0 lets a label that is new to the running total start from zero.
    movement_counts = movement_counts.add(df['movement_type'].value_counts(), fill_value=0)

# Bar chart of the combined label counts.
fig, ax = plt.subplots(figsize=(10, 6))
movement_counts.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Movement Types')
ax.set_xlabel('Movement Type')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
plt.show()

# Prepare Features and Target

In [2]:
def prepare_data(dataframes):
    """Build scaled train/test splits from the loaded DataFrames.

    From every frame, the columns whose names end in ``_x``, ``_y``, ``_z``
    or ``_angle`` are taken as features and ``movement_type`` as the target.
    The per-file pieces are stacked row-wise, split 80/20 with a fixed seed,
    and the features are standardized.

    Args:
        dataframes: Mapping whose values are DataFrames containing the
            feature columns and a ``movement_type`` column.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled, y_train, y_test, scaler)``
        where the scaled feature sets are the outputs of the fitted
        ``StandardScaler`` and ``scaler`` is that fitted instance.

    Raises:
        ValueError: From ``pd.concat`` when ``dataframes`` is empty.
        KeyError: When a frame has no ``movement_type`` column.
    """
    feature_suffixes = ('_x', '_y', '_z', '_angle')
    X_all = []
    y_all = []
    for df in dataframes.values():
        # str.endswith accepts a tuple, so one call covers every suffix.
        feature_columns = [col for col in df.columns if col.endswith(feature_suffixes)]
        X_all.append(df[feature_columns])
        y_all.append(df['movement_type'])

    X_combined = pd.concat(X_all, axis=0)
    y_combined = pd.concat(y_all, axis=0)

    # split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_combined, y_combined, test_size=0.2, random_state=42
    )

    # Fit the scaler on the training split only, then apply the same
    # transform to the test split.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test, scaler

## Model Training And Evaluation

In [None]:
def train_and_evaluate_models(X_train, X_test, y_train, y_test):
    """Fit several classifiers, print their scores, and return the results.

    Trains a Random Forest, Gradient Boosting, RBF-kernel SVM and KNN
    classifier on the training data, scores each on the test data, and
    runs 5-fold cross-validation on the training data.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        Dict keyed by model name. Each value is a dict with the keys
        ``'model'`` (the estimator fitted on ``X_train``), ``'accuracy'``
        (test-set accuracy), ``'cv_scores'`` (array of cross-validation
        scores) and ``'classification_report'`` (text report).
    """
    models = {
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
        'SVM': SVC(kernel='rbf', random_state=42),
        'KNN': KNeighborsClassifier(n_neighbors=5),
    }

    results = {}

    for name, model in models.items():
        # train the model
        model.fit(X_train, y_train)

        # make predictions
        y_pred = model.predict(X_test)

        # calculate accuracy
        accuracy = accuracy_score(y_test, y_pred)

        # cross validate on the training data only
        cv_scores = cross_val_score(model, X_train, y_train, cv=5)

        report = classification_report(y_test, y_pred)

        results[name] = {
            'model': model,
            'accuracy': accuracy,
            'cv_scores': cv_scores,
            'classification_report': report,
        }

        print(f"\nResults for {name}:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Cross Validation scores: {cv_scores}")
        print(f"Average CV score: {cv_scores.mean():.4f}")
        print("Classification Report:")
        print(report)

    # Hand the fitted models and metrics back to the caller; previously the
    # dict was built and then discarded.
    return results

In [None]:
# Plot the 20 highest feature importances, one figure per loaded file.
# NOTE(review): `feature_columns` and `rf_classifier` are not defined in the
# visible cells — confirm where they come from before running this cell.
# NOTE(review): the ranking does not use `df`, so every figure shows the same
# bars and differs only in its title — confirm this is intended.
for file_name, df in dfs.items():
    importance_table = pd.DataFrame({
        'feature': feature_columns,
        'importance': rf_classifier.feature_importances_
    })
    feature_importance = importance_table.sort_values('importance', ascending=False)
    top_features = feature_importance.head(20)

    plt.figure(figsize=(12, 8))
    sns.barplot(data=top_features, x='importance', y='feature')
    plt.title(f'Top 20 Most Important Features - {file_name}')
    plt.show()

In [None]:
import os
import joblib

# Directory the serialized models are written to.
save_dir = "../models/"

# Create the directory when it is missing. exist_ok=True replaces the
# separate existence check, which could race with another process creating
# the directory between the check and the makedirs call.
os.makedirs(save_dir, exist_ok=True)

# NOTE(review): `rf_classifier` is not defined in the visible cells, and the
# same object is dumped under every file name — confirm this is intended.
for file_name, df in dfs.items():
    # Full output path: save directory plus a name derived from the data file.
    model_filename = os.path.join(save_dir, f'boxing_movement_classifier_{file_name}.joblib')

    # Save the model for each file
    joblib.dump(rf_classifier, model_filename)
    print(f"Model for {file_name} saved as '{model_filename}'\n")
