In [None]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, precision_score, ConfusionMatrixDisplay, make_scorer

In [None]:
# Attaching project directory
# Puts the parent of the current working directory on the import path so the
# `src` package imported below can be found.
# NOTE(review): presumably the notebook is launched from a sub-folder of the
# project root — confirm, since this depends on the working directory.
sys.path.append(os.path.dirname(os.getcwd()))

# Pathing imports
from src import GetPath

In [None]:
# Root of the shared data folder, as resolved by the project's GetPath helper
DATA_FOLDER = GetPath().shared_data()
# Raw CSV of each of the three experiment batches
ABN_B1 = os.path.join(DATA_FOLDER, 'raw', 'abnormal_b1.csv')
ABN_B2 = os.path.join(DATA_FOLDER, 'raw', 'abnormal_b2.csv')
ABN_B3 = os.path.join(DATA_FOLDER, 'raw', 'abnormal_b3.csv')

# Seed passed to every split, sampler and estimator in this notebook
RANDOM_STATE = 42
# Rows per ABN level for resampling (the resampling cells draw 50 per level)
SAMPLE_SIZE = 50

# Seed NumPy's global RNG; later np.random.* calls draw from it
np.random.seed(seed=RANDOM_STATE)

In [None]:
# Load batch 1 and preview its first rows to check the column layout
df = pd.read_csv(ABN_B1)
df.head()

In [None]:
# Checking abnormalities throughout experiments
# Explicit label -> path mapping; avoids resolving variable names via eval()
batch_paths = {"ABN_B1": ABN_B1, "ABN_B2": ABN_B2, "ABN_B3": ABN_B3}

for experiment, csv_path in batch_paths.items():
    # Load csv data
    df = pd.read_csv(csv_path)

    # Plotting countplot of the abnormal level for this batch
    sns.countplot(df, x='ABN')
    plt.title(f"Experiment {experiment}")
    plt.show()

In [None]:
# Stack the three experiment batches into one frame with a fresh 0..n-1 index.
# The batches are read into a list and concatenated once: this avoids
# re-copying a growing frame on every iteration and needs no eval() lookup.
batch_frames = [pd.read_csv(csv_path) for csv_path in (ABN_B1, ABN_B2, ABN_B3)]
df_all = pd.concat(batch_frames, ignore_index=True)

# Plotting countplot
sns.countplot(df_all, x='ABN')
plt.title("Abnormal Level Count")
plt.show()

## Relationship between Hovering and Abnormal Level

- Hovering BTM and Abnormal Level (down trend)
- Hovering SUR and Abnormal Level (up trend)

In [None]:
# By batch of experiment
# Position state
positions = ['BTM', 'MID', 'SUR']
# Explicit label -> path mapping; avoids resolving variable names via eval()
batch_paths = {"ABN_B1": ABN_B1, "ABN_B2": ABN_B2, "ABN_B3": ABN_B3}

for position in positions:
    # One figure per hovering position, one panel per experiment batch
    fig, axes = plt.subplots(figsize=(14,6), nrows=1, ncols=len(batch_paths))
    for ax, (experiment, csv_path) in zip(axes.flatten(), batch_paths.items()):
        # Load csv data
        df = pd.read_csv(csv_path)

        sns.barplot(df, x='ABN', y=f"Hovering {position}", ax=ax)
        ax.set_title(f"Experiment {experiment}")

    fig.suptitle(f"Relationship Between Hovering {position} and Abnormal Level")
    plt.tight_layout()
    plt.show()

In [None]:
# Pooled view: one panel per hovering position, drawn from df_all
positions = ['BTM', 'MID', 'SUR']

fig, axes = plt.subplots(nrows=1, ncols=len(positions), figsize=(14,6))
for panel, pos_label in zip(axes.flatten(), positions):
    sns.barplot(data=df_all, x='ABN', y=f"Hovering {pos_label}", ax=panel)

fig.suptitle("Relationship Between Hovering Position and Abnormal Level")
plt.tight_layout()
plt.show()

## Relationship between Fast Swim and Abnormal Level
- Fast Swim MID and Abnormal Level (up trend)
- Fast Swim SUR and Abnormal Level (up trend)

In [None]:
# Position state (the Fast Swim columns use 'Btm'/'Mid' casing)
positions = ['Btm', 'Mid', 'SUR']
# Explicit label -> path mapping; avoids resolving variable names via eval()
batch_paths = {"ABN_B1": ABN_B1, "ABN_B2": ABN_B2, "ABN_B3": ABN_B3}

for position in positions:
    # One figure per fast-swim position, one panel per experiment batch
    fig, axes = plt.subplots(figsize=(14,6), nrows=1, ncols=len(batch_paths))
    for ax, (experiment, csv_path) in zip(axes.flatten(), batch_paths.items()):
        # Load csv data
        df = pd.read_csv(csv_path)

        sns.barplot(df, x='ABN', y=f"Fast Swim {position}", ax=ax)
        ax.set_title(f"Experiment {experiment}")

    fig.suptitle(f"Relationship Between Fast Swim {position} and Abnormal Level")
    plt.tight_layout()
    plt.show()

In [None]:
# Pooled view: one panel per fast-swim position, drawn from df_all
positions = ['Btm', 'Mid', 'SUR']

fig, axes = plt.subplots(nrows=1, ncols=len(positions), figsize=(14,6))
for panel, pos_label in zip(axes.flatten(), positions):
    sns.barplot(data=df_all, x='ABN', y=f"Fast Swim {pos_label}", ax=panel)

fig.suptitle("Relationship Between Fast Swim Position and Abnormal Level")
plt.tight_layout()
plt.show()

## Relationship between Burst Swimming and Abnormal Level

- Burst Swimming and Abnormal Level (up trend)

In [None]:
# Plotting Burst Swimming, one panel per experiment batch
# Explicit label -> path mapping; avoids resolving variable names via eval()
batch_paths = {"ABN_B1": ABN_B1, "ABN_B2": ABN_B2, "ABN_B3": ABN_B3}

fig, axes = plt.subplots(figsize=(14,6), nrows=1, ncols=len(batch_paths))
for ax, (experiment, csv_path) in zip(axes.flatten(), batch_paths.items()):
    # Load csv data
    df = pd.read_csv(csv_path)

    sns.barplot(df, x='ABN', y='Burst Swimming', ax=ax)
    ax.set_title(f"Experiment {experiment}")

fig.suptitle("Relationship Between Burst Swimming and Abnormal Level")
plt.tight_layout()
plt.show()

In [None]:
# Pooled view across all batches (df_all)
fig, ax = plt.subplots()
sns.barplot(data=df_all, x='ABN', y='Burst Swimming', ax=ax)
ax.set_title("Relationship Between Burst Swimming and Abnormal Level")
plt.show()

## Relationship between Acc. Ver. Position and Abnormal Level

- Acc. Ver. MID and Abnormal Level (up trend)
- Acc. Ver. SUR and Abnormal Level (up trend)

In [None]:
# Position state
positions = ['MID', 'SUR']
# Explicit label -> path mapping; avoids resolving variable names via eval()
batch_paths = {"ABN_B1": ABN_B1, "ABN_B2": ABN_B2, "ABN_B3": ABN_B3}

for position in positions:
    # One figure per Acc. Ver. position, one panel per experiment batch
    fig, axes = plt.subplots(figsize=(14,6), nrows=1, ncols=len(batch_paths))
    for ax, (experiment, csv_path) in zip(axes.flatten(), batch_paths.items()):
        # Load csv data
        df = pd.read_csv(csv_path)

        sns.barplot(df, x="ABN", y=f"Acc. Ver. {position}", ax=ax)
        ax.set_title(f"Experiment {experiment}")

    fig.suptitle(f"Relationship between Acc. Ver. {position} with Abnormal Level")
    plt.tight_layout()
    plt.show()

In [None]:
# All experiment
# Position state
positions = ['MID', 'SUR']

fig, axes = plt.subplots(figsize=(14,6), nrows=1, ncols=len(positions))
# Pair every panel with its own position. The previous version read the
# leftover `position` variable from an earlier cell, so both panels showed
# the same column.
for ax, position in zip(axes.flatten(), positions):
    sns.barplot(df_all, x="ABN", y=f"Acc. Ver. {position}", ax=ax)

fig.suptitle("Relationship Between Acc. Ver. Position and Abnormal Level")
plt.tight_layout()
plt.show()

## Relationship between Turning and Abnormal Level
- Turning and Abnormal Level (down trend)

In [None]:
# Plotting Turning, one panel per experiment batch
# Explicit label -> path mapping; avoids resolving variable names via eval()
batch_paths = {"ABN_B1": ABN_B1, "ABN_B2": ABN_B2, "ABN_B3": ABN_B3}

fig, axes = plt.subplots(figsize=(14,6), nrows=1, ncols=len(batch_paths))
for ax, (experiment, csv_path) in zip(axes.flatten(), batch_paths.items()):
    # Load csv data
    df = pd.read_csv(csv_path)

    sns.barplot(df, x='ABN', y='Turning', ax=ax)
    ax.set_title(f"Experiment {experiment}")

fig.suptitle("Relationship Between Turning and Abnormal Level")
plt.tight_layout()
plt.show()

In [None]:
# Pooled view across all batches (df_all)
fig, ax = plt.subplots()
sns.barplot(data=df_all, x='ABN', y='Turning', ax=ax)
ax.set_title("Relationship between Turning and Abnormal Level")
plt.show()

## Relationship between Agg. Behaviour and Abnormal Level

In [None]:
# Plotting Agg. Behaviour, one panel per experiment batch
# Explicit label -> path mapping; avoids resolving variable names via eval()
batch_paths = {"ABN_B1": ABN_B1, "ABN_B2": ABN_B2, "ABN_B3": ABN_B3}

fig, axes = plt.subplots(figsize=(14,6), nrows=1, ncols=len(batch_paths))
for ax, (experiment, csv_path) in zip(axes.flatten(), batch_paths.items()):
    # Load csv data
    df = pd.read_csv(csv_path)

    sns.barplot(df, x='ABN', y='Agg. Behaviour', ax=ax)
    ax.set_title(f"Experiment {experiment}")

fig.suptitle("Relationship Between Agg. Behaviour and Abnormal Level")
plt.tight_layout()
plt.show()

In [None]:
# Pooled view across all batches (df_all)
fig, ax = plt.subplots()
sns.barplot(data=df_all, x='ABN', y='Agg. Behaviour', ax=ax)
ax.set_title("Relationship Between Agg. Behaviour and Abnormal Level")
plt.show()

## Relationship between Resting and Abnormal Level

In [None]:
# Plotting Resting, one panel per experiment batch
# Explicit label -> path mapping; avoids resolving variable names via eval()
batch_paths = {"ABN_B1": ABN_B1, "ABN_B2": ABN_B2, "ABN_B3": ABN_B3}

fig, axes = plt.subplots(figsize=(14,6), nrows=1, ncols=len(batch_paths))
for ax, (experiment, csv_path) in zip(axes.flatten(), batch_paths.items()):
    # Load csv data
    df = pd.read_csv(csv_path)

    sns.barplot(df, x='ABN', y='Resting', ax=ax)
    ax.set_title(f"Experiment {experiment}")

fig.suptitle("Relationship Between Resting and Abnormal Level")
plt.tight_layout()
plt.show()

In [None]:
# Pooled view across all batches (df_all)
fig, ax = plt.subplots()
sns.barplot(data=df_all, x='ABN', y='Resting', ax=ax)
ax.set_title("Relationship between Resting and Abnormal Level")
plt.show()

## Relationship between Active and Abnormal Level

In [None]:
# Plotting Active, one panel per experiment batch
# Explicit label -> path mapping; avoids resolving variable names via eval()
batch_paths = {"ABN_B1": ABN_B1, "ABN_B2": ABN_B2, "ABN_B3": ABN_B3}

fig, axes = plt.subplots(figsize=(14,6), nrows=1, ncols=len(batch_paths))
for ax, (experiment, csv_path) in zip(axes.flatten(), batch_paths.items()):
    # Load csv data
    df = pd.read_csv(csv_path)

    sns.barplot(df, x='ABN', y='Active', ax=ax)
    ax.set_title(f"Experiment {experiment}")

fig.suptitle("Relationship Between Active and Abnormal Level")
plt.tight_layout()
plt.show()

In [None]:
# Pooled view across all batches (df_all)
fig, ax = plt.subplots()
sns.barplot(data=df_all, x='ABN', y='Active', ax=ax)
ax.set_title("Relationship between Active and Abnormal Level")
plt.show()

# Data Cleaning

In [None]:
# Drop rows containing any NaN value
# NOTE(review): `df` is whatever frame the last per-batch plotting cell left
# behind (the batch-3 CSV when cells run top to bottom), not df_all — confirm
# this is the frame that was meant to be cleaned.
df = df.dropna()
df.head()

In [None]:
# Only use relevant features: keep the columns from position 3 onward.
# The cell reassigns `df`, so running it again drops three more columns.
df = df.iloc[:, 3:]
df.head()

In [None]:
# Describe the data: summary statistics, transposed so each column is a row
df.describe().T

In [None]:
# Based on my understanding, all data are continuous
# Features are every remaining column except 'N', 'ABN' and 'ABN%';
# the label is the 'ABN' column.
X = df.drop(columns=['N', 'ABN', 'ABN%'])
y = df['ABN']

### Feature Selection On Specific Batch

In [None]:
# Sweep the number of SelectKBest features (1..all) separately on each
# batch and record the test metrics of a logistic-regression pipeline.
results = []

# Explicit label -> path mapping; avoids resolving variable names via eval()
batch_paths = {"ABN_B1": ABN_B1, "ABN_B2": ABN_B2, "ABN_B3": ABN_B3}

for experiment, csv_path in batch_paths.items():
    # Load csv data
    df = pd.read_csv(csv_path)

    # Feature and label
    df = df.iloc[:, 3:].dropna()

    # Resample as some level only has 1
    # NOTE(review): sampling with replacement happens before the train/test
    # split, so copies of the same row can land in both sets and inflate the
    # scores below.
    df = df.groupby('ABN').apply(
        lambda x: x.sample(SAMPLE_SIZE, replace=True, random_state=RANDOM_STATE)
    ).reset_index(drop=True)

    # Split to feature and label
    X = df.drop(columns=['N', 'ABN', 'ABN%'])
    y = df['ABN']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

    # Iterate from one feature to all features. The counter has its own name
    # so it no longer shadows the batch loop variable.
    for k in range(1, len(X.columns) + 1):
        # Create the pipeline
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('feature_selection', SelectKBest(score_func=f_classif, k=k)),
            ('classifier', LogisticRegression(max_iter=1000, random_state=RANDOM_STATE))
        ])

        # Fit the pipeline
        pipeline.fit(X_train, y_train)

        # Make predictions
        y_pred = pipeline.predict(X_test)

        results.append({
            'data_batch': experiment,
            'number_feature': k,
            'accuracy': accuracy_score(y_test, y_pred),
            'precision_weighted': precision_score(y_test, y_pred, average='weighted', zero_division=True),
            'recall_weighted': recall_score(y_test, y_pred, average='weighted', zero_division=True),
            'f1_weighted': f1_score(y_test, y_pred, average='weighted', zero_division=True)
        })

# One row per (batch, number of features)
results = pd.DataFrame(results)

In [None]:
# Rank the sweep results by test accuracy, best first
results.sort_values('accuracy', ascending=False)

In [None]:
# Random-forest feature importance, computed separately for each batch.
# Explicit label -> path mapping; avoids resolving variable names via eval()
batch_paths = {"ABN_B1": ABN_B1, "ABN_B2": ABN_B2, "ABN_B3": ABN_B3}

for experiment, csv_path in batch_paths.items():
    # Load csv data
    df = pd.read_csv(csv_path)

    # Feature and label
    df = df.iloc[:, 3:].dropna()

    # Resample as some level only has 1
    df = df.groupby('ABN').apply(
        lambda x: x.sample(SAMPLE_SIZE, replace=True, random_state=RANDOM_STATE)
    ).reset_index(drop=True)

    # Split to feature and label
    X = df.drop(columns=['N', 'ABN', 'ABN%'])
    y = df['ABN']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

    # Initializing and training the Random Forest Classifier
    model = RandomForestClassifier(random_state=RANDOM_STATE)
    model.fit(X_train, y_train)

    # Getting feature importance, most important first
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]
    feature_names = np.array(X.columns)[indices]

    # Plotting the feature importances
    plt.figure(figsize=(12, 6))
    sns.barplot(x=importances[indices], y=feature_names, hue=feature_names, palette='viridis')
    plt.title(f'Feature Importance {experiment}')
    plt.xlabel('Importance Score')
    plt.ylabel('Features')
    plt.show()

In [None]:
# Refit the logistic-regression pipeline on the best (batch, k) found by the
# per-batch sweep and print its classification report.
batch_paths = {"ABN_B1": ABN_B1, "ABN_B2": ABN_B2, "ABN_B3": ABN_B3}

# Take the batch and the feature count from the SAME best-accuracy row.
# The previous `sort_values(...)['number_feature'][0]` was a label lookup and
# returned the row labelled 0 (the first sweep entry), not the best row.
best_run = results.loc[results['accuracy'].idxmax()]

# Experiment batch
experiment = best_run['data_batch']

# Number feature
n_feature = int(best_run['number_feature'])

# Load csv data
df = pd.read_csv(batch_paths[experiment])

# Message
print(f"\n--- Inspecting Experiment Batch {experiment} ---")

# Feature and label
df = df.iloc[:, 3:].dropna()

# Resample as some level only has 1
df = df.groupby('ABN').apply(
    lambda x: x.sample(SAMPLE_SIZE, replace=True, random_state=RANDOM_STATE)
).reset_index(drop=True)

# Split to feature and label
X = df.drop(columns=['N', 'ABN', 'ABN%'])
y = df['ABN']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

# Create the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', SelectKBest(score_func=f_classif, k=n_feature)),
    ('classifier', LogisticRegression(max_iter=1000, random_state=RANDOM_STATE))
])

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Sort the labels first and convert to text afterwards, so the names follow
# the label order used by classification_report (sorting the strings would
# put e.g. '10' before '5').
class_names = np.unique(y_train).astype(str)

print(f"--- Results ---")
print(classification_report(y_test, y_pred, target_names=class_names, zero_division=True))

In [None]:
# Since we are training with B3, lets test with B1 and B2
# NOTE(review): this assumes the `pipeline` fitted in the previous cell was
# trained on batch 3 — confirm `experiment` was "ABN_B3" there.

# Read and clean both hold-out batches, then concatenate once. The label is
# kept as a Series (the old loop grew an empty DataFrame, which turned y12
# into a one-column frame).
holdout_frames = [
    pd.read_csv(csv_path).iloc[:, 3:].dropna()
    for csv_path in (ABN_B1, ABN_B2)
]
holdout = pd.concat(holdout_frames, axis=0)

# Split to feature and label
X12 = holdout.drop(columns=['N', 'ABN', 'ABN%'])
y12 = holdout['ABN']

# Make predictions
y_pred = pipeline.predict(X12)

# Sort the labels first and convert to text afterwards, so the names follow
# the label order used by classification_report.
class_names = np.unique(y_train).astype(str)

print(f"--- Results ---")
print(classification_report(y12, y_pred, target_names=class_names, zero_division=True))

In [None]:
# Sweep the number of SelectKBest features on batch 1 combined with batch 2,
# then on batch 1 combined with batch 3.
results = []

# Explicit label -> path mapping; avoids resolving variable names via eval()
partner_paths = {"ABN_B2": ABN_B2, "ABN_B3": ABN_B3}

for experiment, csv_path in partner_paths.items():
    # Batch 1 plus the partner batch
    df = pd.concat([pd.read_csv(ABN_B1), pd.read_csv(csv_path)], axis=0)

    # Feature and label
    df = df.iloc[:, 3:].dropna()

    # Resample as some level only has 1
    # NOTE(review): sampling with replacement happens before the train/test
    # split, so copies of the same row can land in both sets.
    df = df.groupby('ABN').apply(
        lambda x: x.sample(SAMPLE_SIZE, replace=True, random_state=RANDOM_STATE)
    ).reset_index(drop=True)

    # Split to feature and label
    X = df.drop(columns=['N', 'ABN', 'ABN%'])
    y = df['ABN']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

    # Sort the labels first and convert to text afterwards, so the names
    # follow the label order used by classification_report.
    class_names = np.unique(y_train).astype(str)

    # Iterate from one feature to all features. The counter has its own name
    # so it no longer shadows the batch loop variable.
    for k in range(1, len(X.columns) + 1):
        # Create the pipeline
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('feature_selection', SelectKBest(score_func=f_classif, k=k)),
            ('classifier', LogisticRegression(max_iter=1000, random_state=RANDOM_STATE))
        ])

        # Fit the pipeline
        pipeline.fit(X_train, y_train)

        # Names of the k columns kept by SelectKBest
        sel_feat = pipeline.named_steps['feature_selection'].get_feature_names_out(
            input_features=X.columns
        )

        # Make predictions
        y_pred = pipeline.predict(X_test)

        results.append({
            'data_batch': experiment,
            'number_feature': k,
            'accuracy': accuracy_score(y_test, y_pred),
            'precision_weighted': precision_score(y_test, y_pred, average='weighted', zero_division=True),
            'recall_weighted': recall_score(y_test, y_pred, average='weighted', zero_division=True),
            'f1_weighted': f1_score(y_test, y_pred, average='weighted', zero_division=True)
        })

        # Message (the trailing "1" marks that batch 1 is part of the mix)
        print(f"\n--- Inspecting Experiment Batch {experiment}1 ---")
        print(f"Selected feature: {sel_feat}")
        print(f"--- Results --- \n--- Using {k} feature ---")
        print(classification_report(y_test, y_pred, target_names=class_names, zero_division=True))

# One row per (partner batch, number of features)
results = pd.DataFrame(results)

In [None]:
# Rank the combined-batch sweep by test accuracy, best first
results.sort_values('accuracy', ascending=False)

In [None]:
# Compare four classifiers on batches 1 + 3 using a hand-picked feature set.
# NOTE(review): dropna() runs on the full frame here, while other cells drop
# the first three columns before dropna() — confirm which is intended.
df_train = pd.concat([
    pd.read_csv(ABN_B1).dropna(),
    pd.read_csv(ABN_B3).dropna()
]).reset_index(drop=True)

feature_selected = ['Hovering BTM', 'Fast Swim SUR', 'Acc. Ver. SUR', 'Turning', 'Burst Swimming']

X = df_train[feature_selected]
y = df_train['ABN']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

# Sort the labels first and convert to text afterwards, so the names follow
# the label order used by classification_report (sorting the strings would
# put e.g. '10' before '5').
class_names = np.unique(y_train).astype(str)

models = [
    LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
    DecisionTreeClassifier(random_state=RANDOM_STATE),
    RandomForestClassifier(random_state=RANDOM_STATE),
    SVC(random_state=RANDOM_STATE)
]

for model in models:
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', model)
    ])

    # Fit the pipeline
    pipeline.fit(X_train, y_train)

    # Make predictions
    y_pred = pipeline.predict(X_test)

    print(f"\n--- Results {model} ---")
    print(classification_report(y_test, y_pred, target_names=class_names, zero_division=True))

In [None]:
# Scratch check: even integers from 2 up to (not including) 20
np.arange(2, 20, 2)

In [None]:
# Custom scoring function to prioritize recall for specific classes
def custom_recall_score(y_true, y_pred, class_index=2):
    """Return the recall of a single class.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels, as accepted by ``recall_score``.
    class_index : int, default 2
        Position of the class of interest in the per-class recall array
        returned by ``recall_score(..., average=None)``. The default keeps
        the original behaviour (third class; class 10 per the original
        comment — TODO confirm against the label set).

    Returns
    -------
    float
        Recall of the selected class.

    Raises
    ------
    IndexError
        If fewer than ``class_index + 1`` classes are reported.

    Notes
    -----
    The index is positional: if a label is absent from both ``y_true`` and
    ``y_pred`` (possible in a small CV fold) the positions shift.
    """
    recalls = recall_score(y_true, y_pred, average=None)
    # A single element is already a scalar; the former np.mean() was a no-op
    return recalls[class_index]

# Create a custom scorer
custom_scorer = make_scorer(custom_recall_score)

# Candidate values for the randomized search
param_dist = {
    'n_estimators': np.arange(100, 500, 100),
    'min_samples_split': np.arange(2, 20),
    'min_samples_leaf': np.arange(1, 20),
    # 20 candidate fractions drawn once from U(0.1, 0.9) via NumPy's global RNG
    'max_features': np.random.uniform(0.1, 0.9, 20),
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
}

# Create a random forest classifier
rf = RandomForestClassifier(random_state=RANDOM_STATE)

# Create the random search object.
# This used to be GridSearchCV, which exhaustively fits every one of the
# ~109k combinations above. RandomizedSearchCV samples n_iter of them, which
# is what the names and comments in this cell describe.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=50,  # number of sampled settings; raise for a wider search
    cv=2,
    verbose=2,
    n_jobs=-1,
    scoring=custom_scorer,
    random_state=RANDOM_STATE
)

# Fit the random search object to the data
random_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters:", random_search.best_params_)
print("Best cross-validation score:", random_search.best_score_)

# Get the best model (refit on the whole training set by the search)
best_rf = random_search.best_estimator_

# Evaluate on the test set (mean accuracy)
test_score = best_rf.score(X_test, y_test)
print("Test set score:", test_score)

recall_10_score = custom_recall_score(y_test, best_rf.predict(X_test))
print(f"Recall 10 score: {recall_10_score}")

# Feature importance, reported by column name, most important first
feature_importance = best_rf.feature_importances_
sorted_idx = np.argsort(feature_importance)
feature_names = np.asarray(X_train.columns)

print("\nTop 5 most important features:")
for i in sorted_idx[-5:][::-1]:
    print(f"{feature_names[i]}: {feature_importance[i]:.4f}")

# Calculate mean and standard deviation of tree depths in the forest
depths = np.array([tree.tree_.max_depth for tree in best_rf.estimators_], dtype=float)

print(f"\nMean tree depth: {np.mean(depths):.2f}")
print(f"Standard deviation of tree depths: {np.std(depths):.2f}")

In [None]:
# Re-evaluate the tuned random forest behind a scaler.
# Fitting the pipeline refits `best_rf` itself on the scaled training data.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', best_rf)
])

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Sort the labels first and convert to text afterwards, so the names follow
# the label order used by classification_report.
class_names = np.unique(y_train).astype(str)

# Label the report with the estimator actually evaluated. The stale `model`
# variable used before still pointed at the last model of an earlier loop.
print(f"\n--- Results {best_rf} ---")
print(classification_report(y_test, y_pred, target_names=class_names, zero_division=True))

In [None]:
# Refit on batch 1 plus the partner batch that scored best in the
# combined-batch sweep, and print the classification report.
batch_paths = {"ABN_B1": ABN_B1, "ABN_B2": ABN_B2, "ABN_B3": ABN_B3}

# read batch 1
df = pd.read_csv(ABN_B1)

# read combination experiment
# idxmax() picks the best-accuracy row. The previous
# `sort_values(...)['data_batch'][0]` was a label lookup and always returned
# the row labelled 0, i.e. the first sweep entry.
experiment = results.loc[results['accuracy'].idxmax(), 'data_batch']
dfc = pd.read_csv(batch_paths[experiment])

# combine
df = pd.concat([df, dfc])

# Feature and label
df = df.iloc[:, 3:].dropna()

# Resample as some level only has 1
df = df.groupby('ABN').apply(
    lambda x: x.sample(SAMPLE_SIZE, replace=True, random_state=RANDOM_STATE)
).reset_index(drop=True)

# Split to feature and label
X = df.drop(columns=['N', 'ABN', 'ABN%'])
y = df['ABN']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

# Create the pipeline
# NOTE(review): k=15 is hard-coded rather than taken from the sweep results —
# confirm it matches the chosen number of features.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', SelectKBest(score_func=f_classif, k=15)),
    ('classifier', LogisticRegression(max_iter=1000, random_state=RANDOM_STATE))
])

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Sort the labels first and convert to text afterwards, so the names follow
# the label order used by classification_report.
class_names = np.unique(y_train).astype(str)

print(f"--- Results ---")
print(classification_report(y_test, y_pred, target_names=class_names, zero_division=True))

### Feature Selection On All Data

In [None]:
# Sweep the number of SelectKBest features on the pooled data (df_all)
df = df_all.iloc[:, 3:].dropna()

# Target and feature matrix
y = df['ABN']
X = df.drop(columns=['N', 'ABN', 'ABN%'])

# Stratified 80/20 split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Keyword arguments shared by the three weighted metrics
weighted = {'average': 'weighted', 'zero_division': True}

results = []
for n_selected in range(1, len(X.columns) + 1):
    # Scale -> keep the n_selected best features -> logistic regression
    pipeline = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('feature_selection', SelectKBest(score_func=f_classif, k=n_selected)),
        ('classifier', LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)),
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    results.append({
        'data_batch': "ABN_B123",
        'number_feature': n_selected,
        'accuracy': accuracy_score(y_test, y_pred),
        'precision_weighted': precision_score(y_test, y_pred, **weighted),
        'recall_weighted': recall_score(y_test, y_pred, **weighted),
        'f1_weighted': f1_score(y_test, y_pred, **weighted),
    })

# One row per number of selected features
results = pd.DataFrame(results)

In [None]:
# Rank the pooled-data sweep by weighted F1, best first
results.sort_values('f1_weighted', ascending=False)

In [None]:
# Random-forest feature importance on the pooled data
# (`df` is the cleaned pooled frame prepared by the sweep cell above)
y = df['ABN']
X = df.drop(columns=['N', 'ABN', 'ABN%'])

# Stratified 80/20 split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Fit the forest on the training part only
model = RandomForestClassifier(random_state=RANDOM_STATE)
model.fit(X_train, y_train)

# Order the features from most to least important
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
ranked_names = np.array(X.columns)[indices]

# Horizontal bar chart of the ranked importances
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=importances[indices], y=ranked_names, hue=ranked_names, palette='viridis', ax=ax)
ax.set_title('Feature Importance')
ax.set_xlabel('Importance Score')
ax.set_ylabel('Features')
plt.show()

In [None]:
# Logistic regression on the pooled data using only two features
# Feature and label
df = df_all.iloc[:, 3:].dropna()

# Split to feature and label
X = df[['Turning', 'Burst Swimming']]
y = df['ABN']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE, stratify=y)

# Create the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(random_state=RANDOM_STATE))
])

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Sort the labels first and convert to text afterwards, so the names follow
# the label order used by classification_report (sorting the strings would
# put e.g. '10' before '5').
class_names = np.unique(y_train).astype(str)

# Message
print(f"--- Results ---")
print(classification_report(y_test, y_pred, target_names=class_names, zero_division=True))

disp = ConfusionMatrixDisplay.from_predictions(y_test, y_pred, xticks_rotation='vertical')
disp.figure_.suptitle("Confusion Matrix")
plt.show()

In [None]:
# Number of training rows per ABN level
y_train.value_counts()

In [None]:
def original_resample(y_train, y_res):
    """Draw side-by-side pie charts of two label distributions.

    Parameters
    ----------
    y_train : pandas.Series
        Labels before resampling; plotted on the left as "Original".
    y_res : pandas.Series
        Labels after resampling; plotted on the right as "Resample".

    Returns
    -------
    None
        The figure is created on the current matplotlib backend.
    """
    fig, axs = plt.subplots(ncols=2, figsize=(10, 5))
    panels = (("Original", y_train), ("Resample", y_res))
    for ax, (title, labels) in zip(axs, panels):
        # Share of each label, printed with two decimals
        labels.value_counts().plot.pie(autopct="%.2f", ax=ax)
        ax.set_title(title)
    fig.tight_layout()

In [None]:
# Sweep the number of features on the pooled data, training on a set that is
# resampled to N_RESAMPLE rows per ABN level (test set left untouched).
N_RESAMPLE = 50
results = []

# Feature and label
df = df_all.iloc[:, 3:].dropna()

# Split to feature and label
X = df.drop(columns=['N', 'ABN', 'ABN%'])
y = df['ABN']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

# Resample the training rows only. X_train.index holds index LABELS, so they
# are looked up with .loc on the cleaned frame; the old .iloc on df_all only
# worked because df_all happens to have a 0..n-1 index.
df_res = df.loc[X_train.index].groupby('ABN').apply(
    lambda x: x.sample(N_RESAMPLE, replace=True, random_state=RANDOM_STATE)
).reset_index(drop=True)

# Split to feature and label
X_res = df_res.drop(columns=['N', 'ABN', 'ABN%'])
y_res = df_res['ABN']

print(f"Original Data Number: {len(X_train)}")
print(f"Resample Data Number: {len(X_res)}")
original_resample(y_train, y_res)

# Sort the labels first and convert to text afterwards, so the names follow
# the label order used by classification_report.
class_names = np.unique(y_train).astype(str)

# Iterate using from one feature to all feature
for k in range(1, len(X.columns) + 1):
    # Create the pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('feature_selection', SelectKBest(score_func=f_classif, k=k)),
        ('classifier', LogisticRegression(max_iter=1000, random_state=RANDOM_STATE))
    ])

    # Fit the pipeline on the resampled training data
    pipeline.fit(X_res, y_res)

    # Names of the k columns kept by SelectKBest
    sel_feat = pipeline.named_steps['feature_selection'].get_feature_names_out(
        input_features=X.columns
    )

    # Make predictions
    y_pred = pipeline.predict(X_test)

    results.append({
        'data_batch': "ABN_B123",
        'number_feature': k,
        'accuracy': accuracy_score(y_test, y_pred),
        'precision_weighted': precision_score(y_test, y_pred, average='weighted', zero_division=True),
        'recall_weighted': recall_score(y_test, y_pred, average='weighted', zero_division=True),
        'f1_weighted': f1_score(y_test, y_pred, average='weighted', zero_division=True)
    })

    # Message
    print(f"\n--- Inspecting Experiment Batch 123 ---")
    print(f"Selected feature: {sel_feat}")
    print(f"--- Results --- \n--- Using {k} feature ---")
    print(classification_report(y_test, y_pred, target_names=class_names, zero_division=True))

results = pd.DataFrame(results)

In [None]:
# Rank the resampled-training sweep by test accuracy, best first
results.sort_values('accuracy', ascending=False)

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline

from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold

In [None]:
# Cross-validated recall of a two-feature logistic regression with random
# under-sampling applied inside the pipeline
df = df_all.iloc[:, 3:].dropna()

# Target and the two chosen features
y = df['ABN']
X = df[['Turning', 'Burst Swimming']]

# Under-sample first, then fit the classifier
steps = [
    ('rus_auto', RandomUnderSampler(random_state=RANDOM_STATE)),
    ('model', LogisticRegression(random_state=RANDOM_STATE)),
]
pipeline = Pipeline(steps=steps)

# evaluate pipeline: 10-fold stratified CV repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=RANDOM_STATE)
scores = cross_val_score(
    pipeline, X, y,
    scoring='recall_weighted',
    cv=cv,
    n_jobs=-1,
)
print('Mean recall: %.3f' % np.mean(scores))

In [None]:
# Sweep the number of features on the pooled data, training on a randomly
# under-sampled training set (test set left untouched).
results = []

# Feature and label
df = df_all.iloc[:, 3:].dropna()

# Split to feature and label
X = df.drop(columns=['N', 'ABN', 'ABN%'])
y = df['ABN']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

rus = RandomUnderSampler(random_state=RANDOM_STATE)

X_res, y_res = rus.fit_resample(X_train, y_train)

print(f"Original Data Number: {len(X_train)}")
print(f"Resample Data Number: {len(X_res)}")
original_resample(y_train, y_res)

# Sort the labels first and convert to text afterwards, so the names follow
# the label order used by classification_report (sorting the strings would
# put e.g. '10' before '5').
class_names = np.unique(y_train).astype(str)

# Iterate using from one feature to all feature
for k in range(1, len(X.columns) + 1):
    # Create the pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('feature_selection', SelectKBest(score_func=f_classif, k=k)),
        ('classifier', LogisticRegression(max_iter=1000, random_state=RANDOM_STATE))
    ])

    # Fit the pipeline on the under-sampled training data
    pipeline.fit(X_res, y_res)

    # Names of the k columns kept by SelectKBest
    sel_feat = pipeline.named_steps['feature_selection'].get_feature_names_out(
        input_features=X.columns
    )

    # Make predictions
    y_pred = pipeline.predict(X_test)

    results.append({
        'data_batch': "ABN_B123",
        'number_feature': k,
        'accuracy': accuracy_score(y_test, y_pred),
        'precision_weighted': precision_score(y_test, y_pred, average='weighted', zero_division=True),
        'recall_weighted': recall_score(y_test, y_pred, average='weighted', zero_division=True),
        'f1_weighted': f1_score(y_test, y_pred, average='weighted', zero_division=True)
    })

    # Message
    print(f"\n--- Inspecting Experiment Batch 123 ---")
    print(f"Selected feature: {sel_feat}")
    print(f"--- Results --- \n--- Using {k} feature ---")
    print(classification_report(y_test, y_pred, target_names=class_names, zero_division=True))

results = pd.DataFrame(results)

In [None]:
# Rank the under-sampling sweep by test accuracy, best first
results.sort_values('accuracy', ascending=False)

In [None]:
# Sweep the number of features on the pooled data, training on a randomly
# over-sampled training set (test set left untouched).
results = []

# Feature and label
df = df_all.iloc[:, 3:].dropna()

# Split to feature and label
X = df.drop(columns=['N', 'ABN', 'ABN%'])
y = df['ABN']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

ros = RandomOverSampler(random_state=RANDOM_STATE)

X_res, y_res = ros.fit_resample(X_train, y_train)

print(f"Original Data Number: {len(X_train)}")
print(f"Resample Data Number: {len(X_res)}")
original_resample(y_train, y_res)

# Sort the labels first and convert to text afterwards, so the names follow
# the label order used by classification_report (sorting the strings would
# put e.g. '10' before '5').
class_names = np.unique(y_train).astype(str)

# Iterate using from one feature to all feature
for k in range(1, len(X.columns) + 1):
    # Create the pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('feature_selection', SelectKBest(score_func=f_classif, k=k)),
        ('classifier', LogisticRegression(max_iter=1000, random_state=RANDOM_STATE))
    ])

    # Fit the pipeline on the over-sampled training data
    pipeline.fit(X_res, y_res)

    # Make predictions
    y_pred = pipeline.predict(X_test)

    # Names of the k columns kept by SelectKBest
    sel_feat = pipeline.named_steps['feature_selection'].get_feature_names_out(
        input_features=X.columns
    )

    results.append({
        'data_batch': "ABN_B123",
        'number_feature': k,
        'accuracy': accuracy_score(y_test, y_pred),
        'precision_weighted': precision_score(y_test, y_pred, average='weighted', zero_division=True),
        'recall_weighted': recall_score(y_test, y_pred, average='weighted', zero_division=True),
        'f1_weighted': f1_score(y_test, y_pred, average='weighted', zero_division=True)
    })

    # Message
    print(f"\n--- Inspecting Experiment Batch 123 ---")
    print(f"Selected feature: {sel_feat}")
    print(f"--- Results --- \n--- Using {k} feature ---")
    print(classification_report(y_test, y_pred, target_names=class_names, zero_division=True))

results = pd.DataFrame(results)

In [None]:
# Rank the over-sampling sweep by test accuracy, best first
results.sort_values('accuracy', ascending=False)

In [None]:
# List the column names of the cleaned pooled frame
df.columns