We basically will:
1. load all the images with their label according to the name of their folder
2. Detect the person from each image, we skip image if no person detected
3. Resize images to 128x128 pixels
4. Extract features from each image using VGG16 model
5. Save images (features), and their labels (target class)

In [1]:
import os
import cv2
import numpy as np
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from ultralytics import YOLO
import joblib

# --- 1. Data Loading and Feature Extraction ---

dataset_dir = 'dataset'  # Replace with your dataset path
img_size = (128, 128) 

# Load YOLOv8 model 
model_yolo = YOLO('yolov8m.pt') 

def load_data(dir_path):
    """Loads images and labels from directory structure, cropping people with YOLO."""
    images = []
    labels = []
    class_names = sorted(os.listdir(dir_path))
    for i, class_name in enumerate(class_names):
        class_dir = os.path.join(dir_path, class_name)
        for img_name in os.listdir(class_dir):
            img_path = os.path.join(class_dir, img_name)

            # Load image
            img = cv2.imread(img_path)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 

            # Object Detection with YOLOv8
            results = model_yolo(img) 

            # Get bounding box coordinates of the person
            x1, y1, x2, y2 = 0, 0, 0, 0 # Initialize coordinates 
            for r in results:
                boxes = r.boxes.xyxy.tolist()  # Bounding boxes
                classes = r.boxes.cls.tolist()  # Class indices
                for box, cls in zip(boxes, classes):
                    if cls == 0:  # Assuming 'person' is class 0 
                        x1, y1, x2, y2 = map(int, box) 
                        break  # Assuming only one person per image 

            # Square Cropping
            if x1 == 0 and y1 == 0 and x2 == 0 and y2 == 0:
                print(f"Warning: Person not detected in {img_path}. Skipping image.")
                continue  # Skip to the next image

            center_x = (x1 + x2) // 2
            center_y = (y1 + y2) // 2
            crop_size = max(x2 - x1, y2 - y1)  # Size of the larger side

            crop_x1 = max(0, center_x - crop_size // 2)
            crop_y1 = max(0, center_y - crop_size // 2)
            crop_x2 = min(img.shape[1], crop_x1 + crop_size)
            crop_y2 = min(img.shape[0], crop_y1 + crop_size)

            cropped_img = img[crop_y1:crop_y2, crop_x1:crop_x2]

            # Resize cropped image to desired size
            cropped_img = cv2.resize(cropped_img, img_size)

            # --- Preprocessing for VGG16 ---
            cropped_img = image.img_to_array(cropped_img) # Convert to array
            cropped_img = preprocess_input(cropped_img) 
            images.append(cropped_img)
            labels.append(i) 
        print(f"{class_name} folder is finished \n---------------------------------------------------------------------------------------------------------------------------------------\n---------------------------------------------------------------------------------------------------------------------------------------")
    
    return np.array(images), np.array(labels), class_names

In [2]:
# Load data 
X, y, class_names = load_data(dataset_dir)
print(f"Number of images {len(X)}")
print(f"Number of labels {len(y)}")



0: 384x640 1 person, 1 vase, 921.4ms
Speed: 11.8ms preprocess, 921.4ms inference, 1228.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 vase, 849.2ms
Speed: 7.5ms preprocess, 849.2ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 vase, 895.5ms
Speed: 3.0ms preprocess, 895.5ms inference, 3.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 vase, 752.5ms
Speed: 3.0ms preprocess, 752.5ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 vase, 826.8ms
Speed: 2.0ms preprocess, 826.8ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 vase, 762.5ms
Speed: 2.0ms preprocess, 762.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 vase, 675.7ms
Speed: 3.0ms preprocess, 675.7ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 vase, 638.6ms
Speed:

Let's create our feature extracting VGG16 model:

In [3]:
# Load VGG16 model (without top)
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(img_size[0], img_size[1], 3)) 

# Extract features
features = base_model.predict(X) 
features = features.reshape(features.shape[0], -1) 

[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m232s[0m 916ms/step


Since some classes have more than other classes, we might want to balance all the classes so there is no bias towards one over another!

In [4]:
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=42)
X_balanced, y_balanced = smote_enn.fit_resample(X, y)

Let's split our data into training and testing set:

In [5]:
# --- 2. Data Splitting and Scaling ---

X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.3, shuffle=True,random_state=42)

# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [6]:
print(f"{X_train.shape}, {X_test.shape}")

(14004, 8192), (6002, 8192)


We tried to scale data but apparently it doesn't make any difference!

### SVM

In [7]:
# --- 3. Model Training (SVM) ---

model = SVC(kernel='rbf', C=1.0, random_state=42) 
model.fit(X_train, y_train)

# --- 4. Model Evaluation ---

y_pred = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=class_names)) 
print(confusion_matrix(y_test, y_pred)) 

Accuracy: 1.0
                  precision    recall  f1-score   support

      distracted       1.00      1.00      1.00       837
         fatigue       1.00      1.00      1.00       853
         focused       1.00      1.00      1.00       887
      raise_hand       1.00      1.00      1.00       848
        sleeping       1.00      1.00      1.00       892
using_smartphone       1.00      1.00      1.00       865
 writing_reading       1.00      1.00      1.00       820

        accuracy                           1.00      6002
       macro avg       1.00      1.00      1.00      6002
    weighted avg       1.00      1.00      1.00      6002

[[837   0   0   0   0   0   0]
 [  0 853   0   0   0   0   0]
 [  0   0 887   0   0   0   0]
 [  0   0   0 848   0   0   0]
 [  0   0   0   0 892   0   0]
 [  0   0   0   0   0 865   0]
 [  0   0   0   0   0   0 820]]


### KNN

In [8]:
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors=5)  # Choose a value for k
knn_model.fit(X_train, y_train) 

# ... (Evaluate the model)
y_pred_knn = knn_model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn, target_names=class_names)) 
print(confusion_matrix(y_test, y_pred_knn)) 

Accuracy: 1.0
                  precision    recall  f1-score   support

      distracted       1.00      1.00      1.00       837
         fatigue       1.00      1.00      1.00       853
         focused       1.00      1.00      1.00       887
      raise_hand       1.00      1.00      1.00       848
        sleeping       1.00      1.00      1.00       892
using_smartphone       1.00      1.00      1.00       865
 writing_reading       1.00      1.00      1.00       820

        accuracy                           1.00      6002
       macro avg       1.00      1.00      1.00      6002
    weighted avg       1.00      1.00      1.00      6002

[[837   0   0   0   0   0   0]
 [  0 853   0   0   0   0   0]
 [  0   0 887   0   0   0   0]
 [  0   0   0 848   0   0   0]
 [  0   0   0   0 892   0   0]
 [  0   0   0   0   0 865   0]
 [  0   0   0   0   0   0 820]]


### Decision Tree

In [9]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
# ... (Evaluate the model)
y_pred_dt = dt_model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt, target_names=class_names)) 
print(confusion_matrix(y_test, y_pred_dt)) 

Accuracy: 0.9968343885371543
                  precision    recall  f1-score   support

      distracted       1.00      1.00      1.00       837
         fatigue       1.00      1.00      1.00       853
         focused       0.99      1.00      0.99       887
      raise_hand       1.00      1.00      1.00       848
        sleeping       1.00      0.99      1.00       892
using_smartphone       1.00      1.00      1.00       865
 writing_reading       1.00      1.00      1.00       820

        accuracy                           1.00      6002
       macro avg       1.00      1.00      1.00      6002
    weighted avg       1.00      1.00      1.00      6002

[[834   0   3   0   0   0   0]
 [  0 851   0   1   0   1   0]
 [  1   0 885   0   0   0   1]
 [  0   0   1 846   0   0   1]
 [  0   4   2   0 885   1   0]
 [  2   0   0   0   0 863   0]
 [  0   0   1   0   0   0 819]]


### Random Forest

In [10]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)  # Adjust n_estimators as needed
rf_model.fit(X_train, y_train)
# ... (Evaluate the model)
y_pred_rf = rf_model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf, target_names=class_names)) 
print(confusion_matrix(y_test, y_pred_rf)) 

Accuracy: 1.0
                  precision    recall  f1-score   support

      distracted       1.00      1.00      1.00       837
         fatigue       1.00      1.00      1.00       853
         focused       1.00      1.00      1.00       887
      raise_hand       1.00      1.00      1.00       848
        sleeping       1.00      1.00      1.00       892
using_smartphone       1.00      1.00      1.00       865
 writing_reading       1.00      1.00      1.00       820

        accuracy                           1.00      6002
       macro avg       1.00      1.00      1.00      6002
    weighted avg       1.00      1.00      1.00      6002

[[837   0   0   0   0   0   0]
 [  0 853   0   0   0   0   0]
 [  0   0 887   0   0   0   0]
 [  0   0   0 848   0   0   0]
 [  0   0   0   0 892   0   0]
 [  0   0   0   0   0 865   0]
 [  0   0   0   0   0   0 820]]


### Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(random_state=42) 
lr_model.fit(X_train, y_train)
# ... (Evaluate the model)
y_pred_lr = lr_model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr, target_names=class_names)) 
print(confusion_matrix(y_test, y_pred_lr)) 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 1.0
                  precision    recall  f1-score   support

      distracted       1.00      1.00      1.00       837
         fatigue       1.00      1.00      1.00       853
         focused       1.00      1.00      1.00       887
      raise_hand       1.00      1.00      1.00       848
        sleeping       1.00      1.00      1.00       892
using_smartphone       1.00      1.00      1.00       865
 writing_reading       1.00      1.00      1.00       820

        accuracy                           1.00      6002
       macro avg       1.00      1.00      1.00      6002
    weighted avg       1.00      1.00      1.00      6002

[[837   0   0   0   0   0   0]
 [  0 853   0   0   0   0   0]
 [  0   0 887   0   0   0   0]
 [  0   0   0 848   0   0   0]
 [  0   0   0   0 892   0   0]
 [  0   0   0   0   0 865   0]
 [  0   0   0   0   0   0 820]]


### Evaluate on unseen data:

In [32]:
# --- 5. Prediction on a New Image ---


base_model = VGG16(weights='imagenet', include_top=False, input_shape=(img_size[0], img_size[1], 3)) 
    
def predict_image(image_path, model, class_names):
    """Predicts the class of a single image."""
    img = cv2.imread(image_path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 

    # Object Detection with YOLOv8
    results = model_yolo(img) 

     # Get bounding box coordinates of the person
    x1, y1, x2, y2 = 0, 0, 0, 0 # Initialize coordinates 
    for r in results:
        boxes = r.boxes.xyxy.tolist()  # Bounding boxes
        classes = r.boxes.cls.tolist()  # Class indices
        for box, cls in zip(boxes, classes):
            if cls == 0:  # Assuming 'person' is class 0 
                x1, y1, x2, y2 = map(int, box) 
                break  # Assuming only one person per image 

    # Square Cropping 
    center_x = (x1 + x2) // 2
    center_y = (y1 + y2) // 2
    crop_size = max(x2 - x1, y2 - y1)  # Size of the larger side

    crop_x1 = max(0, center_x - crop_size // 2)
    crop_y1 = max(0, center_y - crop_size // 2)
    crop_x2 = min(img.shape[1], crop_x1 + crop_size)
    crop_y2 = min(img.shape[0], crop_y1 + crop_size)

    cropped_img = img[crop_y1:crop_y2, crop_x1:crop_x2]

    # Resize cropped image to desired size
    cropped_img = cv2.resize(cropped_img, img_size)
    # Preprocess for VGG16
    cropped_img = image.img_to_array(cropped_img) 
    cropped_img = preprocess_input(cropped_img)
    cropped_img = np.expand_dims(cropped_img, axis=0) 

    # Load VGG16 model (without top)
    features = base_model.predict(cropped_img)
    features = features.reshape(features.shape[0], -1) 
    # features = scaler.transform(features)

    prediction = model.predict(features)[0]  
    predicted_class = class_names[prediction] 

    return predicted_class

new_image_path = 'uploads/bored2.jpg' # Replace with your image path
predicted_class = predict_image(new_image_path, lr_model, class_names)
print("Predicted Class:", predicted_class)


0: 384x640 1 person, 858.1ms
Speed: 2.0ms preprocess, 858.1ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 248ms/step
Predicted Class: fatigue


I tried all the possible ways that i could thing of to have a good accuracy of the model, and making sure that the model is performing well on unseen data, but I always fell into overfitting again. In this notebook I found out that even when I had overfitting, I noticed that logistic regression model and svm model are actually smart and perform well across majority of labels except for "using phone" and "raising hand".

In [33]:
joblib.dump(model, "svm_3.pkl") # kinda good
joblib.dump(knn_model, "knn_3.pkl") #dumb
joblib.dump(dt_model, "dt_3.pkl") # Very stupid
joblib.dump(lr_model, "lr_3.pkl") # good to be fair
joblib.dump(rf_model, "rf_3.pkl") # bad

['rf_3.pkl']