In [1]:
import cv2
import os
import numpy as np
import matplotlib.pyplot as plt
from ultralytics import YOLO
from keras.models import Sequential, model_from_json, load_model
import glob
import time

In [2]:
def mass_center(img,is_round=True):
    Y = img.mean(axis=1)
    X = img.mean(axis=0)
    Y_ = np.sum(np.arange(Y.shape[0]) * Y)/np.sum(Y)
    X_ = np.sum(np.arange(X.shape[0]) * X)/np.sum(X)
    if is_round:
        return int(round(X_)),int(round(Y_))
    return X_,Y_

def image_extract(img,newsize):
    if (len(np.where(img.mean(axis=0)!=0)[0]) != 0):
        x_s = np.where(img.mean(axis=0)!=0)[0].min()
        x_e = np.where(img.mean(axis=0)!=0)[0].max()
        
        y_s = np.where(img.mean(axis=1)!=0)[0].min()
        y_e = np.where(img.mean(axis=1)!=0)[0].max()
        
        x_c,_ = mass_center(img)
        x_s = x_c-newsize[1]//2
        x_e = x_c+newsize[1]//2
        img = img[y_s:y_e,x_s if x_s>0 else 0:x_e if x_e<img.shape[1] else img.shape[1]]
        return cv2.resize(img,newsize)
    else:
        return 0

In [3]:
lables = {'Abhirami': 0,
            'Aswathy': 1,
            'Ayana': 2,
            'Lekshmi': 3,
            'Nandana': 4,
            'Parthiv': 5,
            'Shilpa': 6,}

In [4]:
vid_file = '../../dataset/aswathy/7.mp4'

vidcap = cv2.VideoCapture(vid_file)
length = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
success,image = vidcap.read()
count = 0
mask_count = 0
thresh = 127
message = 'Analyzing...'
m2 = ' '
fps = 0

pretrained_weight = './yolov8x-seg.pt'
model = YOLO(pretrained_weight)
new_model = load_model('./fine_tuned.h5')

mask_dir = './output/masks/'
mask_files = glob.glob(mask_dir+ '/*')
for f in mask_files:
    os.remove(f)

while success:
    start = time.time()
    if(count%4 == 0):
        results = model(image)
        masks = results[0].masks
        if masks:
            m2 = 'YOLO v8 = Inference: {0} ms | Preprocess: {1} ms | Postprocess: {2} ms' .format(round(results[0].speed['inference'], 2), round(results[0].speed['preprocess'], 2), round(results[0].speed['postprocess'], 2))
            mask_count = mask_count+1
            ms = masks.data.numpy()
            cv2.imwrite(mask_dir + str(count) + '.png', ms[0,:,:]*255)
        
    if(mask_count >= 10):
        image_data = []
        files = os.listdir('./output/masks/')
        for f in files:
            im = cv2.imread('./output/masks/'+'/'+f, 0)
            im_bw = cv2.threshold(im, thresh, 255, cv2.THRESH_BINARY)[1]
            item = image_extract(im_bw, (64, 128))
            if(np.max(item) != 0):
                image_data.append(item)
        gei = np.mean(image_data,axis=0)

        res_img = cv2.resize(gei, (224,224))
        test_img = cv2.merge([res_img, res_img, res_img])
        test_img = test_img/255
        test_img = np.reshape(test_img, (1, 224, 224, 3))
        
        preds = new_model.predict(test_img)
        print(preds)
        prediction = np.argmax(preds)
        print(prediction)
        for name, label in lables.items():
            if label == prediction:
                message = 'Detected : ' + name + ' (' + str(round(preds[0][prediction]*100, 2)) + '% Accuracy)'

    success,image = vidcap.read()
    count += 1

    mask_out = cv2.merge([ms[0,:,:]*255, ms[0,:,:]*255, ms[0,:,:]*255])

    font = cv2.FONT_HERSHEY_SIMPLEX
    org = (25, 35)
    org1 = (25, 60)
    org2 = (25, 330)
    org3 = (495, 330)
    fontScale = 0.8
    fontScale2 = 0.45
    fontScale3 = 0.65
    color = (0, 0, 0)
    color2 = (0, 0, 255)
    color3 = (255, 255, 255)
    color4 = (255, 0, 255)
    color5 = (0, 255, 0)
    thickness = 2
    thickness2 = 1

    title1 = 'Live'
    org4 = (25, 30)
    title2 = 'YOLO v8 generated mask'

    fps_msg = 'FPS: ' + str(round(fps/1000, 2))
    image = cv2.putText(image, fps_msg, org3, font, 
                   fontScale3, color2, thickness, cv2.LINE_AA)
    image = cv2.putText(image, title1, org4, font, 
                   fontScale3, color2, thickness, cv2.LINE_AA)
        
    mask_out = cv2.putText(mask_out, title2, org4, font, 
                   fontScale3, color4, thickness, cv2.LINE_AA)
    
    # mask_out = cv2.putText(mask_out, m2, org2, font, 
    #                 fontScale2, color5, thickness2, cv2.LINE_AA)
    
    prompt = np.zeros((95, 640, 3))*255
    prompt = cv2.putText(prompt, message, org, font, 
                    fontScale, color5, thickness, cv2.LINE_AA)
    prompt = cv2.putText(prompt, m2, org1, font, 
                    fontScale2, color3, thickness2, cv2.LINE_AA)
    if(image is None):
        cv2.destroyAllWindows()
    if(image is not None):
        # cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)/255
        disp = np.concatenate((cv2.resize(image/255, (640, 280)), cv2.resize(mask_out, (640, 280))), axis=0)
        info = np.concatenate((disp, prompt), axis=0)
        cv2.imshow('Live', info)
        key = cv2.waitKey(1)
        if key == 27:
            cv2.destroyAllWindows()
            break
    end = time.time()
    seconds = end - start 
    fps  = length / seconds


0: 352x640 1 person, 971.9ms
Speed: 6.5ms preprocess, 971.9ms inference, 24.0ms postprocess per image at shape (1, 3, 640, 640)

0: 352x640 1 person, 2 handbags, 962.3ms
Speed: 0.0ms preprocess, 962.3ms inference, 8.0ms postprocess per image at shape (1, 3, 640, 640)

0: 352x640 1 person, 1 handbag, 908.3ms
Speed: 0.0ms preprocess, 908.3ms inference, 8.0ms postprocess per image at shape (1, 3, 640, 640)

0: 352x640 1 person, 1 handbag, 901.8ms
Speed: 0.0ms preprocess, 901.8ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)

0: 352x640 1 person, 1 handbag, 931.1ms
Speed: 0.0ms preprocess, 931.1ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)

0: 352x640 1 person, 1 handbag, 888.4ms
Speed: 0.0ms preprocess, 888.4ms inference, 8.0ms postprocess per image at shape (1, 3, 640, 640)

0: 352x640 1 person, 3 handbags, 928.4ms
Speed: 4.5ms preprocess, 928.4ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)

0: 352x640 1 person, 1 backpack, 1

[[ 7.7394e-06   1.519e-07    0.050421  1.5999e-07     0.94957  1.6467e-06  3.4169e-07]]
4
[[ 7.7394e-06   1.519e-07    0.050421  1.5999e-07     0.94957  1.6467e-06  3.4169e-07]]
4
[[ 7.7394e-06   1.519e-07    0.050421  1.5999e-07     0.94957  1.6467e-06  3.4169e-07]]
4





[[ 7.7394e-06   1.519e-07    0.050421  1.5999e-07     0.94957  1.6467e-06  3.4169e-07]]
4


0: 352x640 1 person, 1 handbag, 999.2ms
Speed: 0.0ms preprocess, 999.2ms inference, 4.6ms postprocess per image at shape (1, 3, 640, 640)


[[ 7.9268e-05  0.00057685      0.1002  2.2322e-07     0.89914  5.7563e-07  3.7755e-06]]
4
[[ 7.9268e-05  0.00057685      0.1002  2.2322e-07     0.89914  5.7563e-07  3.7755e-06]]
4
[[ 7.9268e-05  0.00057685      0.1002  2.2322e-07     0.89914  5.7563e-07  3.7755e-06]]
4
[[ 7.9268e-05  0.00057685      0.1002  2.2322e-07     0.89914  5.7563e-07  3.7755e-06]]
4



0: 352x640 1 person, 1 handbag, 895.8ms
Speed: 0.0ms preprocess, 895.8ms inference, 2.6ms postprocess per image at shape (1, 3, 640, 640)


[[ 7.6345e-05  0.00014525    0.071681  1.8485e-07     0.92809  2.4738e-07  4.6869e-06]]
4
[[ 7.6345e-05  0.00014525    0.071681  1.8485e-07     0.92809  2.4738e-07  4.6869e-06]]
4
[[ 7.6345e-05  0.00014525    0.071681  1.8485e-07     0.92809  2.4738e-07  4.6869e-06]]
4
[[ 7.6345e-05  0.00014525    0.071681  1.8485e-07     0.92809  2.4738e-07  4.6869e-06]]
4



0: 352x640 1 person, 886.9ms
Speed: 0.0ms preprocess, 886.9ms inference, 8.0ms postprocess per image at shape (1, 3, 640, 640)


[[ 3.9352e-05  2.4174e-05     0.12066  1.1169e-07     0.87927  4.1283e-07  6.0163e-06]]
4
[[ 3.9352e-05  2.4174e-05     0.12066  1.1169e-07     0.87927  4.1283e-07  6.0163e-06]]
4
[[ 3.9352e-05  2.4174e-05     0.12066  1.1169e-07     0.87927  4.1283e-07  6.0163e-06]]
4





[[ 3.9352e-05  2.4174e-05     0.12066  1.1169e-07     0.87927  4.1283e-07  6.0163e-06]]
4


0: 352x640 1 person, 840.5ms
Speed: 2.6ms preprocess, 840.5ms inference, 8.5ms postprocess per image at shape (1, 3, 640, 640)


[[ 0.00012901  7.4681e-05     0.60797   7.676e-06     0.39144  0.00013249  0.00024995]]
2
[[ 0.00012901  7.4681e-05     0.60797   7.676e-06     0.39144  0.00013249  0.00024995]]
2
[[ 0.00012901  7.4681e-05     0.60797   7.676e-06     0.39144  0.00013249  0.00024995]]
2





[[ 0.00012901  7.4681e-05     0.60797   7.676e-06     0.39144  0.00013249  0.00024995]]
2


0: 352x640 1 person, 872.5ms
Speed: 0.0ms preprocess, 872.5ms inference, 5.0ms postprocess per image at shape (1, 3, 640, 640)


[[ 0.00019385   3.558e-05     0.52138  1.6943e-05     0.47798  0.00021222  0.00018894]]
2
[[ 0.00019385   3.558e-05     0.52138  1.6943e-05     0.47798  0.00021222  0.00018894]]
2
[[ 0.00019385   3.558e-05     0.52138  1.6943e-05     0.47798  0.00021222  0.00018894]]
2





[[ 0.00019385   3.558e-05     0.52138  1.6943e-05     0.47798  0.00021222  0.00018894]]
2


0: 352x640 1 person, 910.7ms
Speed: 4.7ms preprocess, 910.7ms inference, 0.6ms postprocess per image at shape (1, 3, 640, 640)


[[ 0.00012256  0.00053768     0.14848  3.8563e-06     0.84985   8.441e-05   0.0009203]]
4
[[ 0.00012256  0.00053768     0.14848  3.8563e-06     0.84985   8.441e-05   0.0009203]]
4
[[ 0.00012256  0.00053768     0.14848  3.8563e-06     0.84985   8.441e-05   0.0009203]]
4





[[ 0.00012256  0.00053768     0.14848  3.8563e-06     0.84985   8.441e-05   0.0009203]]
4


0: 352x640 1 person, 898.1ms
Speed: 0.0ms preprocess, 898.1ms inference, 8.0ms postprocess per image at shape (1, 3, 640, 640)


[[ 0.00010548  0.00071749     0.24661  5.2604e-06     0.75056  0.00019912   0.0017962]]
4
[[ 0.00010548  0.00071749     0.24661  5.2604e-06     0.75056  0.00019912   0.0017962]]
4
[[ 0.00010548  0.00071749     0.24661  5.2604e-06     0.75056  0.00019912   0.0017962]]
4





[[ 0.00010548  0.00071749     0.24661  5.2604e-06     0.75056  0.00019912   0.0017962]]
4


0: 352x640 1 person, 1 backpack, 927.2ms
Speed: 0.0ms preprocess, 927.2ms inference, 6.4ms postprocess per image at shape (1, 3, 640, 640)


[[ 0.00065457    0.091962     0.13053  1.4634e-05     0.69694    0.052771    0.027126]]
4
[[ 0.00065457    0.091962     0.13053  1.4634e-05     0.69694    0.052771    0.027126]]
4
[[ 0.00065457    0.091962     0.13053  1.4634e-05     0.69694    0.052771    0.027126]]
4
[[ 0.00065457    0.091962     0.13053  1.4634e-05     0.69694    0.052771    0.027126]]
4



0: 352x640 1 person, 1 backpack, 986.2ms
Speed: 0.0ms preprocess, 986.2ms inference, 7.7ms postprocess per image at shape (1, 3, 640, 640)


[[ 9.7986e-05     0.14005    0.064582  3.0734e-06     0.65028    0.040938     0.10405]]
4
[[ 9.7986e-05     0.14005    0.064582  3.0734e-06     0.65028    0.040938     0.10405]]
4
[[ 9.7986e-05     0.14005    0.064582  3.0734e-06     0.65028    0.040938     0.10405]]
4
[[ 9.7986e-05     0.14005    0.064582  3.0734e-06     0.65028    0.040938     0.10405]]
4



0: 352x640 1 person, 1 backpack, 883.0ms
Speed: 0.0ms preprocess, 883.0ms inference, 8.6ms postprocess per image at shape (1, 3, 640, 640)


[[ 2.4086e-05     0.33518    0.020065  1.8473e-06     0.28411    0.079359     0.28126]]
1
[[ 2.4086e-05     0.33518    0.020065  1.8473e-06     0.28411    0.079359     0.28126]]
1
[[ 2.4086e-05     0.33518    0.020065  1.8473e-06     0.28411    0.079359     0.28126]]
1





[[ 2.4086e-05     0.33518    0.020065  1.8473e-06     0.28411    0.079359     0.28126]]
1


0: 352x640 1 person, 1 backpack, 941.9ms
Speed: 0.0ms preprocess, 941.9ms inference, 4.3ms postprocess per image at shape (1, 3, 640, 640)


[[ 6.7868e-06     0.30476   0.0087531  7.9897e-08     0.46447    0.056194     0.16582]]
4
[[ 6.7868e-06     0.30476   0.0087531  7.9897e-08     0.46447    0.056194     0.16582]]
4
[[ 6.7868e-06     0.30476   0.0087531  7.9897e-08     0.46447    0.056194     0.16582]]
4
[[ 6.7868e-06     0.30476   0.0087531  7.9897e-08     0.46447    0.056194     0.16582]]
4



0: 352x640 1 person, 1 backpack, 899.2ms
Speed: 4.7ms preprocess, 899.2ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 640)


[[ 4.1707e-06      0.3558   0.0095978  1.3413e-07     0.13922     0.21498     0.28039]]
1
[[ 4.1707e-06      0.3558   0.0095978  1.3413e-07     0.13922     0.21498     0.28039]]
1
[[ 4.1707e-06      0.3558   0.0095978  1.3413e-07     0.13922     0.21498     0.28039]]
1
[[ 4.1707e-06      0.3558   0.0095978  1.3413e-07     0.13922     0.21498     0.28039]]
1



0: 352x640 1 person, 1 handbag, 860.6ms
Speed: 0.0ms preprocess, 860.6ms inference, 5.7ms postprocess per image at shape (1, 3, 640, 640)


[[ 1.5336e-06    0.040051  0.00022927  1.1685e-07   0.0030678     0.91534    0.041311]]
5
[[ 1.5336e-06    0.040051  0.00022927  1.1685e-07   0.0030678     0.91534    0.041311]]
5
[[ 1.5336e-06    0.040051  0.00022927  1.1685e-07   0.0030678     0.91534    0.041311]]
5
[[ 1.5336e-06    0.040051  0.00022927  1.1685e-07   0.0030678     0.91534    0.041311]]





5


0: 352x640 1 person, 1 backpack, 906.1ms
Speed: 0.0ms preprocess, 906.1ms inference, 5.0ms postprocess per image at shape (1, 3, 640, 640)


[[ 5.2808e-05     0.24006   0.0096156  5.7298e-06    0.029353      0.6625    0.058413]]
5
[[ 5.2808e-05     0.24006   0.0096156  5.7298e-06    0.029353      0.6625    0.058413]]
5
[[ 5.2808e-05     0.24006   0.0096156  5.7298e-06    0.029353      0.6625    0.058413]]
5
[[ 5.2808e-05     0.24006   0.0096156  5.7298e-06    0.029353      0.6625    0.058413]]





5


0: 352x640 1 person, 847.1ms
Speed: 0.0ms preprocess, 847.1ms inference, 8.0ms postprocess per image at shape (1, 3, 640, 640)


[[ 2.9867e-06   0.0033415    0.010116  2.3021e-06    0.014209     0.67784     0.29449]]
5
[[ 2.9867e-06   0.0033415    0.010116  2.3021e-06    0.014209     0.67784     0.29449]]
5
[[ 2.9867e-06   0.0033415    0.010116  2.3021e-06    0.014209     0.67784     0.29449]]
5
