In [416]:
import cv2
import os
import numpy as np
import matplotlib.pyplot as plt
from ultralytics import YOLO
from keras.models import Sequential, model_from_json, load_model
import glob
import time

In [417]:
def mass_center(img,is_round=True):
    Y = img.mean(axis=1)
    X = img.mean(axis=0)
    Y_ = np.sum(np.arange(Y.shape[0]) * Y)/np.sum(Y)
    X_ = np.sum(np.arange(X.shape[0]) * X)/np.sum(X)
    if is_round:
        return int(round(X_)),int(round(Y_))
    return X_,Y_

def image_extract(img,newsize):
    if (len(np.where(img.mean(axis=0)!=0)[0]) != 0):
        x_s = np.where(img.mean(axis=0)!=0)[0].min()
        x_e = np.where(img.mean(axis=0)!=0)[0].max()
        
        y_s = np.where(img.mean(axis=1)!=0)[0].min()
        y_e = np.where(img.mean(axis=1)!=0)[0].max()
        
        x_c,_ = mass_center(img)
        x_s = x_c-newsize[1]//2
        x_e = x_c+newsize[1]//2
        img = img[y_s:y_e,x_s if x_s>0 else 0:x_e if x_e<img.shape[1] else img.shape[1]]
        return cv2.resize(img,newsize)
    else:
        return 0

In [418]:
lables = {'Abhirami': 0,
            'Aswathy': 1,
            'Ayana': 2,
            'Lekshmi': 3,
            'Nandana': 4,
            'Shilpa': 5}

In [419]:
vid_file = './dataset/abhirami/6.mp4'

vidcap = cv2.VideoCapture(vid_file)
length = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
success,image = vidcap.read()
count = 0
mask_count = 0
thresh = 127
message = 'Analyzing...'
m2 = ' '
fps = 0

pretrained_weight = './yolov8x-seg.pt'
model = YOLO(pretrained_weight)
new_model = load_model('./fine_tuned.h5')

mask_dir = './output/masks/'
mask_files = glob.glob(mask_dir+ '/*')
for f in mask_files:
    os.remove(f)

while success:
    start = time.time()
    if(count%4 == 0):
        results = model(image)
        masks = results[0].masks
        if masks:
            m2 = 'YOLO v8 = Inference: {0} ms | Preprocess: {1} ms | Postprocess: {2} ms' .format(round(results[0].speed['inference'], 2), round(results[0].speed['preprocess'], 2), round(results[0].speed['postprocess'], 2))
            mask_count = mask_count+1
            ms = masks.data.numpy()
            cv2.imwrite(mask_dir + str(count) + '.png', ms[0,:,:]*255)
        
    if(mask_count >= 10):
        image_data = []
        files = os.listdir('./output/masks/')
        for f in files:
            im = cv2.imread('./output/masks/'+'/'+f, 0)
            im_bw = cv2.threshold(im, thresh, 255, cv2.THRESH_BINARY)[1]
            item = image_extract(im_bw, (64, 128))
            if(np.max(item) != 0):
                image_data.append(item)
        gei = np.mean(image_data,axis=0)

        res_img = cv2.resize(gei, (224,224))
        test_img = cv2.merge([res_img, res_img, res_img])
        test_img = test_img/255
        test_img = np.reshape(test_img, (1, 224, 224, 3))
        
        preds = new_model.predict(test_img)
        print(preds)
        prediction = np.argmax(preds)
        print(prediction)
        for name, label in lables.items():
            if label == prediction:
                message = 'Detected : ' + name + ' (' + str(round(preds[0][prediction]*100, 2)) + '% Accuracy)'

    success,image = vidcap.read()
    count += 1

    mask_out = cv2.merge([ms[0,:,:]*255, ms[0,:,:]*255, ms[0,:,:]*255])

    font = cv2.FONT_HERSHEY_SIMPLEX
    org = (25, 35)
    org1 = (25, 60)
    org2 = (25, 330)
    org3 = (495, 320)
    fontScale = 0.8
    fontScale2 = 0.45
    fontScale3 = 0.65
    color = (0, 0, 0)
    color2 = (0, 0, 255)
    color3 = (255, 255, 255)
    color4 = (255, 0, 255)
    color5 = (0, 255, 0)
    thickness = 2
    thickness2 = 1

    title1 = 'Live'
    org4 = (25, 30)
    title2 = 'YOLO v8 generated mask'

    fps_msg = 'FPS: ' + str(round(fps/1000, 2))
    image = cv2.putText(image, fps_msg, org3, font, 
                   fontScale3, color2, thickness, cv2.LINE_AA)
    image = cv2.putText(image, title1, org4, font, 
                   fontScale3, color2, thickness, cv2.LINE_AA)
    
    mask_out = cv2.putText(mask_out, title2, org4, font, 
                   fontScale3, color4, thickness, cv2.LINE_AA)
    
    # mask_out = cv2.putText(mask_out, m2, org2, font, 
    #                 fontScale2, color5, thickness2, cv2.LINE_AA)
    
    prompt = np.zeros((95, 640, 3))*255
    prompt = cv2.putText(prompt, message, org, font, 
                    fontScale, color5, thickness, cv2.LINE_AA)
    prompt = cv2.putText(prompt, m2, org1, font, 
                    fontScale2, color3, thickness2, cv2.LINE_AA)
    if(image is None):
        cv2.destroyAllWindows()
    if(image is not None):
        # cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)/255
        disp = np.concatenate((image/255, mask_out), axis=0)
        info = np.concatenate((disp, prompt), axis=0)
        cv2.imshow('Live', info)
        key = cv2.waitKey(1)
        if key == 27:
            cv2.destroyAllWindows()
            break
    end = time.time()
    seconds = end - start 
    fps  = length / seconds


0: 352x640 1 person, 936.1ms
Speed: 4.0ms preprocess, 936.1ms inference, 25.8ms postprocess per image at shape (1, 3, 640, 640)

0: 352x640 1 person, 805.4ms
Speed: 1.0ms preprocess, 805.4ms inference, 4.4ms postprocess per image at shape (1, 3, 640, 640)

0: 352x640 1 person, 799.4ms
Speed: 1.0ms preprocess, 799.4ms inference, 5.0ms postprocess per image at shape (1, 3, 640, 640)

0: 352x640 1 person, 763.3ms
Speed: 1.5ms preprocess, 763.3ms inference, 4.5ms postprocess per image at shape (1, 3, 640, 640)

0: 352x640 1 person, 757.3ms
Speed: 0.9ms preprocess, 757.3ms inference, 4.5ms postprocess per image at shape (1, 3, 640, 640)

0: 352x640 1 person, 773.5ms
Speed: 2.0ms preprocess, 773.5ms inference, 5.0ms postprocess per image at shape (1, 3, 640, 640)

0: 352x640 1 person, 791.4ms
Speed: 1.0ms preprocess, 791.4ms inference, 6.3ms postprocess per image at shape (1, 3, 640, 640)

0: 352x640 1 person, 838.5ms
Speed: 0.9ms preprocess, 838.5ms inference, 4.9ms postprocess per image a

[[     0.1063  1.8704e-12  1.0976e-07     0.18573     0.70795  1.1917e-05]]
4
[[     0.1063  1.8704e-12  1.0976e-07     0.18573     0.70795  1.1917e-05]]
4
[[     0.1063  1.8704e-12  1.0976e-07     0.18573     0.70795  1.1917e-05]]
4





[[     0.1063  1.8704e-12  1.0976e-07     0.18573     0.70795  1.1917e-05]]
4


0: 352x640 1 person, 1 toilet, 804.8ms
Speed: 1.9ms preprocess, 804.8ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)


[[   0.030424  2.0513e-13  4.9595e-07    0.081139     0.88844  5.2758e-07]]
4
[[   0.030424  2.0513e-13  4.9595e-07    0.081139     0.88844  5.2758e-07]]
4
[[   0.030424  2.0513e-13  4.9595e-07    0.081139     0.88844  5.2758e-07]]
4





[[   0.030424  2.0513e-13  4.9595e-07    0.081139     0.88844  5.2758e-07]]
4


0: 352x640 1 person, 820.6ms
Speed: 1.0ms preprocess, 820.6ms inference, 5.6ms postprocess per image at shape (1, 3, 640, 640)


[[    0.46996   1.359e-13  1.0271e-06     0.38654      0.1435  9.4892e-07]]
0
[[    0.46996   1.359e-13  1.0271e-06     0.38654      0.1435  9.4892e-07]]
0
[[    0.46996   1.359e-13  1.0271e-06     0.38654      0.1435  9.4892e-07]]
0





[[    0.46996   1.359e-13  1.0271e-06     0.38654      0.1435  9.4892e-07]]
0


0: 352x640 1 person, 840.4ms
Speed: 1.0ms preprocess, 840.4ms inference, 4.0ms postprocess per image at shape (1, 3, 640, 640)


[[    0.76241  9.1643e-14  2.2197e-07     0.18232    0.055268  3.8528e-07]]
0
[[    0.76241  9.1643e-14  2.2197e-07     0.18232    0.055268  3.8528e-07]]
0
[[    0.76241  9.1643e-14  2.2197e-07     0.18232    0.055268  3.8528e-07]]
0





[[    0.76241  9.1643e-14  2.2197e-07     0.18232    0.055268  3.8528e-07]]
0


0: 352x640 1 person, 1 toilet, 786.7ms
Speed: 1.0ms preprocess, 786.7ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)


[[    0.91145  1.1973e-13  1.6256e-07    0.087342    0.001208  1.3158e-06]]
0
[[    0.91145  1.1973e-13  1.6256e-07    0.087342    0.001208  1.3158e-06]]
0
[[    0.91145  1.1973e-13  1.6256e-07    0.087342    0.001208  1.3158e-06]]
0





[[    0.91145  1.1973e-13  1.6256e-07    0.087342    0.001208  1.3158e-06]]
0


0: 352x640 1 person, 767.3ms
Speed: 1.0ms preprocess, 767.3ms inference, 4.0ms postprocess per image at shape (1, 3, 640, 640)


[[    0.78453  9.1697e-14  4.3079e-07     0.21397   0.0014967  6.2597e-06]]
0
[[    0.78453  9.1697e-14  4.3079e-07     0.21397   0.0014967  6.2597e-06]]
0
[[    0.78453  9.1697e-14  4.3079e-07     0.21397   0.0014967  6.2597e-06]]
0





[[    0.78453  9.1697e-14  4.3079e-07     0.21397   0.0014967  6.2597e-06]]
0


0: 352x640 1 person, 851.8ms
Speed: 1.0ms preprocess, 851.8ms inference, 5.0ms postprocess per image at shape (1, 3, 640, 640)


[[    0.68582  3.2348e-13  6.9431e-07     0.31281   0.0013553  1.1864e-05]]
0
[[    0.68582  3.2348e-13  6.9431e-07     0.31281   0.0013553  1.1864e-05]]
0
[[    0.68582  3.2348e-13  6.9431e-07     0.31281   0.0013553  1.1864e-05]]
0





[[    0.68582  3.2348e-13  6.9431e-07     0.31281   0.0013553  1.1864e-05]]
0


0: 352x640 1 person, 822.1ms
Speed: 2.0ms preprocess, 822.1ms inference, 5.0ms postprocess per image at shape (1, 3, 640, 640)


[[    0.88948  1.0668e-12  1.3243e-07     0.10767   0.0028491  4.5915e-06]]
0
[[    0.88948  1.0668e-12  1.3243e-07     0.10767   0.0028491  4.5915e-06]]
0
[[    0.88948  1.0668e-12  1.3243e-07     0.10767   0.0028491  4.5915e-06]]
0





[[    0.88948  1.0668e-12  1.3243e-07     0.10767   0.0028491  4.5915e-06]]
0


0: 352x640 1 person, 720.6ms
Speed: 1.0ms preprocess, 720.6ms inference, 4.0ms postprocess per image at shape (1, 3, 640, 640)


[[    0.94486  5.9661e-13  2.8035e-08    0.054316  0.00081673  4.6528e-06]]
0
[[    0.94486  5.9661e-13  2.8035e-08    0.054316  0.00081673  4.6528e-06]]
0
[[    0.94486  5.9661e-13  2.8035e-08    0.054316  0.00081673  4.6528e-06]]
0





[[    0.94486  5.9661e-13  2.8035e-08    0.054316  0.00081673  4.6528e-06]]
0


0: 352x640 1 person, 1 toilet, 752.6ms
Speed: 1.0ms preprocess, 752.6ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)


[[    0.92092  5.1717e-13  1.8239e-07    0.076437   0.0026134  2.7625e-05]]
0
[[    0.92092  5.1717e-13  1.8239e-07    0.076437   0.0026134  2.7625e-05]]
0
[[    0.92092  5.1717e-13  1.8239e-07    0.076437   0.0026134  2.7625e-05]]
0





[[    0.92092  5.1717e-13  1.8239e-07    0.076437   0.0026134  2.7625e-05]]
0


0: 352x640 1 person, 1 toilet, 808.4ms
Speed: 1.0ms preprocess, 808.4ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)


[[    0.93326  7.9525e-13  1.0069e-07    0.065926  0.00079135  1.9198e-05]]
0
[[    0.93326  7.9525e-13  1.0069e-07    0.065926  0.00079135  1.9198e-05]]
0
[[    0.93326  7.9525e-13  1.0069e-07    0.065926  0.00079135  1.9198e-05]]
0





[[    0.93326  7.9525e-13  1.0069e-07    0.065926  0.00079135  1.9198e-05]]
0


0: 352x640 1 person, 1 toilet, 790.8ms
Speed: 1.5ms preprocess, 790.8ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)


[[    0.95674  5.4993e-13  1.1077e-07    0.042468  0.00078369  1.0959e-05]]
0
[[    0.95674  5.4993e-13  1.1077e-07    0.042468  0.00078369  1.0959e-05]]
0
[[    0.95674  5.4993e-13  1.1077e-07    0.042468  0.00078369  1.0959e-05]]
0
[[    0.95674  5.4993e-13  1.1077e-07    0.042468  0.00078369  1.0959e-05]]
0



0: 352x640 1 person, 784.4ms
Speed: 1.0ms preprocess, 784.4ms inference, 6.0ms postprocess per image at shape (1, 3, 640, 640)


[[    0.89454  1.0255e-12  3.7735e-07     0.10419   0.0012664  3.4893e-06]]
0
[[    0.89454  1.0255e-12  3.7735e-07     0.10419   0.0012664  3.4893e-06]]
0
[[    0.89454  1.0255e-12  3.7735e-07     0.10419   0.0012664  3.4893e-06]]
0





[[    0.89454  1.0255e-12  3.7735e-07     0.10419   0.0012664  3.4893e-06]]
0


0: 352x640 1 person, 768.3ms
Speed: 1.0ms preprocess, 768.3ms inference, 5.0ms postprocess per image at shape (1, 3, 640, 640)


[[    0.91972  1.2265e-12  1.0586e-06    0.078483   0.0017994  1.8638e-06]]
0
[[    0.91972  1.2265e-12  1.0586e-06    0.078483   0.0017994  1.8638e-06]]
0
[[    0.91972  1.2265e-12  1.0586e-06    0.078483   0.0017994  1.8638e-06]]
0
[[    0.91972  1.2265e-12  1.0586e-06    0.078483   0.0017994  1.8638e-06]]
0



0: 352x640 1 person, 751.7ms
Speed: 0.0ms preprocess, 751.7ms inference, 4.3ms postprocess per image at shape (1, 3, 640, 640)


[[    0.93022  1.0738e-11  2.3277e-06     0.04965    0.020124  1.8448e-06]]
0
[[    0.93022  1.0738e-11  2.3277e-06     0.04965    0.020124  1.8448e-06]]
0
[[    0.93022  1.0738e-11  2.3277e-06     0.04965    0.020124  1.8448e-06]]
0
[[    0.93022  1.0738e-11  2.3277e-06     0.04965    0.020124  1.8448e-06]]
0



0: 352x640 1 person, 746.4ms
Speed: 1.0ms preprocess, 746.4ms inference, 5.0ms postprocess per image at shape (1, 3, 640, 640)


[[    0.90628  5.8597e-12  5.3913e-07    0.081508    0.012214  4.4261e-07]]
0
[[    0.90628  5.8597e-12  5.3913e-07    0.081508    0.012214  4.4261e-07]]
0
[[    0.90628  5.8597e-12  5.3913e-07    0.081508    0.012214  4.4261e-07]]
0
[[    0.90628  5.8597e-12  5.3913e-07    0.081508    0.012214  4.4261e-07]]
0



0: 352x640 1 person, 787.4ms
Speed: 2.0ms preprocess, 787.4ms inference, 4.0ms postprocess per image at shape (1, 3, 640, 640)


[[    0.87683  1.1762e-11  4.8893e-07    0.081945    0.041225  2.8625e-07]]
0
[[    0.87683  1.1762e-11  4.8893e-07    0.081945    0.041225  2.8625e-07]]
0
[[    0.87683  1.1762e-11  4.8893e-07    0.081945    0.041225  2.8625e-07]]
0
[[    0.87683  1.1762e-11  4.8893e-07    0.081945    0.041225  2.8625e-07]]
0



0: 352x640 1 person, 748.5ms
Speed: 1.0ms preprocess, 748.5ms inference, 4.0ms postprocess per image at shape (1, 3, 640, 640)


[[    0.86127   1.582e-11  5.1615e-07    0.066912    0.071822  2.1253e-07]]
0
