In [1]:
## One Time Labeler
# 1 - Generates an index with information about reference series.
## 1.1 - Selects the video for analysis.
# 2 - Analyzes the graphs of the selected video series
# 3 - Loads frames from the selected video for further image analysis 
## 3.1 - Selects the range of frames to be loaded, or loads all video frames (maximum of 1000 frames).
## 3.2 - Displays frames for image analysis - enter values for the range of frames to be shown.
# 4 - Reads or creates the VD_LABELED file if it does not exist.
# 5 - Adds information to label the frames.
# 6 - Functions to verify the added labels.
## 6.1 - Gets all saved label classes.
## 6.2 - Gets the frames marked with a selected class.
## 6.3 - Gets reference measurements for each class.
## 6.4 - Plots a graph marking the start and end of the labels for a class.
# 7 - Saves the VD_LABELED file to disk.

In [44]:
import time
import os
import numpy as np
import sys
import pandas as pd
import glob
import matplotlib.pyplot as plt
import ast
import cv2

from dash import Dash, dcc, html, Input, Output, ctx
import plotly.express as px
import plotly.offline as py
import plotly.graph_objs as go
import webbrowser
from threading import Timer
import json

# Initialise Plotly's offline notebook mode, then set the default Matplotlib
# font size and family used by the figures drawn in this notebook.
py.init_notebook_mode(connected=True)
plt.rcParams.update({'font.size': 10, 'font.family': "Times New Roman"})

In [45]:
# Scan Folder and Find
def list_scan_from_path (baseDir, file_name):
    """Recursively list the ``*.CSV`` files under ``baseDir`` whose path contains ``file_name``.

    Parameters
    ----------
    baseDir : str
        Root folder of the recursive scan.
    file_name : str
        Substring looked for in the full path of each CSV file found.

    Returns
    -------
    list of str
        Matching paths, sorted by name.
    """
    # Same pattern as baseDir + sep + '**' + sep + '*.CSV'
    search_pattern = os.sep.join([baseDir, '**', '*.CSV'])
    # Order by name so the result is stable between runs
    csv_paths = sorted(glob.iglob(search_pattern, recursive=True))
    # Keep only the paths that mention the requested name
    return [csv_path for csv_path in csv_paths if file_name in csv_path]
#

In [46]:
# Collect the base
def collect_basex (original_path):
    """Return ``original_path`` without its last component, keeping a trailing separator.

    The path is split on ``os.sep``; every component except the last one is
    re-emitted followed by ``os.sep``. A path containing no separator yields ''.
    """
    leading_parts = original_path.split(os.sep)[:-1]
    return ''.join(part + os.sep for part in leading_parts)
#<chg

In [47]:
# Function to plot a time series
def plot_time_series(time, values, label):
    """Draw ``values`` against ``time`` on a new 10x6 figure titled ``label``.

    The axes are labelled "Time" and "Value" and the grid is switched on.
    The figure is created but neither shown nor returned.
    """
    text_size = 20
    plt.figure(figsize=(10,6))
    plt.plot(time, values)
    # Axis labels and title share the same font size
    for set_text, text in ((plt.xlabel, "Time"), (plt.ylabel, "Value"), (plt.title, label)):
        set_text(text, fontsize=text_size)
    plt.grid(True)

In [48]:
def READ_CSV_FILE(base_path, file_name=None):
    """Read a CSV file into a DataFrame, discarding a saved index column if present.

    Parameters
    ----------
    base_path : str
        Folder containing the file, or the full file path when ``file_name``
        is omitted.
    file_name : str, optional
        File name joined onto ``base_path``.

    Returns
    -------
    pandas.DataFrame
        The file contents without the "Unnamed: 0" column.

    Raises
    ------
    Whatever ``pandas.read_csv`` raises for a missing or unreadable file.
    """
    file_path = os.path.join(base_path, file_name) if file_name else base_path
    # Read the Dataframe from CSV
    current_dt = pd.read_csv(file_path)
    # Remove the unnamed index column; errors="ignore" tolerates only its
    # absence instead of silencing every exception like a bare except would.
    return current_dt.drop(columns=["Unnamed: 0"], errors="ignore")

In [49]:
def CREATE_LABELED_INDEX(FILE_LOCATION_TREE_DLOCAL, VD_INFO_FILE_NAME = 'VD_INFO.CSV', VD_LABEL_FILE_NAME = 'VD_LABELED_L0.CSV'):
    """Build an index of videos, one row per VD_INFO entry, keyed by ``video_id``.

    Parameters
    ----------
    FILE_LOCATION_TREE_DLOCAL : iterable of str
        Paths of files located inside each video folder; the folder of each
        path is obtained with ``collect_basex``.
    VD_INFO_FILE_NAME : str
        Name of the info CSV read from each folder.
    VD_LABEL_FILE_NAME : str
        Name of the label file whose presence is recorded.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``video_id`` with columns ``link_video``, ``duration_vid``,
        ``total_frames``, ``label_file_exist`` (1 when the label file exists
        in the folder, else 0) and ``path`` (the video folder). An empty list
        of paths yields an empty frame with the same columns.

    Raises
    ------
    KeyError
        If an info file lacks one of the selected columns.
    """
    info_columns = ['video_id', 'link_video', 'duration_vid', 'total_frames']
    per_video = []

    for measure_path in FILE_LOCATION_TREE_DLOCAL:
        current_path = collect_basex(measure_path)
        path_vd_info = os.path.join(current_path, VD_INFO_FILE_NAME)
        # Read the Dataframe from CSV; selecting the columns by name already
        # leaves out any "Unnamed: 0" index column, whether or not it exists.
        current_vd_info = pd.read_csv(path_vd_info)
        SELECT_DT = current_vd_info[info_columns].copy()
        path_vd_label = os.path.join(current_path, VD_LABEL_FILE_NAME)
        SELECT_DT['label_file_exist'] = int(os.path.exists(path_vd_label))
        SELECT_DT['path'] = current_path
        per_video.append(SELECT_DT)

    if not per_video:
        # Nothing scanned: return an empty index with the usual layout
        empty_index = pd.DataFrame(columns=info_columns[1:] + ['label_file_exist', 'path'])
        empty_index.index.name = 'video_id'
        return empty_index

    # Concatenate once instead of growing the frame inside the loop
    return pd.concat(per_video, ignore_index=True).set_index('video_id')

In [50]:
# Helper function to organize the marked frames in a list into sublists containing continuous frames.
def separate_intervals(lst):
    """Group frame numbers into inclusive ``(start, end)`` runs of consecutive values.

    Parameters
    ----------
    lst : iterable of numbers
        Frame numbers in any order; repeated values are allowed.

    Returns
    -------
    list of tuple
        One ``(start, end)`` pair per run, in ascending order. An isolated
        frame ``n`` yields ``(n, n)``; empty input yields ``[]``.
    """
    # Sort the list to ensure numbers are in ascending order
    sorted_list = sorted(lst)
    intervals = []

    # No frames means no intervals (indexing [0] below would fail)
    if not sorted_list:
        return intervals

    start_interval = end_interval = sorted_list[0]

    for num in sorted_list[1:]:
        if num == end_interval:
            # Repeated frame number: already covered by the current interval
            continue
        if num == end_interval + 1:
            # Consecutive frame: extend the current interval
            end_interval = num
        else:
            # Gap found: close the current interval and start a new one
            intervals.append((start_interval, end_interval))
            start_interval = end_interval = num

    # Add the last interval to the intervals list
    intervals.append((start_interval, end_interval))

    return intervals

In [51]:
# Function to plot a graph with markers for the classes
def PLOT_CLASS_GRAPH(VD_LABELED_DT, class_in, start_frame=None, end_frame=None, measure_dt=None, save_path='graph.png'):
    """Plot the reference measurement(s) of a class and shade the frame intervals labeled with it.

    Parameters
    ----------
    VD_LABELED_DT :
        Labeled data, passed through to ``GET_MEASURES_FROM_CLASS`` and
        ``GET_FRAMES_FROM_CLASS``.
    class_in :
        Label class to display; also used in the legend entries.
    start_frame, end_frame : int, optional
        When both are given, the plotted data is sliced with
        ``[start_frame:end_frame+1]``.
    measure_dt : pandas.DataFrame, optional
        Measurement table to plot from. Defaults to the notebook-level
        ``VD_MEASURE_DT_V2`` so existing calls keep working.
    save_path : str or None, optional
        Where the figure is written (default 'graph.png'); ``None`` skips saving.
    """
    fonte = {'family': "Times New Roman", 'color': 'black', 'weight': 'bold', 'size': 10}

    # Fall back to the notebook global only when no table is passed explicitly
    if measure_dt is None:
        measure_dt = VD_MEASURE_DT_V2

    get_measur = GET_MEASURES_FROM_CLASS (VD_LABELED_DT, class_in)
    frames_f_class = list(GET_FRAMES_FROM_CLASS(VD_LABELED_DT, class_in))
    # A class without labeled frames simply has no interval to shade
    labeled_intervals = separate_intervals(frames_f_class) if frames_f_class else []
    PLOT_DT = measure_dt[get_measur].copy()

    if start_frame is not None and end_frame is not None:
        PLOT_DT = PLOT_DT[start_frame:end_frame+1]

    # Plot graph
    fig, ax = plt.subplots(figsize=(9, 3))
    ax.plot(PLOT_DT.index, PLOT_DT, label=get_measur)

    for interval in labeled_intervals:
        # x is in data units (frames); y spans the full axes height via the x-axis transform
        ax.fill_between(interval, 0, 1, alpha=0.2, transform=ax.get_xaxis_transform(), label=f'{class_in}: {interval}')

    ax.set_ylim(bottom=0)
    ax.set_xlim(left=0)
    # Legend is anchored outside the axes, to the right
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    ax.set_ylabel('Amplitude (pixel)', fontdict=fonte)
    ax.set_xlabel('Frame number', fontdict=fonte)
    ax.grid(True)
    fig.tight_layout()
    if save_path is not None:
        fig.savefig(save_path)
    plt.show()

In [52]:
# --------------------------------------------------------------------

In [53]:
# Init process
# Variables

# LOCAL DIR: dataset folder, addressed relative to the parent of the working directory
tree_DIR_LD = os.sep.join(['Dataset', 'YT-Online'])
# The trailing empty component makes the base directory end with a separator
baseDir_LD = os.sep.join(['..', tree_DIR_LD, ''])

# Tree List: starts empty
FILE_LOCATION_TREE_DLOCAL = list()

In [54]:
# Name (substring of the path) of the CSV files to look for
file_name_fd = 'VD_SUBTITLES.CSV'
# Scan the dataset folder recursively and keep every CSV path containing that name
FILE_LOCATION_TREE_DLOCAL = list_scan_from_path (baseDir_LD, file_name_fd)

In [55]:
# Display the scanned paths to check which video folders were found
FILE_LOCATION_TREE_DLOCAL

['..\\Dataset\\YT-Online\\VD_Y_0000000001\\VD_SUBTITLES.CSV',
 '..\\Dataset\\YT-Online\\VD_Y_0000000002\\VD_SUBTITLES.CSV',
 '..\\Dataset\\YT-Online\\VD_Y_0000000003\\VD_SUBTITLES.CSV',
 '..\\Dataset\\YT-Online\\VD_Y_0000000004\\VD_SUBTITLES.CSV',
 '..\\Dataset\\YT-Online\\VD_Y_0000000005\\VD_SUBTITLES.CSV',
 '..\\Dataset\\YT-Online\\VD_Y_0000000006\\VD_SUBTITLES.CSV',
 '..\\Dataset\\YT-Online\\VD_Y_0000000007\\VD_SUBTITLES.CSV',
 '..\\Dataset\\YT-Online\\VD_Y_0000000008\\VD_SUBTITLES.CSV',
 '..\\Dataset\\YT-Online\\VD_Y_0000000009\\VD_SUBTITLES.CSV',
 '..\\Dataset\\YT-Online\\VD_Y_0000000011\\VD_SUBTITLES.CSV',
 '..\\Dataset\\YT-Online\\VD_Y_0000000012\\VD_SUBTITLES.CSV',
 '..\\Dataset\\YT-Online\\VD_Y_0000000013\\VD_SUBTITLES.CSV',
 '..\\Dataset\\YT-Online\\VD_Y_0000000014\\VD_SUBTITLES.CSV',
 '..\\Dataset\\YT-Online\\VD_Y_0000000015\\VD_SUBTITLES.CSV',
 '..\\Dataset\\YT-Online\\VD_Y_0000000016\\VD_SUBTITLES.CSV',
 '..\\Dataset\\YT-Online\\VD_Y_0000000017\\VD_SUBTITLES.CSV',
 '..\\Da

In [57]:
word = 'HAVE'
# One-row frame (single column named 0) appended after each video's matches;
# presumably a separator between videos in the exported CSV -- confirm it is still wanted.
aux = pd.DataFrame(data=[0])

# Check whether the searched word occurs in a subtitle text (plain substring match)
def search_word(texto):
    # Non-string cells (e.g. NaN read from an empty field) cannot contain the word
    return isinstance(texto, str) and word in texto

# Collected pieces, concatenated once after the loop instead of growing a frame per video
matches_per_video = []
for current_path in FILE_LOCATION_TREE_DLOCAL:
    SUB_DT = READ_CSV_FILE(current_path)
    SUB_DT['finded_word'] = SUB_DT['text'].apply(search_word).astype(bool)
    result = SUB_DT.loc[SUB_DT['finded_word']]
    if len(result) > 0:
        print(current_path)
        print(result.iloc[:,0:3])
        # .copy() so the 'path' column is added to an independent frame, not to a slice
        result_select = result.iloc[:,0:3].copy()
        result_select['path'] = current_path
        matches_per_video.extend([result_select, aux])

# pd.concat rejects an empty list, so fall back to an empty frame when nothing matched
result_final = pd.concat(matches_per_video) if matches_per_video else pd.DataFrame()

result_final.to_csv(f'{word}.csv', index=False)

..\Dataset\YT-Online\VD_Y_0000000002\VD_SUBTITLES.CSV
                                       text    start  duration
17  FREE VACCINES HAVE BEEN AVAILABLE IN 80   41.120     5.279
19          WE STILL HAVE NEARLY 80 MILLION   46.399     3.680
20     AMERICANS WHO HAVE FAILED TO GET THE   48.160     3.440
46     LARGE MAJORITY OF AMERICANS WHO HAVE  106.560     3.839
..\Dataset\YT-Online\VD_Y_0000000004\VD_SUBTITLES.CSV
                                       text  start  duration
4  HAVE DEVELOPED A FORMULA TO HELP YOU ACE  11.37      6.09
..\Dataset\YT-Online\VD_Y_0000000005\VD_SUBTITLES.CSV
                                      text  start  duration
21  YOU HAVE WITH CLIENT WHERE YOU HAVE TO  43.68      4.23
..\Dataset\YT-Online\VD_Y_0000000006\VD_SUBTITLES.CSV
                                         text    start  duration
64      IF YOU HAVEN'T DRESSED PROFESSIONALLY  114.560     4.080
76            INTERVIEW YOU HAVE GOOD POSTURE  135.840     5.440
103                     HAVE THA

In [43]:
# Display the accumulated search matches
result_final

In [16]:
#
## 1.1 - Selects the video for analysis and CSV data file
#
# Type the video id to label
VIDEO_ID = 1
#
# Read CSV data file
VD_MEASURE_FILE_NAME = 'VD_SUBTITLES.CSV'
# Resolve the file through the video's folder name instead of its list position:
# the scanned folders are not contiguous (VD_Y_0000000010 is absent from the listing),
# so FILE_LOCATION_TREE_DLOCAL[VIDEO_ID-1] loads a different video for later ids.
# NOTE(review): assumes folders are named VD_Y_<10-digit id>, as in the scan output -- confirm.
VIDEO_FOLDER_NAME = f'VD_Y_{VIDEO_ID:010d}'
VIDEO_FILE_MATCHES = [
    path for path in FILE_LOCATION_TREE_DLOCAL
    if VIDEO_FOLDER_NAME in os.path.normpath(path).split(os.sep)
]
if not VIDEO_FILE_MATCHES:
    raise FileNotFoundError(f'No scanned file found inside folder {VIDEO_FOLDER_NAME}')
VD_MEASURE_DT = READ_CSV_FILE(VIDEO_FILE_MATCHES[0])

In [17]:
#VD_MEASURE_DT_V2.describe()

In [18]:
# <end