In [8]:
import random
import time
import numpy as np
import matplotlib.pyplot as plt
import cv2
import glob
from Levenshtein import distance
import easyocr
import pickle
from pytube import YouTube
import json
import moviepy.editor as mp
import os
import face_recognition

reader = easyocr.Reader(['en'])

CUDA not available - defaulting to CPU. Note: This module is much faster with a GPU.


In [2]:
def download_by_url(vid_url,video_save_path="video",audio_save_path="audio"):
    vid = YouTube(vid_url)
    # download video only
    vid_stream = vid.streams.filter(file_extension = "mp4").get_highest_resolution()
    print('downloading video..')
    vid_path = vid_stream.download(video_save_path)
    vid_json = {}
    vid_json["url"] = vid_url
    vid_json["res"] = vid_stream.resolution
    vid_json["mimetype"] = vid_stream.mime_type
    vid_json["fps"] = vid_stream.fps
    vid_json["filepath"] = vid_path
    jsonfile = video_save_path+"/"+vid_stream.default_filename[0:-3]+"json"
    with open(jsonfile, 'w') as f:
        f.write(json.dumps(vid_json))
    print(vid_path, "downloaded")
 
    return vid_path

In [3]:
def video_to_audio(video_path, output_path):
    clip = mp.VideoFileClip("{}".format(video_path))
    filename = os.path.basename(video_path)[:-4] + '.wav'
    print("processing audio {}".format(filename))
    output_path = os.path.join(output_path, filename)
    clip.audio.write_audiofile("{}".format(output_path))
    print('{} is processed'.format(output_path))

In [4]:
def analyze_video(video, start_frame=0, end_frame=None, sampling_rate=1):
    """Find chicken dinner in video"""
    w, h, fps, num_frames = get_detail(video)
    print(w, h, fps, num_frames)
    start_frame = start_frame
    end_frame = end_frame if end_frame else num_frames
    step = int(round(sampling_rate * fps))
    start = time.time()
    cap = cv2.VideoCapture(video)
    i = start_frame
    output = []
    all_res = []
    while cap.isOpened() and i < end_frame:
        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
#         print(i)
        ret, frame = cap.read()
#         print(frame)
#         try:
        temp_res = run_inference(frame)
#         except:
#             return frame
        all_res.append(temp_res)
        i += step
 
    cap.release()
    end = time.time()
 
    return {
        'start_frame' : start_frame,
        'end_frame' : end_frame,
        'result_feature': all_res,
        'step':step,
        'fps':fps
    }
 
def get_detail(vid_file):
    cap = cv2.VideoCapture(vid_file)
 
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    width, height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
 
    cap.release()
 
    return width, height, fps, total_frames
 
def run_inference(frame):
    all_text_here = reader.readtext(frame) #OCR
    all_faces_locations = face_recognition.face_locations(frame)
    all_faces_features_here = face_recognition.face_encodings(frame)#FACE
    
    return {
        'text':all_text_here,
        'faces_location':all_faces_locations,
        'faces_features':all_faces_features_here
    }

In [5]:
video_path = download_by_url('https://www.youtube.com/watch?v=VxTS_y48p8Q')

downloading video..
D:\facerecog-proj\extract_metadata\video\NO DONT DO THAT.mp4 downloaded


In [6]:
video_to_audio(video_path, 'audio')

                                                                                                                       

processing audio NO DONT DO THAT.wav
MoviePy - Writing audio in audio\NO DONT DO THAT.wav
MoviePy - Done.
audio\NO DONT DO THAT.wav is processed




In [9]:
result = analyze_video(video_path)

720 720 30.0 75


In [10]:
result

{'start_frame': 0,
 'end_frame': 75,
 'result_feature': [{'text': [([[533, 655],
      [691, 655],
      [691, 691],
      [533, 691]],
     'kmlkmljkl',
     0.9998670374858656)],
   'faces_location': [(206, 419, 527, 98)],
   'faces_features': [array([-0.24921907,  0.05025495,  0.10706532, -0.09727328, -0.15834039,
           -0.07409187, -0.04551383, -0.16956849,  0.23184876, -0.12725341,
            0.1697679 , -0.07844635, -0.23184238,  0.01537246,  0.01732729,
            0.18358818, -0.11586878, -0.18069436, -0.08061923, -0.08368039,
            0.01600413,  0.04038862,  0.04016113,  0.10593469, -0.14617459,
           -0.36535364, -0.12415865, -0.02861067,  0.02669815, -0.00697815,
           -0.01903916,  0.01165271, -0.26539043, -0.06153582,  0.02698839,
            0.17649031, -0.09658679, -0.16987276,  0.18428867, -0.01941098,
           -0.25854144, -0.01464555,  0.12439124,  0.1993975 ,  0.22923203,
            0.01113797, -0.01318528, -0.07733284,  0.12812124, -0.2779496

In [9]:
result.keys()

dict_keys(['start_frame', 'end_frame', 'result_feature', 'step', 'fps'])

In [13]:
def distance_text(a,b):
    if max(len(a),len(b))==0:
        return 1
    return distance(a,b)/max(len(a),len(b))
def reshape_text(data1, data2, idx):
    for word in data2:
        used_word = word[1].lower()
        found = False
        for key in data1.keys():
            dist = distance_text(key,used_word)
            if dist<0.1:
                
                if data1[key][-1]['end']+1 == idx:
                    data1[key][-1]['end']+=1
                else:
                    data1[key].append({'start':idx, 'end':idx})
                found = True
                break
        if not found:
#             print('word : ',word[1].lower())
#             print(data1)
            data1[used_word] = [{'start':idx, 'end':idx}]
    return data1
    
def aggregate_text(all_text):
    aggregated_text = {}
    for idx,txt in enumerate(all_text):
        aggregated_text = reshape_text(aggregated_text,txt,idx)
    return aggregated_text

In [11]:
for key in all_meta.keys():
    print(key)
    res_text = aggregate_text([i['text'] for i in result[key]['result_feature']])
    result[key]['agg_text'] = res_text

NameError: name 'all_meta' is not defined

In [11]:
result

{'start_frame': 0,
 'end_frame': 75,
 'result_feature': [{'text': [([[533, 655],
      [691, 655],
      [691, 691],
      [533, 691]],
     'kmlkmljkl',
     0.9998670374858656)],
   'faces_location': [(206, 419, 527, 98)],
   'faces_features': [array([-0.24921907,  0.05025495,  0.10706532, -0.09727328, -0.15834039,
           -0.07409187, -0.04551383, -0.16956849,  0.23184876, -0.12725341,
            0.1697679 , -0.07844635, -0.23184238,  0.01537246,  0.01732729,
            0.18358818, -0.11586878, -0.18069436, -0.08061923, -0.08368039,
            0.01600413,  0.04038862,  0.04016113,  0.10593469, -0.14617459,
           -0.36535364, -0.12415865, -0.02861067,  0.02669815, -0.00697815,
           -0.01903916,  0.01165271, -0.26539043, -0.06153582,  0.02698839,
            0.17649031, -0.09658679, -0.16987276,  0.18428867, -0.01941098,
           -0.25854144, -0.01464555,  0.12439124,  0.1993975 ,  0.22923203,
            0.01113797, -0.01318528, -0.07733284,  0.12812124, -0.2779496

In [14]:
res_text = aggregate_text([i['text'] for i in result['result_feature']])

In [15]:
result['agg_text'] = res_text

In [21]:
result['result_feature']

[{'text': [([[533, 655], [691, 655], [691, 691], [533, 691]],
    'kmlkmljkl',
    0.9998670374858656)],
  'faces_location': [(206, 419, 527, 98)],
  'faces_features': [array([-0.24921907,  0.05025495,  0.10706532, -0.09727328, -0.15834039,
          -0.07409187, -0.04551383, -0.16956849,  0.23184876, -0.12725341,
           0.1697679 , -0.07844635, -0.23184238,  0.01537246,  0.01732729,
           0.18358818, -0.11586878, -0.18069436, -0.08061923, -0.08368039,
           0.01600413,  0.04038862,  0.04016113,  0.10593469, -0.14617459,
          -0.36535364, -0.12415865, -0.02861067,  0.02669815, -0.00697815,
          -0.01903916,  0.01165271, -0.26539043, -0.06153582,  0.02698839,
           0.17649031, -0.09658679, -0.16987276,  0.18428867, -0.01941098,
          -0.25854144, -0.01464555,  0.12439124,  0.1993975 ,  0.22923203,
           0.01113797, -0.01318528, -0.07733284,  0.12812124, -0.27794963,
           0.03525332,  0.17514142,  0.02196804,  0.01833277,  0.17695688,
         

In [22]:
result

{'start_frame': 0,
 'end_frame': 75,
 'result_feature': [{'text': [([[533, 655],
      [691, 655],
      [691, 691],
      [533, 691]],
     'kmlkmljkl',
     0.9998670374858656)],
   'faces_location': [(206, 419, 527, 98)],
   'faces_features': [array([-0.24921907,  0.05025495,  0.10706532, -0.09727328, -0.15834039,
           -0.07409187, -0.04551383, -0.16956849,  0.23184876, -0.12725341,
            0.1697679 , -0.07844635, -0.23184238,  0.01537246,  0.01732729,
            0.18358818, -0.11586878, -0.18069436, -0.08061923, -0.08368039,
            0.01600413,  0.04038862,  0.04016113,  0.10593469, -0.14617459,
           -0.36535364, -0.12415865, -0.02861067,  0.02669815, -0.00697815,
           -0.01903916,  0.01165271, -0.26539043, -0.06153582,  0.02698839,
            0.17649031, -0.09658679, -0.16987276,  0.18428867, -0.01941098,
           -0.25854144, -0.01464555,  0.12439124,  0.1993975 ,  0.22923203,
            0.01113797, -0.01318528, -0.07733284,  0.12812124, -0.2779496