In [3]:
import cv2
import numpy as np
import face_detection
from collections import defaultdict
import os, python_speech_features
import scipy.io.wavfile as wav
import random
import pandas as pd
import pickle
detector = face_detection.build_detector(
  "DSFDDetector", confidence_threshold=.2, nms_iou_threshold=.2)
pathVideo = "C:/Users/jmmol/Desktop/LIP-RTVE/MP4s"
pathMFCC = "C:/Users/jmmol/Desktop/COSAS V7/TFM/mfccs"
pathAudio = "C:/Users/jmmol/Desktop/LIP-RTVE/WAVs"
pathFaces = "C:/Users/jmmol/Desktop/COSAS V7/TFM/npz"



In [11]:
def createFolders(path):
    """
    Crea una carpeta por cada hablante y dentro una por cada vídeo de ese hablante
    También crea un diccionario que incluye los hablantes y cada uno de los nombres de sus vídeos y lo devuelve
    Ej: speaker000/001
    """
    speakerDict = defaultdict(list)
    for (root,dirs,files) in os.walk(path, topdown=True):
        #print (dirs)
        for speaker in dirs:
            os.makedirs("imgs/"+speaker, exist_ok=True)
            os.makedirs("mfccs/"+speaker, exist_ok=True)
            os.makedirs("npz/"+speaker, exist_ok=True)
        #print (files)
        for f in files:
            speaker, nmuestra = f.split("_")
            os.makedirs("imgs/"+speaker+"/"+nmuestra.split(".")[0], exist_ok=True)
            speakerDict[speaker].append(f[:-4]) #Quitamos .mp4
    print ('Directorios creados')
    return speakerDict

def extractBiggestFace(img):
    """
    Detecta todas las caras de una imagen y devuelve la más grande recortada y reescalada a 112x112
    """
    detections = detector.detect(img)
    idx_max = -1
    area_max = -1
    for i,cntr in enumerate(detections):
        xmin,ymin,xmax,ymax = int(cntr[0]),int(cntr[1]),int(cntr[2]),int(cntr[3]) #Guardamos bounding box
        area = (xmax-xmin)*(ymax-ymin)
        if area > area_max: #Comprobamos si la cara es la más grande
            idx_max = i
            area_max = area
            #print(area,idx_max)
        #cv2.rectangle(img, (xmin, ymin), (xmax, ymax), (0, 0, 255), 2)

    cntr = detections[idx_max]
    try:
        xmin,ymin,xmax,ymax = int(cntr[0]),int(cntr[1]),int(cntr[2]),int(cntr[3])
        return cv2.resize(img[max(ymin,0):ymax, xmin:xmax], (112, 112)) #Cara detectada, reescalamos
    except:
        print(cntr)
        cv2.imshow('image',img)
        cv2.waitKey(0)

def saveFaceCrops(videoPath):
    speaker, videoID = videoPath.split("/")[-2:]
    videoID = videoID.split("_")[1]
    print(speaker, videoID)
    vidcap = cv2.VideoCapture(videoPath)
    success,image = vidcap.read()
    count = 0
    facesCount = 0
    while success:
        cv2.imwrite("imgs/"+speaker+"/"+videoID+"/"+str(count)+'.jpg', extractBiggestFace(image)) #Detectamos y guardamos cara
        success,image = vidcap.read()
        count += 1
    return count+1 #Devuelve número de frames

def convertToNPZ(speakerDict, lengthsFilename):
    """
    Convierte caras recortadas en npz en su directorio y devuelve longitudes de video
    """
    videoLength = defaultdict(list)
    for speaker in speakerDict:
        for sample in speakerDict[speaker]:
            folder = "C:/Users/jmmol/Desktop/COSAS V7/TFM/imgs/"+speaker+"/"+sample.split("_")[1]
            images = []
            #numFrames = len(os.listdir(folder))
            for filename in sorted(os.listdir(folder),key=lambda x: int(x)):
                img = cv2.imread(os.path.join(folder,filename))
                if img is not None:
                    images.append(img)
            numFrames = len(images)
            videoLength["videos"].append(sample)
            videoLength["lengths"].append(numFrames)
            np.savez_compressed(r"C:/Users/jmmol/Desktop/COSAS V7/TFM/npz/"+speaker+"/"+sample,images=images)
    with open(lengthsFilename, 'wb') as f:
        pickle.dump(videoLength, f)

def extractMFCC(speakerDict,pathMFCC,pathAudio,pathNPZ):
    for speaker in speakerDict:
        for sample in speakerDict[speaker]:
            _,sig = wav.read(pathAudio+"/"+speaker+"/"+sample+".wav")
            videoRec = np.load(pathNPZ+"/"+speaker+"/"+sample+".npz")["images"]
            maxAudio = len(videoRec)*4 #Video a 25hz, audio a 100hz
            audio = python_speech_features.mfcc(sig, 16000, numcep = 13, winlen = 0.025, winstep = 0.010)
            if audio.shape[0] < maxAudio: #Si es un poco más corto hacemos padding
                shortage = maxAudio - audio.shape[0]
                audio = np.pad(audio, ((0, shortage), (0,0)), 'wrap')
            audio = audio[:maxAudio,:] #Se recorta
            np.savez_compressed(pathMFCC+"/"+speaker+"/"+sample,mfcc=audio)

In [10]:
speakerDict = createFolders(pathVideo)
#for speaker in sorted(list(speakerDict.keys())):
#    for video in speakerDict[speaker]:
#        saveFaceCrops(pathVideo+"/"+speaker+"/"+video)
convertToNPZ(speakerDict,"lengths.pickle")
#extractMFCC(speakerDict,pathMFCC,pathAudio,pathFaces)

Directorios creados


In [25]:
fotos = np.load(r"C:\Users\jmmol\Desktop\COSAS V7\TFM\npz\speaker334\speaker334_0000.npz")["images"]
audio = np.load(r"C:\Users\jmmol\Desktop\COSAS V7\TFM\mfccs\speaker334\speaker334_0000.npz")["mfcc"]
print(len(fotos),len(audio))
#cv2.imshow("test",fotos[-1])
#cv2.waitKey(0)

29 116


In [20]:
def createSamples(splitSpeakers,nSamples,videoLengths,pathFaces,pathMFCC):
    n = 0
    df = pd.DataFrame(columns=['video', 'audio', 'type', 'center', 'audiocenter'])
    sampleTypes = ["positive","partialMismatch","completeMismatch"]
    speakers = list(speakerDict.keys())
    while n < nSamples: #TIENE QUE SER DISTINTOS VIDEOS
        sType = random.choice(sampleTypes)
        speakerVideo = random.choice(speakers)
        videoSample = random.choice(speakerDict[speakerVideo])
        videoLength = videoLengths[videoSample]
        print(videoSample,videoLength)
        center = random.randint(0,videoLength)
        if sType == "positive":
            audioSample = videoSample #AUDIO SAMPLE
            audioCenter = random.randint(0,videoLength*4)
            pass
        if sType == "partialMismatch":
            speakerAudio = speakerVideo
            pass
        if sType == "completeMismatch":
            speakerAudio = random.choice([s for s in speakers if s != speakerVideo])
            pass

def splitToDict(speakerList,lengths):
    speakerDict = defaultdict(list)
    for speaker in speakerList:
        speakerDict[speaker[:-5]].append(speaker)
    return speakerDict

In [21]:
with open('lengths.pickle', 'rb') as f: #Diccionario con clave de videos y clave de longitudes, ambas listas se corresponden
    lengths = pickle.load(f)
lengths = dict(zip(lengths["videos"],lengths["lengths"]))
print(lengths)
trainDF = pd.read_csv(r"C:\Users\jmmol\Desktop\LIP-RTVE\SPLITS\speaker-independent\train.csv")
trainSpeakers = splitToDict(trainDF["sampleID"].values.tolist(),lengths)
createSamples(trainSpeakers,10,lengths,pathFaces,pathMFCC)

{'speaker000_0000': 183, 'speaker000_0001': 110, 'speaker000_0002': 168, 'speaker000_0003': 316, 'speaker000_0004': 109, 'speaker000_0005': 149, 'speaker000_0006': 379, 'speaker000_0007': 299, 'speaker000_0008': 101, 'speaker000_0009': 261, 'speaker000_0010': 219, 'speaker000_0011': 293, 'speaker000_0012': 256, 'speaker000_0013': 378, 'speaker000_0014': 182, 'speaker000_0015': 135, 'speaker000_0016': 220, 'speaker000_0017': 229, 'speaker000_0018': 282, 'speaker000_0019': 373, 'speaker000_0020': 256, 'speaker000_0021': 374, 'speaker000_0022': 293, 'speaker000_0023': 138, 'speaker000_0024': 116, 'speaker000_0025': 50, 'speaker000_0026': 207, 'speaker000_0027': 160, 'speaker000_0028': 152, 'speaker000_0029': 301, 'speaker000_0030': 197, 'speaker000_0031': 65, 'speaker000_0032': 171, 'speaker000_0033': 367, 'speaker000_0034': 276, 'speaker000_0035': 196, 'speaker000_0036': 328, 'speaker000_0037': 242, 'speaker000_0038': 254, 'speaker000_0039': 209, 'speaker000_0040': 261, 'speaker000_0041'

KeyboardInterrupt: 