In [9]:
import cv2
import numpy as np
import face_detection
from collections import defaultdict
import os, python_speech_features
import scipy.io.wavfile as wav
import random
import pandas as pd
detector = face_detection.build_detector(
  "DSFDDetector", confidence_threshold=.2, nms_iou_threshold=.2)
pathVideo = "C:/Users/jmmol/Desktop/LIP-RTVE/MP4s"
pathMFCC = "C:/Users/jmmol/Desktop/COSAS V7/TFM/mfccs"
pathAudio = "C:/Users/jmmol/Desktop/LIP-RTVE/WAVs"
pathFaces = "C:/Users/jmmol/Desktop/COSAS V7/TFM/npz"



In [7]:
def createFolders(path):
    """
    Crea una carpeta por cada hablante y dentro una por cada vídeo de ese hablante
    También crea un diccionario que incluye los hablantes y cada uno de los nombres de sus vídeos y lo devuelve
    Ej: speaker000/001
    """
    speakerDict = defaultdict(list)
    for (root,dirs,files) in os.walk(path, topdown=True):
        #print (dirs)
        for speaker in dirs:
            os.makedirs("imgs/"+speaker, exist_ok=True)
            os.makedirs("mfccs/"+speaker, exist_ok=True)
            os.makedirs("npz/"+speaker, exist_ok=True)
        #print (files)
        for f in files:
            speaker, nmuestra = f.split("_")
            os.makedirs("imgs/"+speaker+"/"+nmuestra.split(".")[0], exist_ok=True)
            speakerDict[speaker].append(f[:-4]) #Quitamos .mp4
    print ('Directorios creados')
    return speakerDict

def extractBiggestFace(img):
    """
    Detecta todas las caras de una imagen y devuelve la más grande recortada y reescalada a 112x112
    """
    detections = detector.detect(img)
    idx_max = -1
    area_max = -1
    for i,cntr in enumerate(detections):
        xmin,ymin,xmax,ymax = int(cntr[0]),int(cntr[1]),int(cntr[2]),int(cntr[3]) #Guardamos bounding box
        area = (xmax-xmin)*(ymax-ymin)
        if area > area_max: #Comprobamos si la cara es la más grande
            idx_max = i
            area_max = area
            #print(area,idx_max)
        #cv2.rectangle(img, (xmin, ymin), (xmax, ymax), (0, 0, 255), 2)

    cntr = detections[idx_max]
    try:
        xmin,ymin,xmax,ymax = int(cntr[0]),int(cntr[1]),int(cntr[2]),int(cntr[3])
        return cv2.resize(img[max(ymin,0):ymax, xmin:xmax], (112, 112)) #Cara detectada, reescalamos
    except:
        print(cntr)
        cv2.imshow('image',img)
        cv2.waitKey(0)

def saveFaceCrops(videoPath):
    
    speaker, videoID = videoPath.split("/")[-2:]
    videoID = videoID.split("_")[1]
    print(speaker, videoID)
    vidcap = cv2.VideoCapture(videoPath)
    success,image = vidcap.read()
    count = 0
    facesCount = 0
    while success:
        cv2.imwrite("imgs/"+speaker+"/"+videoID+"/"+str(count)+'.jpg', extractBiggestFace(image)) #Detectamos y guardamos cara
        success,image = vidcap.read()
        count += 1
    return count+1 #Devuelve número de frames

def convertToNPZ(speakerDict):
    """
    Convierte caras recortadas en npz en su directorio
    """
    for speaker in speakerDict:
        for sample in speakerDict[speaker]:
            folder = "C:/Users/jmmol/Desktop/COSAS V7/TFM/imgs/"+speaker+"/"+sample.split("_")[1]
            images = []
            for filename in sorted(os.listdir(folder),key=lambda x: int(x)):
                img = cv2.imread(os.path.join(folder,filename))
                if img is not None:
                    images.append(img)
            #return images
            np.savez_compressed(r"C:/Users/jmmol/Desktop/COSAS V7/TFM/npz/"+speaker+"/"+sample,images=images)
def extractMFCC(speakerDict,pathMFCC,pathAudio,pathNPZ):
    for speaker in speakerDict:
        for sample in speakerDict[speaker]:
            _,sig = wav.read(pathAudio+"/"+speaker+"/"+sample+".wav")
            videoRec = np.load(pathNPZ+"/"+speaker+"/"+sample+".npz")["images"]
            maxAudio = len(videoRec)*4 #Video a 25hz, audio a 100hz
            audio = python_speech_features.mfcc(sig, 16000, numcep = 13, winlen = 0.025, winstep = 0.010)
            if audio.shape[0] < maxAudio: #Si es un poco más corto hacemos padding
                shortage = maxAudio - audio.shape[0]
                audio = np.pad(audio, ((0, shortage), (0,0)), 'wrap')
            audio = audio[:maxAudio,:] #Se recorta
            np.savez_compressed(pathMFCC+"/"+speaker+"/"+sample,mfcc=audio)

In [8]:
speakerDict = createFolders(pathVideo)
#for speaker in sorted(list(speakerDict.keys())):
#    for video in speakerDict[speaker]:
#        saveFaceCrops(pathVideo+"/"+speaker+"/"+video)
#convertToNPZ(speakerDict);
#extractMFCC(speakerDict,pathMFCC,pathAudio,pathFaces)

Directorios creados


In [25]:
fotos = np.load(r"C:\Users\jmmol\Desktop\COSAS V7\TFM\npz\speaker334\speaker334_0000.npz")["images"]
audio = np.load(r"C:\Users\jmmol\Desktop\COSAS V7\TFM\mfccs\speaker334\speaker334_0000.npz")["mfcc"]
print(len(fotos),len(audio))
#cv2.imshow("test",fotos[-1])
#cv2.waitKey(0)

29 116


In [11]:
def createSamples(splitSpeakers,nSamples,pathFaces,pathMFCC):
    n = 0
    df = pd.DataFrame(columns=['video', 'audio', 'type', 'center', 'audiocenter'])
    sampleTypes = ["positive","partialMismatch","completeMismatch"]
    speakers = list(speakerDict.keys())
    while n < nSamples: #TIENE QUE SER DISTINTOS VIDEOS
        sType = random.choice(sampleTypes)
        speakerVideo = random.choice(speakers)
        videoSample = random.choice(speakerDict[speakerVideo])
        videoLength = len(np.load(pathFaces+"/"+speakerVideo+"/"+videoSample+".npz")["images"])
        print(videoSample,videoLength)
        center = random.randint(0,videoLength)
        if sType == "positive":
            speakerAudio = speakerVideo
            audioCenter = random.randint(0,videoLength*4)
            pass
        if sType == "partialMismatch":
            speakerAudio = speakerVideo
            pass
        if sType == "completeMismatch":
            speakerAudio = random.choice([s for s in speakers if s != speakerVideo])
            pass

def splitToDict(speakerList):
    speakerDict = defaultdict(list)
    for speaker in speakerList:
        speakerDict[speaker[:-5]].append(speaker)
    return speakerDict

In [15]:
trainDF = pd.read_csv(r"C:\Users\jmmol\Desktop\LIP-RTVE\SPLITS\speaker-independent\train.csv")
trainSpeakers = splitToDict(trainDF["sampleID"].values.tolist())
createSamples(trainSpeakers,10,pathFaces,pathMFCC)

speaker159_0000 30
speaker323_0012 262
speaker015_0040 57
speaker255_0000 157
speaker126_0000 99
speaker230_0005 149
speaker182_0000 31
speaker097_0002 98
speaker146_0002 114
speaker317_0002 63
speaker215_0001 159
speaker226_0008 166
speaker046_0001 68
speaker327_0007 276
speaker094_0010 32
speaker006_0011 75
speaker142_0006 150
speaker070_0008 43
speaker176_0000 83
speaker140_0001 111
speaker034_0015 44
speaker049_0035 98
speaker019_0013 326
speaker051_0008 124
speaker233_0020 167
speaker075_0010 72
speaker274_0012 37
speaker033_0264 35
speaker204_0001 48
speaker160_0000 58
speaker084_0000 60
speaker216_0004 42
speaker167_0000 33
speaker203_0009 73
speaker286_0001 52
speaker127_0005 26
speaker334_0003 77
speaker269_0000 75
speaker076_0003 115
speaker213_0002 95
speaker168_0000 33
speaker264_0004 157
speaker123_0002 47
speaker137_0002 65
speaker178_0002 55
speaker267_0000 67
speaker108_0000 27
speaker058_0024 43
speaker175_0000 155
speaker226_0002 227
speaker236_0000 127
speaker189_000

KeyboardInterrupt: 