In [1]:
import cv2
import numpy as np
import face_detection
from collections import defaultdict
import os, python_speech_features
import scipy.io.wavfile as wav
import random
import pandas as pd
import pickle

detector = face_detection.build_detector(
  "DSFDDetector", confidence_threshold=.2, nms_iou_threshold=.2)
pathVideo = "C:/Users/jmmol/Desktop/LIP-RTVE/MP4s"
pathMFCC = "C:/Users/jmmol/Desktop/COSAS V7/TFM/mfccs"
pathAudio = "C:/Users/jmmol/Desktop/LIP-RTVE/WAVs"
pathFaces = "C:/Users/jmmol/Desktop/COSAS V7/TFM/npz"



In [2]:
def createFolders(path):
    """
    Crea una carpeta por cada hablante y dentro una por cada vídeo de ese hablante
    También crea un diccionario que incluye los hablantes y cada uno de los nombres de sus vídeos y lo devuelve
    Ej: speaker000/001
    """
    speakerDict = defaultdict(list)
    for (root,dirs,files) in os.walk(path, topdown=True):
        #print (dirs)
        for speaker in dirs:
            os.makedirs("imgs/"+speaker, exist_ok=True)
            os.makedirs("mfccs/"+speaker, exist_ok=True)
            os.makedirs("npz/"+speaker, exist_ok=True)
        #print (files)
        for f in files:
            speaker, nmuestra = f.split("_")
            os.makedirs("imgs/"+speaker+"/"+nmuestra.split(".")[0], exist_ok=True)
            speakerDict[speaker].append(f[:-4]) #Quitamos .mp4
    print ('Directorios creados')
    return speakerDict

def extractBiggestFace(img):
    """
    Detecta todas las caras de una imagen y devuelve la más grande recortada y reescalada a 112x112
    """
    detections = detector.detect(img)
    idx_max = -1
    area_max = -1
    for i,cntr in enumerate(detections):
        xmin,ymin,xmax,ymax = int(cntr[0]),int(cntr[1]),int(cntr[2]),int(cntr[3]) #Guardamos bounding box
        area = (xmax-xmin)*(ymax-ymin)
        if area > area_max: #Comprobamos si la cara es la más grande
            idx_max = i
            area_max = area
            #print(area,idx_max)
        #cv2.rectangle(img, (xmin, ymin), (xmax, ymax), (0, 0, 255), 2)

    cntr = detections[idx_max]
    try:
        xmin,ymin,xmax,ymax = int(cntr[0]),int(cntr[1]),int(cntr[2]),int(cntr[3])
        return cv2.resize(img[max(ymin,0):ymax, xmin:xmax], (112, 112)) #Cara detectada, reescalamos
    except:
        print(cntr)
        cv2.imshow('image',img)
        cv2.waitKey(0)

def saveFaceCrops(videoPath):
    speaker, videoID = videoPath.split("/")[-2:]
    videoID = videoID.split("_")[1]
    print(speaker, videoID)
    vidcap = cv2.VideoCapture(videoPath)
    success,image = vidcap.read()
    count = 0
    facesCount = 0
    while success:
        cv2.imwrite("imgs/"+speaker+"/"+videoID+"/"+str(count)+'.jpg', extractBiggestFace(image)) #Detectamos y guardamos cara
        success,image = vidcap.read()
        count += 1
    return count+1 #Devuelve número de frames

def convertToNPZ(speakerDict, lengthsFilename):
    """
    Convierte caras recortadas en npz en su directorio y devuelve longitudes de video
    """
    videoLength = defaultdict(list)
    for speaker in speakerDict:
        for sample in speakerDict[speaker]:
            folder = "C:/Users/jmmol/Desktop/COSAS V7/TFM/imgs/"+speaker+"/"+sample.split("_")[1]
            images = []
            #numFrames = len(os.listdir(folder))
            for filename in sorted(os.listdir(folder),key=lambda x: int(x[:-4])):
                img = cv2.imread(os.path.join(folder,filename))
                img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
                if img is not None:
                    images.append(img)
            numFrames = len(images)
            videoLength["videos"].append(sample)
            videoLength["lengths"].append(numFrames)
            np.savez_compressed(r"C:/Users/jmmol/Desktop/COSAS V7/TFM/npz/"+speaker+"/"+sample,images=images)
    with open(lengthsFilename, 'wb') as f:
        pickle.dump(videoLength, f)

def extractMFCC(speakerDict,pathMFCC,pathAudio,pathNPZ):
    for speaker in speakerDict:
        for sample in speakerDict[speaker]:
            _,sig = wav.read(pathAudio+"/"+speaker+"/"+sample+".wav")
            videoRec = np.load(pathNPZ+"/"+speaker+"/"+sample+".npz")["images"]
            maxAudio = len(videoRec)*4 #Video a 25hz, audio a 100hz
            print(sample)
            audio = python_speech_features.mfcc(sig, 16000, numcep = 13, winlen = 0.025, winstep = 0.010)
            print(audio)
            if audio.shape[0] < maxAudio: #Si es un poco más corto hacemos padding
                shortage = maxAudio - audio.shape[0]
                audio = np.pad(audio, ((0, shortage), (0,0)), 'wrap')
            audio = audio[:maxAudio,:] #Se recorta
            np.savez_compressed(pathMFCC+"/"+speaker+"/"+sample,mfcc=audio)

In [3]:
speakerDict = createFolders(pathVideo)
#for speaker in sorted(list(speakerDict.keys())):
#    for video in speakerDict[speaker]:
#        saveFaceCrops(pathVideo+"/"+speaker+"/"+video)
#convertToNPZ(speakerDict,"lengths.pickle")
extractMFCC(speakerDict,pathMFCC,pathAudio,pathFaces)

Directorios creados
speaker000_0000
[[ 18.28831223  -0.46966226 -11.374787   ...  18.73777228  -0.37357145
  -18.28428027]
 [ 18.09618294  20.15733158 -23.38681779 ...  29.09880509   8.18839975
  -16.4539791 ]
 [ 17.9964577   22.76016903 -23.52185904 ...  25.42500929   9.06059391
  -10.37243175]
 ...
 [ 14.96840176   2.27996848 -25.07028349 ... -11.52329923 -11.51906584
   -2.23701716]
 [ 14.14441835  -0.46388185 -28.99491367 ...  -8.34985038 -15.21493955
  -11.86932838]
 [ 12.43330159  -2.66013141 -26.58304282 ...  -6.57545189  -2.90625739
   -5.43487585]]
speaker000_0001
[[ 16.70183842 -11.02167938   2.54872229 ...  -4.10236841   1.69644177
   -3.07234674]
 [ 16.51232    -11.99244612   0.35042674 ...  -2.92634596   4.4058777
   -7.10213693]
 [ 16.44934153 -12.7036443    1.33841502 ...  -4.42484138   3.3967609
    0.66481896]
 ...
 [ 14.88942648  -4.10127435  -8.4681879  ...   3.60807836  11.29681514
   -3.19683591]
 [ 14.31759419  -4.48591252 -10.797165   ...   9.91456489   9.4055099

KeyboardInterrupt: 

In [13]:
fotos = np.load(r"C:\Users\jmmol\Desktop\COSAS V7\TFM\npz\speaker334\speaker334_0000.npz")["images"]
audio = np.load(r"C:\Users\jmmol\Desktop\COSAS V7\TFM\mfccs\speaker334\speaker334_0000.npz")["mfcc"]
print(len(fotos),len(audio))
cv2.imshow("test",fotos[-1])
cv2.waitKey(0)

29 116


-1

In [10]:
def createSamples(splitSpeakers,nSamples,videoLengths,filename):
    n = 0
    sampleLabels = ["positive","partialMismatchSameSample","partialMismatchDiffSample","completeMismatch"]
    speakers = list(splitSpeakers.keys())
    sampleList = []
    repeatType = False
    while n < nSamples: #TIENE QUE SER DISTINTOS VIDEOS
        if not repeatType:
            sType = np.random.choice(sampleLabels,p=[0.5,0.166666666,0.166666666,0.166666666])
        speakerVideo = random.choice(speakers)
        videoSample = random.choice(splitSpeakers[speakerVideo])
        videoLength = videoLengths[videoSample]
        center = random.randint(0,videoLength)
        if sType == "positive":
            audioSample = videoSample #AUDIO SAMPLE
            #audioCenter = random.randint(0,videoLength*4)
            sampleLabel = 1
        if sType == "partialMismatchSameSample":
            audioSample = videoSample
            #audioCenter = random.randint(0,videoLengths[audioSample]*4)
            sampleLabel = 0
        if sType == "partialMismatchDiffSample":
            speakerAudio = speakerVideo
            if len(splitSpeakers[speakerVideo]) > 1:
                audioSample = random.choice([s for s in splitSpeakers[speakerVideo] if s != videoSample])
            #audioCenter = random.randint(0,videoLengths[audioSample]*4)
            else: #Solo hay una muestra de este hablante
                audioSample = -1
            sampleLabel = 0   
        if sType == "completeMismatch":
            speakerAudio = random.choice([s for s in speakers if s != speakerVideo])
            audioSample = random.choice(splitSpeakers[speakerAudio])
            #audioCenter = random.randint(0,videoLengths[audioSample]*4)
            sampleLabel = 0

        newRow = {'video':videoSample, 'audio':audioSample, 'label': sampleLabel, 'center': center}
        if newRow not in sampleList and newRow["audio"] != -1:
            sampleList.append(newRow)
            repeatType = False 
            n+=1
        else:
            repeatType = True #Para poder balancear correctamente
    df = pd.DataFrame(sampleList,columns=['video', 'audio', 'label', 'center'])
    df.to_csv(filename)
    return df


def splitToDict(speakerList,lengths):
    speakerDict = defaultdict(list)
    for speaker in speakerList:
        speakerDict[speaker[:-5]].append(speaker)
    return speakerDict

In [4]:
with open('lengths.pickle', 'rb') as f: #Diccionario con clave de videos y clave de longitudes, ambas listas se corresponden
    lengths = pickle.load(f)
lengths = dict(zip(lengths["videos"],lengths["lengths"]))
trainDF = pd.read_csv("C:/Users/jmmol/Desktop/LIP-RTVE/SPLITS/speaker-independent/train.csv")
trainSpeakers = splitToDict(trainDF["sampleID"].values.tolist(),lengths)
df = createSamples(trainSpeakers,100000,lengths,'trainSamples.csv')

NameError: name 'splitToDict' is not defined

In [13]:
df = pd.read_csv("trainSamples.csv")
df.loc[(df.label == 0) & (df.video == df.audio)]

Unnamed: 0.1,Unnamed: 0,video,audio,label,center
8647,8647,speaker045_0074,speaker045_0074,0,35
25014,25014,speaker009_0201,speaker009_0201,0,16
30185,30185,speaker032_0004,speaker032_0004,0,90


In [22]:
print(df.where((df.label == 0) & (df.video == df.audio)).count())
print(df.where((df.label == 1)).count())

video     16704
audio     16704
label     16704
center    16704
dtype: int64
video     50035
audio     50035
label     50035
center    50035
dtype: int64


In [12]:
with open('lengths.pickle', 'rb') as f: #Diccionario con clave de videos y clave de longitudes, ambas listas se corresponden
    lengths = pickle.load(f)
lengths = dict(zip(lengths["videos"],lengths["lengths"]))
testDF = pd.read_csv("C:/Users/jmmol/Desktop/LIP-RTVE/SPLITS/speaker-independent/test.csv")
testSpeakers = splitToDict(testDF["sampleID"].values.tolist(),lengths)
df = createSamples(testSpeakers,30000,lengths,'testSamples.csv')

devDF = pd.read_csv("C:/Users/jmmol/Desktop/LIP-RTVE/SPLITS/speaker-independent/dev.csv")
devSpeakers = splitToDict(devDF["sampleID"].values.tolist(),lengths)
df = createSamples(devSpeakers,30000,lengths,'devSamples.csv')