<h1> Environment Setup </h1>

In [None]:
#package installs
!E:\Python310\python.exe -m pip install --upgrade pip
!E:\Python310\Scripts\pip3.exe install moviepy pydub SpeechRecognition pyAudioAnalysis speechbrain pyannote.audio praat-parselmouth
!E:\Python310\python.exe -m pip install git+https://github.com/openai/whisper.git soundfile

In [None]:
#norms for data storage and manipulation

#ENV_FOLDER_DATA = source mp4 folder -> this is where the mp4s might be downloaded
## This should be separate so we can iterate through this easily
#ENV_FOLDER_DATA_PROC = where folders w/ processed + temp data will live

## raw per-episode data storage
#ENV_FOLDER_DATA_PROC\\eps\\#### = 3-4 digit numbered folder which represents the episode number
#ENV_FOLDER_DATA_PROC\\eps\\####\\####.mp3 = saved mp3
#ENV_FOLDER_DATA_PROC\\eps\\####\\####.wav = saved wav file
#ENV_FOLDER_DATA_PROC\\eps\\####\\wavsplit\\ = folders w/ split wav files
#ENV_FOLDER_DATA_PROC\\eps\\####\\wavsplit\\segment#-speaker#.wav = split wav file

## shared outputs from process
#ENV_FOLDER_DATA_PROC\\pickle\\####.pickle = saved metadata about the files, processing, locations, etc. all stored as a dictionary in a pickle
#ENV_FOLDER_DATA_PROC\\segmentation\\####.txt = saved speaker segmentation from diarization
#ENV_FOLDER_DATA_PROC\\speakers.json = json which identifies Joe Rogan vs Other Speaker
## 568|SPEAKER 0|Joe Rogan
## 568|SPEAKER 1|Rhonda Patrick
#ENV_FOLDER_DATA_PROC\\targetclips\\####.txt = chosen clips for use in analysis
## 10
## 11
## 12
#ENV_FOLDER_DATA_PROC\\pratt\\####-segment#-speaker#.pickle = saved pratt data in dictionaries with the 4 time series outputs

In [60]:
#Global variables
import socket
ENV_HOSTNAME = socket.gethostname()
print('ENV_HOSTNAME:' + ENV_HOSTNAME)

#store defaults for Jacob here:
ENV_FOLDER_DATA = ''
ENV_FOLDER_DATA_PROC = ''
ENV_PRATT = ''


if ENV_HOSTNAME == 'JWGamingPC':
    ENV_FOLDER_DATA = 'E:\\W4732 Computer Vision\\Final Paper Data\\'
    ENV_FOLDER_DATA_PROC = 'E:\\W4732 Computer Vision\\Final Paper Data Proc\\'
    ENV_PRATT = ''

print('ENV_FOLDER_DATA:' + ENV_FOLDER_DATA)
print('ENV_PRATT:' + ENV_PRATT)

# Create folder structure
import os
os.makedirs(ENV_FOLDER_DATA_PROC + 'segmentation', exist_ok=True)
os.makedirs(ENV_FOLDER_DATA_PROC + 'targetdf', exist_ok=True)
os.makedirs(ENV_FOLDER_DATA_PROC + 'pratt', exist_ok=True)
os.makedirs(ENV_FOLDER_DATA_PROC + 'eps', exist_ok=True)


ENV_HOSTNAME:JWGamingPC
ENV_FOLDER_DATA:E:\W4732 Computer Vision\Final Paper Data\
ENV_PRATT:


<h1> Functions </h1>

In [100]:
## Audio Processing 1 ##
# Code to generate mp3s, wavs, and segmentation
# Norm - save all filepaths as "suffixes" aka filesuf and always concatenate the ENV_FOLDER_DATA or the ENV_FOLDER_DATA_PROC

#Utility Function
## 1) Store all the metadata et al from the functions into a dictionary, which then gets saved to a blob
## 2) Save and load blob
## 3) Figure out names of relevant files and relevant folder structure
## 4) Download episodes from archive.org

#Audio Functions
## 1) Split MP4 to MP3
## 2) MP3 to WAV
## 3) WAV to speaker identification and time splits + record file
## 4) WAV splits into individual files
## 5) Figure out which segments to analyze w/ video (skip first and last segment from the speaker)
## 6) Take first second of the segment and produce Pratt time series (0.1 second intervals)
## 6a+b+c+d) Pitch + Intensity + Harmonics + Jitter 

#Library imports
import moviepy
import moviepy.editor
from pydub import AudioSegment
from pyannote.audio import Pipeline
import csv
import glob

#extract audio into mp3
#https://medium.com/featurepreneur/extracting-audio-from-video-using-pythons-moviepy-library-e351cd652ab8

def split_mp4_to_mp3(filepath_mp4 , filepath_mp3):
    # Load the video clip
    video_clip = moviepy.editor.VideoFileClip(filepath_mp4)

    # Extract the audio from the video clip
    audio_clip = video_clip.audio

    # Write the audio to a separate file
    audio_clip.write_audiofile(filepath_mp3)

    # Close the video and audio clips
    audio_clip.close()
    video_clip.close()

def split_mp3_to_wav(filepath_mp3, filepath_wav):
    #read mp3
    mp3_clip = AudioSegment.from_mp3(filepath_mp3)
    mp3_clip.export(filepath_wav, format="wav")
    del mp3_clip

def speaker_diarization(filepath_wav,filepath_segmentation):
    #perform speaker diarization (lingo for "speaker recognition")
    #https://medium.com/@gil.shomron/whos-talking-speaker-diarization-and-emotion-recognition-in-radio-3e9623baeb2c

    pipeline = Pipeline.from_pretrained('pyannote/speaker-diarization-3.1',
                                        use_auth_token='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')

    diarization = pipeline(filepath_wav)
    #for turn, _, speaker in diarization.itertracks(yield_label=True):
    #  print('start={.1f} stop={.1f} speaker_{}'.format(turn.start,
    #                                                   turn.end,
    #                                                   speaker))    

    # Dump to file in an RTTM format
    #with open(ENV_FOLDER_DATA_PROC + '568\\segment.txt', 'w') as rttm:
    #    diarization.write_rttm(rttm)
    list_diarization_data = []
    i = 0
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        list_temp = []
        list_temp.append(str(i))
        list_temp.append(str(speaker))
        list_temp.append(str(turn.start))
        list_temp.append(str(turn.end))
        list_diarization_data.append(list_temp)
        i += 1
    with open(filepath_segmentation, "w", newline='\n') as f:
        writer = csv.writer(f, delimiter='|',  quoting=csv.QUOTE_MINIMAL)
        writer.writerows(list_diarization_data)
    #took 81 minutes

def wav_file_splitting(filepath_wav, filepath_segmentation,folderpath_wavsplit):
    #use cutoffs to split wav file into sections
    #https://stackoverflow.com/questions/51622865/break-up-a-wav-file-by-timestamp

    listdict_data = []
    #read csv file
    with open(filepath_segmentation, newline='\n') as csv_file:
        reader = csv.reader(csv_file, delimiter='|')
        for row in reader:
            dict_temp = {}
            dict_temp['str_segment'] = row[0]
            dict_temp['speaker'] = row[1]
            dict_temp['sec_start'] = row[2]
            dict_temp['sec_end'] = row[3]
            listdict_data.append(dict_temp)
    
    #split 
    audio = AudioSegment.from_wav(filepath_wav)
    for  idx,dict_data in enumerate(listdict_data):
        start = int(float(dict_data['sec_start'])*1000)  #pydub works in millisec
        end = int(float(dict_data['sec_end']) * 1000) #pydub works in millisec
        audio_chunk=audio[start:end]
        audio_chunk.export( folderpath_wavsplit + dict_data['str_segment'] + '-' + dict_data['speaker'] + ".wav", format="wav")


def process_mp4s_for_processing(filesuf_mp4, recalc = False):
    #get number after # but before space afterwards
    str_epnum_temp = filesuf_mp4.split('#')[1]
    str_epnum = str_epnum_temp.split(' ')[0]
    
    #generate filepaths
    filepath_mp4 = ENV_FOLDER_DATA + filesuf_mp4
    print('Episode='+ str_epnum + ' at ' + filepath_mp4)
    folderpath_eps = ENV_FOLDER_DATA_PROC + 'eps\\' + str_epnum + '\\'
    os.makedirs(folderpath_eps, exist_ok=True)
    filepath_mp3 = folderpath_eps + str_epnum + '.mp3'
    filepath_wav = folderpath_eps + str_epnum + '.wav'
    folderpath_wavsplit = folderpath_eps + 'wavsplit\\'
    os.makedirs(folderpath_wavsplit, exist_ok=True)
    #filepath_pickle = ENV_FOLDER_DATA_PROC + 'pickle\\' + str_epnum + '.pickle'
    filepath_segmentation = ENV_FOLDER_DATA_PROC + 'segmentation\\' + str_epnum + '.psv'
    #filepath_targetdf = ENV_FOLDER_DATA_PROC + 'targetdf\\' + str_epnum + '.pickle'
    #filepath_pratt = ENV_FOLDER_DATA_PROC + 'pratt\\' + str_epnum + '.pickle'

    #check if mp3 exists - if it doesn't, create it
    if not os.path.exists(filepath_mp3):
        split_mp4_to_mp3(filepath_mp4,filepath_mp3)
    if not os.path.exists(filepath_mp3):
        print('Failed to create MP3:'+ filepath_mp3)
        return -1
    #check if wav exists - if it doesn't, create it
    if not os.path.exists(filepath_wav):
        split_mp3_to_wav(filepath_mp3,filepath_wav)
    #fail if process fails to produce the expected output
    if not os.path.exists(filepath_wav):
        print('Failed to create WAV:'+ filepath_wav)
        return -1
    
    #check if segmentation exists - if it doesn't, create it
    if not os.path.exists(filepath_segmentation):
        speaker_diarization(filepath_wav,filepath_segmentation)
    else:
        print("Speaker diarization exists:" + filepath_segmentation)
    #fail if process fails to produce the expected output
    if not os.path.exists(filepath_segmentation):
        print('Failed to create segmentation:'+ filepath_segmentation)
        return -1
    
    #check if split wav files exist - if it doesn't, create it
    if len(glob.glob(folderpath_wavsplit + '*')) < 10:
        wav_file_splitting(filepath_wav, filepath_segmentation,folderpath_wavsplit)

    
    

In [124]:
## Audio Processing 2 ##
# Code to generate target clips, pratt pickles
import parselmouth
from parselmouth.praat import call
from pydub import AudioSegment
import glob
import numpy as np
import pickle
#AudioSegment.converter = "C:\\ProgramData\\chocolatey\\lib\\ffmpeg-full\\tools\\ffmpeg\\bin\\ffmpeg.exe"
#AudioSegment.ffmpeg = "C:\\ProgramData\\chocolatey\\lib\\ffmpeg-full\\tools\\ffmpeg\\bin\\ffmpeg.exe"
#AudioSegment.ffprobe ="C:\\ProgramData\\chocolatey\\lib\\ffmpeg-full\\tools\\ffmpeg\\bin\\ffprobe.exe"

def process_mp4s_for_analysis(filesuf_mp4, recalc = False):
    #get number after # but before space afterwards
    str_epnum_temp = filesuf_mp4.split('#')[1]
    str_epnum = str_epnum_temp.split(' ')[0]
    
    #generate filepaths
    filepath_mp4 = ENV_FOLDER_DATA + filesuf_mp4
    print('Episode='+ str_epnum + ' at ' + filepath_mp4)
    folderpath_eps = ENV_FOLDER_DATA_PROC + 'eps\\' + str_epnum + '\\'
    os.makedirs(folderpath_eps, exist_ok=True)
    filepath_mp3 = folderpath_eps + str_epnum + '.mp3'
    filepath_wav = folderpath_eps + str_epnum + '.wav'
    folderpath_wavsplit = folderpath_eps + 'wavsplit\\'
    os.makedirs(folderpath_wavsplit, exist_ok=True)
    filepath_segmentation = ENV_FOLDER_DATA_PROC + 'segmentation\\' + str_epnum + '.psv'
    #filepath_targetdf = ENV_FOLDER_DATA_PROC + 'targetdf\\' + str_epnum + '.pickle'
    filepath_pratt = ENV_FOLDER_DATA_PROC + 'pratt\\' + str_epnum + '.pickle'

    dict_speakerdata = {}
    for i in range(5):
        speaker = 'SPEAKER_0' + str(i)
        dict_speakerdata[speaker] = {}
        dict_speakerdata[speaker]['list_pitch'] = []
        dict_speakerdata[speaker]['list_intensity'] = []
        dict_speakerdata[speaker]['list_shimmer'] = []
        dict_speakerdata[speaker]['list_jitter'] = []
        dict_speakerdata[speaker]['list_harmonics'] = []


    #get list of wav files to iterate through
    dictdict_output = {} #key = filesuf / value = dictionary
    for path_wav in glob.glob(folderpath_wavsplit + '*.wav'):
        #create data points and save into a dictionary
        dict_temp = {}
        dict_temp['path'] = path_wav
        filesuf = os.path.basename(path_wav)
        dict_temp['filesuf'] = filesuf
        dict_temp['str_segment'] = filesuf.split('-')[0]
        speaker = (filesuf.split('-')[1]).split('.')[0]
        dict_temp['speaker'] = speaker
        

        sound_total = parselmouth.Sound(path_wav)
        second_duration = call(sound_total, "Get total duration") 
        tenth_seconds = int(second_duration * 10)
        dict_temp['duration'] = second_duration
        #create dictionaries for all the values to be stored
        list_pitch = []
        list_intensity = []
        list_shimmer = []
        list_jitter = []
        list_harmonics = []

        for t in range(tenth_seconds):

            start_time = t * 0.1
            end_time = (t + 1) * 0.1
            sound = sound_total.extract_part(from_time=start_time, to_time=end_time)

            #iterate through the parts of the sound

            pointprocess = call(sound, "To PointProcess (periodic, cc)",75, 600)
            #dict_temp['pointprocess'] = pointprocess
            
            #https://parselmouth.readthedocs.io/_/downloads/en/stable/pdf/
            #gets the pitch , and sets the pitch floor to 75 and tge outcg max to 600
            try:
                pitch = call(sound, "To Pitch", 0.0, 75, 600)
                val_pitch = call(pitch, "Get mean", 0, 0,"Hertz")
            except:
                val_pitch = None
            
            ##-	For intensity extraction, set the pitch floor to 100Hz. Use ‘energy’ averaging method to get mean intensity.
            try:
                intensity = call(sound, "To Intensity", 100,0.01)
                val_intensity = call(intensity, "Get mean", 0, 0,"energy")
            except:
                val_intensity = None

            ##Shimmer
            # For shimmer, extract local shimmer only, and set period floor to 0.0001s, period ceiling to 0.02s, maximum period factor to 1.3, and maximum amplitude factor to 1.6.
            try:
                val_shimmer = call([sound, pointprocess], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
            except:
                val_shimmer = None
            
            # For jitter, extract local jitter only, and set period floor to 0.0001s, period ceiling to 0.02s, and maximum period factor to 1.3
            #Please convert from a Sound object to a PointProcess (periodic, cc) object. (#74)
            #https://github.com/drfeinberg/PraatScripts/blob/master/Measure%20Pitch%2C%20HNR%2C%20Jitter%2C%20Shimmer%2C%20and%20Formants.ipynb
            #f0min , f0max
            try:
                val_jitter = call(pointprocess, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
            except:
                val_jitter = None
            
            #-	To calculate HNR (harmonics-to-noise ratio), extract harmonicity (cc) first. Set time step to 0.01, minimum pitch to 75Hz, silence threshold to 0.1, and number of periods per window to 1.0.
            try:
                harmonics = call(sound, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)
                val_harmonics = call(harmonics, "Get mean", 0, 0)
            except:
                val_harmonics = None

            #add to relevant lists
            list_pitch.append(val_pitch)
            list_intensity.append(val_intensity)
            list_shimmer.append(val_shimmer)
            list_jitter.append(val_jitter)
            list_harmonics.append(val_harmonics)

            dict_speakerdata[speaker]['list_pitch'].append(val_pitch)
            dict_speakerdata[speaker]['list_intensity'].append(val_intensity)
            dict_speakerdata[speaker]['list_shimmer'].append(val_shimmer)
            dict_speakerdata[speaker]['list_jitter'].append(val_jitter)
            dict_speakerdata[speaker]['list_harmonics'].append(val_harmonics)
        #end for loop that goes per-0.1 second
        dict_temp['list_pitch'] = list_pitch
        dict_temp['list_intensity'] = list_intensity
        dict_temp['list_shimmer'] = list_shimmer
        dict_temp['list_jitter'] = list_jitter
        dict_temp['list_harmonics'] = list_harmonics
        
        dictdict_output[filesuf] = dict_temp
    #end for loop that goes through each file in the folder
    
    #calculate statistics for the speakers
    for i in range(5):
        speaker = 'SPEAKER_0' + str(i)
        dictdict_output[speaker] = {}

        dictdict_output[speaker]['mean_pitch'] = np.nanmean(list(filter(None, dict_speakerdata[speaker]['list_pitch']) ))
        dictdict_output[speaker]['median_pitch'] = np.nanmedian(list(filter(None,dict_speakerdata[speaker]['list_pitch'])))
        dictdict_output[speaker]['mean_intensity'] = np.nanmean(list(filter(None,dict_speakerdata[speaker]['list_intensity'])))
        dictdict_output[speaker]['median_intensity'] = np.nanmedian(list(filter(None,dict_speakerdata[speaker]['list_intensity'])))
        dictdict_output[speaker]['mean_shimmer'] = np.nanmean(list(filter(None,dict_speakerdata[speaker]['list_shimmer'])))
        dictdict_output[speaker]['median_shimmer'] = np.nanmedian(list(filter(None,dict_speakerdata[speaker]['list_shimmer'])))
        dictdict_output[speaker]['mean_jitter'] = np.nanmean(list(filter(None,dict_speakerdata[speaker]['list_jitter'])))
        dictdict_output[speaker]['median_jitter'] = np.nanmedian(list(filter(None,dict_speakerdata[speaker]['list_jitter'])))
        dictdict_output[speaker]['mean_harmonics'] = np.nanmean(list(filter(None,dict_speakerdata[speaker]['list_harmonics'])))
        dictdict_output[speaker]['median_harmonics'] = np.nanmedian(list(filter(None,dict_speakerdata[speaker]['list_harmonics'])))



    #save data
    with open(filepath_pratt, 'wb') as file:
        print(filepath_pratt)
        pickle.dump(dictdict_output, file, protocol=pickle.HIGHEST_PROTOCOL)



In [117]:
## Audio Processing 3 ##
# Producing a filtered object to use for training the model
# Target output is a dataframe saved into the targetdf folder
# data will be filtering out both speaker = Joe Rogan and speaker = Misc so only the guest is included
# data will also be filtering out the first instance of talking by that speaker
# data will also be filtering out clips < 1 second
# Dataframe has the following columns
# epnum -> int (episode #)
# seg -> int (segment)
# type -> p for pitch, i for intensity,  j for fitter , h for harmonics, s for shimmer 
# m -> global median for that statistic
# d0,d1, .... d99 -> values for the first 10 seconds in 0.1 second increments 

import pandas as pd
import json
import copy

def process_analysis_for_model(filesuf_mp4, recalc = False):
    #get number after # but before space afterwards
    str_epnum_temp = filesuf_mp4.split('#')[1]
    str_epnum = str_epnum_temp.split(' ')[0]
    int_epnum = int(str_epnum)
    
    #generate filepaths
    filepath_mp4 = ENV_FOLDER_DATA + filesuf_mp4
    print('Episode='+ str_epnum + ' at ' + filepath_mp4)
    folderpath_eps = ENV_FOLDER_DATA_PROC + 'eps\\' + str_epnum + '\\'
    os.makedirs(folderpath_eps, exist_ok=True)
    filepath_mp3 = folderpath_eps + str_epnum + '.mp3'
    filepath_wav = folderpath_eps + str_epnum + '.wav'
    folderpath_wavsplit = folderpath_eps + 'wavsplit\\'
    os.makedirs(folderpath_wavsplit, exist_ok=True)
    filepath_segmentation = ENV_FOLDER_DATA_PROC + 'segmentation\\' + str_epnum + '.psv'
    filepath_targetdf = ENV_FOLDER_DATA_PROC + 'targetdf\\' + str_epnum + '.pickle'
    filepath_pratt = ENV_FOLDER_DATA_PROC + 'pratt\\' + str_epnum + '.pickle'
    filepath_speakers = ENV_FOLDER_DATA_PROC + 'speakers.json'

    #open dict of speakers
    
    json_speakers = {}
    with open(filepath_speakers) as f:
        json_speakers = json.load(f)
    
    #determine speakers to retain
    list_speakers_keep = []

    for key,val in json_speakers[str_epnum].items():
        if val == 'Joe Rogan':
            continue
        if val == 'Misc':
            continue
        list_speakers_keep.append(key)

    #declare default dictionary
    dict_default = {}
    dict_default['epnum'] = int_epnum
    dict_default['seg'] = 0
    dict_default['type'] = ''
    dict_default['m'] = 0
    for i in range(100):
        dict_default['d' + str(i)] = 0

    #declare empty dataframe with the 44 columns
    df = pd.DataFrame( columns= list(dict_default.keys()) )

    #open up pratt pickle
    dictdict_output = None
    with open(filepath_pratt, 'rb') as file:
        print(filepath_pratt)
        dictdict_output = pickle.load(file)
    
    #iterate through each segment
    for key,val in dictdict_output.items():
        #if there's no dash in the key, then the key is one of the aggregate values
        if '-' not in key:
            continue
        filesuf = val['filesuf']
        str_segment = val['str_segment']
        int_segment = int(str_segment)
        speaker = val['speaker']

        #skip if it's less than 1 second
        if val['duration'] < 1.0:
            continue

        #skip data if it's Joe Rogan or Misc
        if speaker not in list_speakers_keep:
            continue

        dict_temp = copy.deepcopy(dict_default)
        dict_temp['seg'] = int_segment
        list_dict = []

        #make a new copy for pitch, intensity, shimmer, jitter, harmonics
        #pitch
        dict_pitch = copy.deepcopy(dict_temp)
        dict_pitch['type'] = 'p'
        dict_pitch['m'] = dictdict_output[speaker]['median_pitch']
        for i, data in enumerate(val['list_pitch']):
            dict_pitch['d'+str(i)] = data
        list_dict.append(dict_pitch)


        #intensity
        dict_intensity = copy.deepcopy(dict_temp)
        dict_intensity['type'] = 'i'
        dict_intensity['m'] = dictdict_output[speaker]['median_intensity']
        for i, data in enumerate(val['list_intensity']):
            dict_intensity['d'+str(i)] = data
        list_dict.append(dict_intensity)


        #shimmer
        dict_shimmer = copy.deepcopy(dict_temp)
        dict_shimmer['type'] = 's'
        dict_shimmer['m'] = dictdict_output[speaker]['median_shimmer']
        for i, data in enumerate(val['list_shimmer']):
            dict_shimmer['d'+str(i)] = data
        list_dict.append(dict_shimmer)


        #jitter
        dict_jitter = copy.deepcopy(dict_temp)
        dict_jitter['type'] = 'j'
        dict_jitter['m'] = dictdict_output[speaker]['median_jitter']
        for i, data in enumerate(val['list_jitter']):
            dict_jitter['d'+str(i)] = data
        list_dict.append(dict_jitter)


        #harmonics
        dict_harmonics = copy.deepcopy(dict_temp)
        dict_harmonics['type'] = 'h'
        dict_harmonics['m'] = dictdict_output[speaker]['median_harmonics']
        for i, data in enumerate(val['list_harmonics']):
            dict_harmonics['d'+str(i)] = data
        list_dict.append(dict_harmonics)

        #turn list of dicts into dataframe
        df_temp = pd.DataFrame(list_dict, columns=list(dict_default.keys()) )
        df = pd.concat([df,df_temp], ignore_index=True)
    
    #end loop through data
    
    #save data
    with open(filepath_targetdf, 'wb') as file:
        pickle.dump(df, file, protocol=pickle.HIGHEST_PROTOCOL)





In [99]:
# Test audio processing
# using test video #568 Dr. Rhonda Patrick
# https://archive.org/download/jre-001-837/JRE_001-837/
# https://archive.org/download/jre-001-837/JRE_001-837/Joe%20Rogan%20Experience%20%23568%20-%20Dr.%20Rhonda%20Patrick.mp4

filesuf_mp4 = 'Joe Rogan Experience #568 - Dr. Rhonda Patrick.mp4'
process_mp4s_for_processing(filesuf_mp4)

  #speaker 2 is joe rogan
  #speaker 0 is the lady
  #speaker 1 is the sound effects 
  #time start and stop is in seconds


Episode=568 at E:\W4732 Computer Vision\Final Paper Data\Joe Rogan Experience #568 - Dr. Rhonda Patrick.mp4
Speaker diarization exists:E:\W4732 Computer Vision\Final Paper Data Proc\segmentation\568.psv


In [125]:
# Test pratt processing
filesuf_mp4 = 'Joe Rogan Experience #568 - Dr. Rhonda Patrick.mp4'
process_mp4s_for_analysis(filesuf_mp4, recalc = False)

Episode=568 at E:\W4732 Computer Vision\Final Paper Data\Joe Rogan Experience #568 - Dr. Rhonda Patrick.mp4
E:\W4732 Computer Vision\Final Paper Data Proc\pratt\568.pickle


In [126]:
# Test dataframe output
filesuf_mp4 = 'Joe Rogan Experience #568 - Dr. Rhonda Patrick.mp4'
process_analysis_for_model(filesuf_mp4, recalc = False)

Episode=568 at E:\W4732 Computer Vision\Final Paper Data\Joe Rogan Experience #568 - Dr. Rhonda Patrick.mp4
E:\W4732 Computer Vision\Final Paper Data Proc\pratt\568.pickle


In [128]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import unquote
import urllib.request
#download videos from archive.org

#get files in https://archive.org/download/jre-001-837/JRE_001-837/
#code from https://www.geeksforgeeks.org/extract-all-the-urls-from-the-webpage-using-python/
url = 'https://archive.org/download/jre-001-837/JRE_001-837/'
reqs = requests.get(url)
soup = BeautifulSoup(reqs.text, 'html.parser')
urls = []
for link in soup.find_all('a'):
    urls.append(link.get('href'))

#download files that only have a single person (no ampersand or comma) that isn't a fight companion and also between 200 and 700
#also, no partial episodes
list_dlurl = []
for url in urls:
    if url is None:
        continue
    str_temp = url.replace(', PhD','')
    if not '.mp4' in str_temp:
        continue
    if '.ia.mp4' in str_temp:
        continue
    if 'Part' in str_temp:
        continue
    if ',' in str_temp:
        continue
    str_epnum = str_temp.split('%23')[1]
    str_epnum = str_epnum.split('%20')[0]
    int_epnum = int(str_epnum)
    if int_epnum < 200 or int_epnum > 700:
        continue
    list_dlurl.append(url)

for dlurl in list_dlurl:
    # check if file already exists in downloads folder before downloading
    filename = unquote(dlurl)
    if os.path.exists(ENV_FOLDER_DATA + filename):
        print('File already exists:' + filename)
    url_final = 'https://archive.org/download/jre-001-837/JRE_001-837/' + dlurl
    print('Downloading:' + filename)
    urllib.request.urlretrieve(url_final, ENV_FOLDER_DATA + filename)


In [135]:
for dlurl in list_dlurl:
    # check if file already exists in downloads folder before downloading
    filename = unquote(dlurl)
    if os.path.exists(ENV_FOLDER_DATA + filename):
        print('File already exists:' + filename)
    url_final = 'https://archive.org/download/jre-001-837/JRE_001-837/' + dlurl
    print('Downloading:' + filename)
    urllib.request.urlretrieve(url_final, ENV_FOLDER_DATA + filename)
    
    

Downloading:Joe Rogan Experience #200 - Duncan Trussell.mp4
Downloading:Joe Rogan Experience #201 - EverLast.mp4
Downloading:Joe Rogan Experience #202 - Dom Irrera.mp4
Downloading:Joe Rogan Experience #203 - Jim Jefferies.mp4
Downloading:Joe Rogan Experience #204 - Amy Schumer.mp4
Downloading:Joe Rogan Experience #205  Neal Brennan.mp4


In [134]:
print(list_dlurl)

['Joe%20Rogan%20Experience%20%23200%20-%20Duncan%20Trussell.mp4', 'Joe%20Rogan%20Experience%20%23201%20-%20EverLast.mp4', 'Joe%20Rogan%20Experience%20%23202%20-%20Dom%20Irrera.mp4', 'Joe%20Rogan%20Experience%20%23203%20-%20Jim%20Jefferies.mp4', 'Joe%20Rogan%20Experience%20%23204%20-%20Amy%20Schumer.mp4', 'Joe%20Rogan%20Experience%20%23205%20%20Neal%20Brennan.mp4', 'Joe%20Rogan%20Experience%20%23206%20%20Eddie%20Bravo.mp4', 'Joe%20Rogan%20Experience%20%23207%20-%20Tom%20Segura%2C%20Christina%20Pazsitzky.mp4', 'Joe%20Rogan%20Experience%20%23208%20-%20Freeway%20Rick%20Ross.mp4', 'Joe%20Rogan%20Experience%20%23209%20-%20Eddie%20Ifft.mp4', 'Joe%20Rogan%20Experience%20%23210%20-%20Joey%20Diaz.mp4', 'Joe%20Rogan%20Experience%20%23212%20-%20Bryan%20Callen.mp4', 'Joe%20Rogan%20Experience%20%23213%20-%20Eddie%20Bravo.mp4', 'Joe%20Rogan%20Experience%20%23214%20-%20Duncan%20Trussell.mp4', 'Joe%20Rogan%20Experience%20%23215%20-%20Andrew%20Dice%20Clay.mp4', 'Joe%20Rogan%20Experience%20%23216%20-%20C

<h1> Appendix </h1>

In [58]:
##Appendix 1##
# Failed Speaker Diarization 1
%%script echo skipping appendix

#https://medium.com/@apparaomulpuri/speaker-diarization-in-python-a-step-by-step-guide-351a094237f2
#perform speaker diarization (lingo for "speaker recognition")
#this is a poorly performing solution

import librosa #after further analysis librosa is actually a music library - seems cool
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

def generate_speaker_labels(filepath_wav):
    audio, sr = librosa.load(filepath_wav, sr=None)
    duration = librosa.get_duration(y=audio, sr=sr)
    #print('Duration:' + str(duration))
    mfccs = librosa.feature.mfcc(y=audio, sr=sr)

    scaler = StandardScaler()
    mfccs_scaled = scaler.fit_transform(mfccs.T)
    kmeans = KMeans(n_clusters=3)  # Adjust based on the expected number of speakers
    speaker_labels = kmeans.fit_predict(mfccs_scaled)

    del audio
    del sr
    del mfccs
    del scaler
    del mfccs_scaled
    del kmeans

    return speaker_labels, duration
    
list_speakers , duration = generate_speaker_labels(filepath_testwav)

#print(len(list_speakers))
# 917952
# sample rate = 22050
# hop length = 512
#print(str( len(list_speakers) / duration))
#86.13282789423312 <- samples per second
#print(str((60*60*2) + (60 * 57) + 39))

#now that we have categorizations, let's perform cutoffs to split the speech (and video) between the speakers:
samples_per_sec = 1.0 * len(list_speakers) / duration
list_cutoffs = []
current_speaker = list_speakers[0]
temp_dict = {}
temp_dict['speaker'] = current_speaker
temp_dict['start_index'] = 0
for i,speaker in enumerate(list_speakers):
    if speaker == temp_dict['speaker'] and (i != (len(list_speakers) - 1)):
        continue
    temp_dict['end_index'] = i - 1
    list_cutoffs.append(temp_dict)
    temp_dict = {}
    temp_dict['speaker'] = speaker
    temp_dict['start_index'] = i

from pydub import AudioSegment

audio = AudioSegment.from_wav(filepath_testwav)


for  idx,dict_tim in enumerate(list_cutoffs):
    start = int((dict_tim['start_index'] / samples_per_sec) * 1000) #pydub works in millisec
    end = int((dict_tim['end_index'] / samples_per_sec) * 1000) #pydub works in millisec
    audio_chunk=audio[start:end]
    audio_chunk.export( ENV_FOLDER_DATA_PROC + '568\\' + str(end) + '-'  + str(dict_tim['speaker'])  + ".wav", format="wav")


UsageError: Line magic function `%%script` not found.


In [59]:
##Appendix 2##
# Failed Speaker Diarization 2
%%script echo skipping appendix

#perform speaker diarization (lingo for "speaker recognition")
#attempt 2
#This is using old code thus will not run
#https://picovoice.ai/blog/speaker-diarization-in-python/
#https://speechbrain.github.io/ 
#https://colab.research.google.com/drive/1nMKHOTTROwQitOXQEYq35lvv7nyTOlpe?usp=sharing
from simple_diarizer.diarizer import Diarizer

diar = Diarizer(
        embed_model='ecapa', # supported types: ['xvec', 'ecapa']
        cluster_method='sc', # supported types: ['ahc', 'sc']
        window=1.5, # size of window to extract embeddings (in seconds)
        period=0.75 # hop of window (in seconds)
    )
segments = diar.diarize(filepath_testwav, 
                        num_speakers=None,
                        threshold=1e-1,
                        outfile=ENV_FOLDER_DATA_PROC + '568\\segment.txt')

UsageError: Line magic function `%%script` not found.


In [None]:
# Appendix 3 - code to transcribe text

# Import the required libraries
import speech_recognition as sr  # Library for speech recognition
import os  # Library for interacting with the operating system
from pydub import AudioSegment  # Library for working with audio files
from pydub.silence import split_on_silence  # Function for splitting audio files based on silence

#https://stackoverflow.com/questions/65489705/transcribing-mp3-to-text-python-riff-id-error
recognizer = sr.Recognizer()

def transcribe_large_audio_file(prefix,path_mp3):
    """
    Split audio into chunks and apply speech recognition
    """
    # Load audio file with pydub
    audio = AudioSegment.from_mp3(path_mp3)
    # Split audio at silent parts with duration of 700ms or more and obtain chunks
    audio_chunks = split_on_silence(audio, min_silence_len=600, silence_thresh=audio.dBFS-14, keep_silence=600)

    # Create a directory to store audio chunks
    chunks_dir = ENV_FOLDER_DATA_PROC + prefix
    if not os.path.isdir(chunks_dir):
        os.mkdir(chunks_dir)

    full_text = ""
    failed_attempts = 0
    # Process each audio chunk
    for i, chunk in enumerate(audio_chunks, start=1):
        # Save chunk in the directory
        chunk_file_name = os.path.join(chunks_dir, f"chunk{i}.wav")
        chunk.export(chunk_file_name, format="wav")
        # Recognize audio from the chunk
        with sr.WavFile(chunk_file_name) as src:
            listened_audio = recognizer.listen(src)
            # Convert audio to text
            try:
                text = recognizer.recognize_whisper(listened_audio)
            except Exception  as e:
                failed_attempts += 1
                print(e)
            else:
                failed_attempts = 0
                text = f"{text.capitalize()}. "
                print(chunk_file_name, ":", text)
                full_text += text
    # Return the transcription for all chunks
    return full_text

def split_and_transcribe(prefix,filepath_mp3):
    # Define the output directory
    output_dir = ENV_FOLDER_DATA_PROC + prefix

    # Create the output directory if it does not exist
    os.makedirs(output_dir, exist_ok=True)

    # Iterate through all .mp3 files in the directory and transcribe them
    with open(os.path.join(output_dir, '568.txt'), 'w') as result_file:
        print(f"Processing {filepath_mp3}")
        try:
            # Transcribe the audio file
            transcription = transcribe_large_audio_file(prefix,filepath_mp3)
        except LookupError as error:
            # If there is an error, skip the file and continue with the next one
            print(f"Error on {filepath_mp3} due to: {error}")
        # Save the transcription to a text file with the same name as the audio file
        txt_file_path = os.path.join(output_dir, f"{os.path.splitext(filepath_mp3)[0]}.txt")
        with open(txt_file_path, 'w', encoding="utf-8") as txt_file:
            txt_file.write(transcription)
        # Print the transcription and the path to the saved text file
        print(transcription)
        print(f"Transcription saved to {txt_file_path}")
        # Save the transcription to the result


split_and_transcribe('568',filepath_testmp3)
#splitting file into pieces
#https://stackoverflow.com/questions/67334379/cut-mp4-in-pieces-python