# Installs

In [None]:
%%capture
!pip install transformers
!pip install pyannote.audio
!pip install git+https://github.com/openai/whisper.git 
!pip install filetype

In [None]:
!sudo apt install ffmpeg

# Imports & paths

In [None]:
# Import necessary library

# For managing audio file
import librosa

# new approach to handle sound files
import soundfile as sf

#Importing Pytorch
import torch


#Importing Wav2Vec
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
# import transfoerms pipeline
from transformers import pipeline

import pandas as pd
import numpy as ntype_of_file
import os
from pyannote.audio import Pipeline
import time

# OpenAIs whisper
import whisper

from tqdm.notebook import tqdm
import filetype
import shutil
import subprocess
import collections
from functools import reduce

In [None]:
# import torchaudio -> session crashes when being imported with torch in one go?
import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T

## Create dirs & paths to data

In [None]:
# Create the working folders relative to the current directory.
# exist_ok=True keeps the cell re-runnable: folders that already exist are left untouched.
for _working_dir in ("chunks", "data", "waveforms", "transformed_files", "csv_out"):
  os.makedirs(_working_dir, exist_ok=True)

In [None]:
# Mount Google Drive at /content/drive/ so files stored there can be read by the cells below.
# The google.colab module is only importable inside a Colab runtime.
from google.colab import drive
drive.mount("/content/drive/")

In [None]:
# Location of the input data handed to the test and run cells further down.
# NOTE(review): left empty here, so it has to be filled in before those cells can work.
# The test cells pass it to chunk_audio / change_encoding, which load it as a single audio file,
# while the run cell passes it to extract, which lists it as a folder and appends file names
# directly to it (so it would need a trailing "/") - confirm which of the two is intended.
path = ""

# Models and tokenizers

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

In [None]:
%%capture
# Importing Wav2Vec pretrained model
tokenizer_FB_wav2vec2 = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model_FB_wav2vec2 = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

# models to choose from:
# tiny, base, small, medium (5GB RAM), large (10GB RAM)
model_OA_whisper = whisper.load_model("base").to(device)

# pyannote diarization pipeline
pipeline_pyannote_diarization = Pipeline.from_pretrained("pyannote/speaker-diarization")

# emotion prediction distilroberta-base (Jochen & Samuel)
emotion_text_classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True, device=0)
# pipeline speech to emotion (fine-tuned wav2vec)
pipeline_ste = pipeline(model="harshit345/xlsr-wav2vec-speech-emotion-recognition", return_all_scores=True, device=0)

# Functions

In [None]:
def split_filepath_path_name_type(file_path):

  '''
  This function:
  0. takes in a filepath that uses "/" as separator
  1. splits the filepath into path_to_file, name_of_file and type_of_file
  2. returns them as a tuple (path_of_file, name_of_file, type_of_file)

  path_of_file keeps its trailing "/" and is "" when file_path has no folder part.
  type_of_file is the text after the LAST dot of the file name (without the dot) and is ""
  when the name has no dot; name_of_file is everything before that dot, so
  "my.talk.mp3" gives ("my.talk", "mp3").
  '''

  # everything up to and including the last "/" is the folder part;
  # rpartition yields two empty strings when there is no "/" at all
  directory, separator, base_name = file_path.rpartition("/")
  path_of_file = directory + separator

  # split on the last dot only, so dots inside the name are kept
  name_of_file, dot, type_of_file = base_name.rpartition(".")
  if not dot:
    # no extension: the whole base name is the name
    return path_of_file, base_name, ""

  return path_of_file, name_of_file, type_of_file

In [None]:
# cuts one audio file into equally long chunks and writes every chunk to disk
def chunk_audio(file_path, output_path, output_type, chunks_in_seconds):

  '''
  This function:
  0. takes in the path of an audio file, an output folder (must end with "/"),
     an output file extension (e.g. ".wav") and the chunk length in seconds
  1. loads the file once with librosa (default loading settings)
  2. writes every complete chunk to
     output_path + <name_of_file>_offset_<start in seconds>_<output_type>

  Audio left over after the last complete chunk is not written.
  Raises ValueError if chunks_in_seconds is not greater than 0.
  '''

  if chunks_in_seconds <= 0:
    raise ValueError("chunks_in_seconds must be greater than 0")

  # only the bare file name is needed to build the chunk names
  _, name_of_file, _ = split_filepath_path_name_type(file_path)

  # decode the file a single time; the chunks below are slices of this array
  y, sr = librosa.load(file_path)
  # keyword arguments: newer librosa releases no longer accept y and sr positionally
  duration = librosa.get_duration(y=y, sr=sr)
  print(duration, int(duration))

  # number of complete chunks that fit into the file
  counter = int(duration // chunks_in_seconds)
  print(counter)

  # cut and save the chunks
  for i in range(counter):
    offset = i * chunks_in_seconds
    first_sample = int(round(offset * sr))
    last_sample = int(round((offset + chunks_in_seconds) * sr))
    # slice along the last axis, which is the time axis of the loaded signal
    chunk = y[..., first_sample:last_sample]
    sf.write(output_path + name_of_file + f"_offset_{offset}_" + output_type, chunk, sr)


In [None]:
def change_encoding(file_path, output_path, output_type):
  '''
  This function:
  0. takes in the path of an audio file, an output folder and an output file extension
  1. loads the audio with librosa (default loading settings)
  2. writes it with soundfile to output_path + <name of the input file> + output_type
  '''

  # only the name part of the split path is used to build the target path
  name_of_file = split_filepath_path_name_type(file_path)[1]

  # 1: decode the input file
  samples, rate = librosa.load(file_path)

  # 2: write it under the new extension
  target = output_path + name_of_file + output_type
  sf.write(target, samples, rate)

In [None]:
def detect_filetype(path, output_path, output_type=".wav"):

  """
  This function:
  0: takes in a path to a folder (must end with "/"), an output folder (must end with "/")
     and the extension to give converted files
  1: iterates over the entries of the folder
  2: if video: lets ffmpeg extract the audio into output_path (an existing target is overwritten)
  3: if audio (not wav): converts it via change_encoding and saves it in output_path
  4: if wav: copies it into output_path
  5: sub-folders, files of unknown type and unsupported types are reported and skipped
  """

  # iterate over folder
  for filename in os.listdir(path):

    filepath = f"{path}{filename}"

    # sub-folders cannot be converted
    if not os.path.isfile(filepath):
      print(f"Not a file, skipped! - {filepath}")
      continue

    # check filetype
    kind = filetype.guess(filepath)

    # no type could be guessed -> there is no mime type to look at
    if kind is None:
      print(f"Filetype is none! - {filepath}")
      continue

    # file mime
    file_mime = kind.mime.split("/")[0]

    print(file_mime, kind.extension)

    # is video? -> extract audio track and save it in output_path
    if file_mime == "video":
      print("is video")
      # file name without its extension
      filename_split = os.path.splitext(filename)[0]
      target = f"{output_path}{filename_split}{output_type}"

      # arguments are passed as a list, so spaces or quotes in file names need no escaping;
      # -y overwrites an existing target instead of waiting for an answer on stdin
      result = subprocess.run(["ffmpeg", "-y", "-i", filepath, target], capture_output=True, text=True, check=False)
      if result.returncode != 0:
        print(f"ffmpeg failed with exit code {result.returncode} - {filepath}")
        print(result.stderr)

    # is audio? -> convert to wav (or copy) into output_path
    elif file_mime == "audio":
      print("is audio")

      # convert audio to wav if not already in that format
      if kind.extension != "wav":
        print("change encoding")
        change_encoding(filepath, output_path, output_type)
        print("changed encoding")

      # if already in wav format: copy as is
      else:
        print("copy wav file")
        shutil.copy(filepath, output_path)

    # is neither audio nor video? -> exclaim and continue
    else:
      print(f"Filetype is not supported! - {filepath}")


In [None]:
def diarize(input_file):

  """
  This function:
  0: takes in a chunked .wav file
  1: hands it to the pyannote diarization pipeline loaded above
  2: returns whatever the pipeline produced
  """
  diarization = pipeline_pyannote_diarization(input_file)
  return diarization

In [None]:
def format_pyannote_instance(instance, round_by):
  """
  This function:
  0: takes in a pyannote diarization instance and the number of decimal places round_by
  1: turns every track yielded by instance.itertracks(yield_label=True) into a tuple
     (start, end, speaker) of strings, with start and end formatted to round_by decimals
  2: returns the list of those tuples
  """
  return [
    # the nested {round_by} sets the precision of the fixed-point format
    (f"{turn.start:.{round_by}f}", f"{turn.end:.{round_by}f}", f"{speaker}")
    for turn, _, speaker in instance.itertracks(yield_label=True)
  ]

In [None]:
def count_words(text, clip_length):

  """
  This function:
  0: takes in an input string and the length of the clip it was spoken in
  1: splits it on whitespace, strips punctuation/symbols from both ends of every token
     and drops tokens that are empty afterwards
  2: returns wordcount, a dict {word: count} of the unique words, wordcount/clip_length

  Counting is case-sensitive. clip_length must not be 0.
  """
  # split() without argument ignores leading, trailing and repeated whitespace,
  # so no empty "words" are produced
  stripped = (word.strip(" ,.!?§$%&/()=+-#@€°*") for word in text.split())
  # tokens that consisted of symbols only (e.g. a lone "-") are not words
  words = [word for word in stripped if word]

  # unique words and how often each occurs
  word_set = dict(collections.Counter(words))
  # calculate number of words
  numb_words = len(words)
  return numb_words, word_set, numb_words/clip_length

In [None]:
def transcribe_audio_FB_wav2vec2(input_file, sample_rate):

  '''
  This function:
  0. takes in the path of an audio file and the sample rate to load it at
  1. loads the audio at that sample rate
  2. tokenizes the data
  3. feeds the data into the previously loaded Wav2Vec2 model
  4. picks the most likely token per step and decodes them into text
  5. returns the transcription (string)
  Note: needs input audio to be of smaller chunk size
  '''

  # 1.
  audio, sr = librosa.load(input_file, sr=sample_rate)
  # 2.
  tokenized_values = tokenizer_FB_wav2vec2(audio, return_tensors="pt").input_values
  # 3. inference only: without no_grad PyTorch would record every operation for a
  #    backward pass that never happens, which costs memory on every chunk
  with torch.no_grad():
    logits = model_FB_wav2vec2(tokenized_values).logits
  # 4.
  prediction = torch.argmax(logits, dim=-1)
  transcription = tokenizer_FB_wav2vec2.batch_decode(prediction)[0]
  # 5.
  return transcription

In [None]:
def transcribe_audio_OA_whisper(input_file):

  """
  This function:
  0: takes in a path to an audio file
  1: feeds it into the chosen whisper model
  2: returns a tuple (decoding result, language probabilities, log-Mel spectrogram)
     - the text is available as <decoding result>.text
     - the most likely language is max(probs, key=probs.get)
  """

  # load the input_file
  audio = whisper.load_audio(input_file)
  # bring the audio to the fixed length the decoder works on; input that already has
  # that length passes through unchanged
  audio = whisper.pad_or_trim(audio)

  # make log-Mel spectrogram and move to the same device as the model
  mel = whisper.log_mel_spectrogram(audio).to(model_OA_whisper.device)

  # detect the spoken language
  _, probs = model_OA_whisper.detect_language(mel)

  # half precision is only requested on a CUDA device; the default (fp16=True) makes
  # decoding fail when the model runs on the CPU
  options = whisper.DecodingOptions(fp16=model_OA_whisper.device.type == "cuda")

  # decode the audio
  return whisper.decode(model_OA_whisper, mel, options), probs, mel

In [None]:
# text-based emotion prediction
def emotion_prediction_text(text):
  """
  This function:
  0: takes in an extracted string
  1: runs the text emotion classifier loaded above on it
  2: returns the classifier output unchanged
  """
  predicted_emotions = emotion_text_classifier(text)
  return predicted_emotions

In [None]:
# signal-level metadata of one audio file
def metadata(path):

  """
  This function:
  0: loads an audio file with torchaudio (normalize=True)
  1: performs pitch frequency detection on the loaded signal
  2: calculates a spectrogram of the loaded signal with torchaudio's default settings
  3: returns both as a tuple (pitch, spectrogram)
  """
  waveform, sample_rate = torchaudio.load(path, normalize=True)

  # pitch detection, see:
  # https://pytorch.org/tutorials/beginner/audio_feature_extractions_tutorial.html
  pitch_track = F.detect_pitch_frequency(waveform, sample_rate)

  # spectrogram, see:
  # https://pytorch.org/audio/stable/tutorials/audio_feature_extractions_tutorial.html#sphx-glr-tutorials-audio-feature-extractions-tutorial-py
  # https://en.wikipedia.org/wiki/Spectrogram
  to_spectrogram = torchaudio.transforms.Spectrogram()
  spectrogram = to_spectrogram(waveform)

  return pitch_track, spectrogram

In [None]:
# counts how often the given ads occur in a clip
def admention(ads, text):

  """
  This function:
  0: takes in a collection of ads and the words of the current clip (an iterable of tokens)
  1: counts every token and keeps those that are contained in ads
  2: returns a list of tuples [(ad_1, ad_1_count), .., (ad_n, ad_n_count)],
     ordered by first occurrence in text; ads that never occur are not listed
  """

  occurrences = collections.Counter(text)

  mentions = []
  for token, count in occurrences.items():
    if token in ads:
      mentions.append((token, count))
  return mentions

def process_string(text):

  """
  This function:
  0: takes in a string
  1: converts it to lower case
  2: removes spaces, the punctuation marks . : , ; ! ? and digits from both ends
     (characters inside the string are kept) and returns the result
  """
  unwanted_edge_chars = " .:,;!?1234567890"
  lowered = text.lower()
  return lowered.strip(unwanted_edge_chars)

In [None]:
# note: first draft, meant to incorporate more functionality over time if useful and condense previous code
# class to pre-process data and create a dataset instance
class Dataset(torch.utils.data.Dataset):
  """
  Chunks every file of a folder on construction and gives access to the chunk file paths.

  filepath:    folder with the input files, must end with "/"
  path_chunks: folder the chunks are written to, must end with "/"; every input file gets
               its own sub-folder named after the file name up to its first dot
  chunk_size:  chunk length in seconds, handed on to chunk_audio
  """

  def __init__(self, filepath, path_chunks, chunk_size):
    self.filepath = filepath
    self.path_chunks = path_chunks
    self.chunk_size = chunk_size

    # iterate over files in folder
    for filename in os.listdir(filepath):

      # one sub-folder per input file, containing the chunks of that file
      new_dir = self.path_chunks + filename.split(".")[0]
      os.makedirs(new_dir, exist_ok=True)

      # chunks the file and exports each chunk as a .wav file into the sub-folder
      chunk_audio(filepath + filename, f"{new_dir}/" , ".wav", chunk_size)

  def _chunk_paths(self):
    # Sorted paths of all files found in the sub-folders of path_chunks.
    paths = []
    for subfolder in os.listdir(self.path_chunks):
      folder = self.path_chunks + subfolder
      # plain files lying directly in path_chunks are not chunk folders
      if not os.path.isdir(folder):
        continue
      for filename in os.listdir(folder):
        paths.append(folder + "/" + filename)
    return sorted(paths)

  def __getitem__(self, index=None):
    # Without an index (the way extract calls it) all chunk paths are returned as a sorted
    # list; with an index only the path at that position is returned, which is what
    # indexing (dataset[i]) passes in.
    paths = self._chunk_paths()
    if index is None:
      return paths
    return paths[index]

  def __len__(self):
    # returns the number of total chunks the dataset contains
    return len(self._chunk_paths())
# Builds a dataset as soon as this cell runs: every file in the Drive folder below is cut into
# 30 s chunks that are written to sub-folders of /content/chunks4/.
# NOTE(review): extract creates its own Dataset, and this module-level instance is not used by
# any other visible cell - confirm whether it is still needed, since it makes defining the class
# depend on the Drive folder being present.
dataset = Dataset(filepath="/content/drive/My Drive/Colab Notebooks/her/data/ger/", path_chunks="/content/chunks4/", chunk_size=30.0)

In [None]:
def extract(path, path_chunks, csv_out_path, csv_name, sample_rate, chunk_length, round_by, ads):

  """
  This function:
  0: chunks every file in path into pieces of chunk_length seconds (written below path_chunks)
  1: runs transcription (wav2vec2, whisper), translation to English if needed, ad-mention
     counting, diarization, audio and text emotion prediction, word counts and
     pitch/spectrogram extraction on every chunk
  2: writes four csv files to csv_out_path: <csv_name>.csv, <csv_name>_emotion.csv,
     <csv_name>_admentions.csv and <csv_name>_aggregated_data.csv (the three merged on filename)

  path:         folder with the input files, must end with "/"
  path_chunks:  folder for the chunks, must end with "/"
  csv_out_path: folder for the csv files, must end with "/"
  csv_name:     file name stem of the csv files
  sample_rate:  sample rate used for the wav2vec2 transcription
  chunk_length: chunk length in seconds, also used as clip length for words per second
  round_by:     decimal places of the diarization start/end times
  ads:          ad names to count; they are compared in the form process_string gives them,
                and the ad-mention columns are named after that form
  """

  df = pd.DataFrame(columns=["filename",
                             "transcribed_FB_wav2vec2_text",
                             "transcribe_OA_whisper_lang",
                             "transcribed_OA_whisper_X_to_X",
                             "transcribed_OA_whisper_X_to_en",
                             "transcribe_OA_whisper_mel",
                             "pyannote_diarization_start",
                             "pyannote_diarization_end",
                             "pyannote_diarization_speaker",
                             "speaker_detected",
                             "wordcount",
                             "unique_words",
                             "words_per_s",
                             "pitch",
                             "spectogram",
                             "duration_in_s",
                             "sample_rate",
                             "size_kB"])

  df_emotion = pd.DataFrame(columns=["filename",
                                     "transcribed_OA_whisper",
                                     "anger_text",
                                     "disgust_text",
                                     "fear_text",
                                     "joy_text",
                                     "neutral_text",
                                     "sadness_text",
                                     "surprise_text",
                                     "disgust_audio",
                                     "fear_audio",
                                     "happiness_audio",
                                     "sadness_audio",
                                     "anger_audio"])

  # ads in the form they are compared in; dict.fromkeys drops duplicates but keeps the order
  normalized_ads = list(dict.fromkeys(process_string(ad) for ad in ads))
  # list concatenation: list.append returns None and would leave the frame without columns
  df_admentions = pd.DataFrame(columns=["filename"] + normalized_ads)

  # create a Dataset instance; it chunks the files using the requested chunk length
  dataset = Dataset(filepath=path, path_chunks=path_chunks, chunk_size=chunk_length)
  # called without an index the dataset returns all chunk paths
  item_list = dataset.__getitem__()
  print(item_list)

  # iterate over the chunks
  for index, item_path in enumerate(tqdm(item_list)):

    filename = item_path.split("/")[-1]

    # size of file in kB
    file_size = os.path.getsize(item_path)/1024

    # metadata extraction
    # pitch: pitch-freq
    # spect: spectogram
    pitch, spect = metadata(item_path)

    # extract text, lang using different models
    # FAIRs wav2vec2 960H
    transcribed_FB_wav2vec2_text = transcribe_audio_FB_wav2vec2(item_path, sample_rate)
    # OpenAIs whisper
    transcribed_OA_whisper_text, lang, mel = transcribe_audio_OA_whisper(item_path)
    # most likely language of the chunk
    detected_lang = max(lang, key=lang.get)

    # the filename is written first so that the row exists before any other cell is set
    df.at[index, "filename"] = filename
    df_emotion.at[index, "filename"] = filename
    df_admentions.at[index, "filename"] = filename

    # English text of the chunk: the transcription itself, or a translation of it
    if detected_lang == "en":
      text_en = transcribed_OA_whisper_text.text
    else:
      # language names the language spoken in the audio, so the detected one is passed
      translate_options = dict(task="translate", language=detected_lang, beam_size=5, best_of=10)
      text_en = model_OA_whisper.transcribe(item_path, **translate_options)["text"]
      df.at[index, "transcribed_OA_whisper_X_to_en"] = text_en

    # ad mentions: compare the normalised words of the English text with the normalised ads
    words = text_en.split(" ")
    print(words)
    # returns [(ad, count)]
    ad_list = admention(ads=normalized_ads, text=[process_string(word) for word in words])
    for (ad, number) in ad_list:
      df_admentions.at[index, ad] = number

    # diarization using pyannote
    # creates a pyannote diarization instance of the current file and returns it
    dia_instance = diarize(item_path)
    # returns a list of triples: (start, end, speaker), type: str
    diarized_list = format_pyannote_instance(dia_instance, round_by)

    # emotion prediction from audio
    # NOTE(review): the scores are read by position - presumably the pipeline returns the labels
    # in the order disgust, fear, happiness, sadness, anger; confirm against the model output.
    emotion_preds_audio = pipeline_ste(item_path)
    df_emotion.at[index, "disgust_audio"] = emotion_preds_audio[0]["score"]
    df_emotion.at[index, "fear_audio"] = emotion_preds_audio[1]["score"]
    df_emotion.at[index, "happiness_audio"] = emotion_preds_audio[2]["score"]
    df_emotion.at[index, "sadness_audio"] = emotion_preds_audio[3]["score"]
    df_emotion.at[index, "anger_audio"] = emotion_preds_audio[4]["score"]

    # wordcount, unique words, words/s
    wordcount, uniques, words_per_s = count_words(text=text_en, clip_length=chunk_length)

    # emotion prediction from text (english)
    # NOTE(review): read by position as well - presumably anger, disgust, fear, joy, neutral,
    # sadness, surprise; confirm against the model output.
    text = emotion_prediction_text(text_en)
    df_emotion.at[index, "transcribed_OA_whisper"] = text_en
    df_emotion.at[index, "anger_text"] = text[0][0]["score"]
    df_emotion.at[index, "disgust_text"] = text[0][1]["score"]
    df_emotion.at[index, "fear_text"] = text[0][2]["score"]
    df_emotion.at[index, "joy_text"] = text[0][3]["score"]
    df_emotion.at[index, "neutral_text"] = text[0][4]["score"]
    df_emotion.at[index, "sadness_text"] = text[0][5]["score"]
    df_emotion.at[index, "surprise_text"] = text[0][6]["score"]

    # write data into df
    df.at[index, "transcribed_FB_wav2vec2_text"] = transcribed_FB_wav2vec2_text
    df.at[index, "transcribed_OA_whisper_X_to_X"] = transcribed_OA_whisper_text.text
    df.at[index, "transcribe_OA_whisper_lang"] = detected_lang
    df.at[index, "transcribe_OA_whisper_mel"] = mel

    # write diarization data into df
    if diarized_list:
      try:
        df.at[index, "pyannote_diarization_start"] = [item[0] for item in diarized_list]
        df.at[index, "pyannote_diarization_end"] = [item[1] for item in diarized_list]
        df.at[index, "pyannote_diarization_speaker"] = [item[2] for item in diarized_list]
        df.at[index, "speaker_detected"] = "yes"
      except Exception as error:
        # storing the lists is best effort: the chunk is still exported, and the reason
        # is printed instead of being swallowed
        df.at[index, "speaker_detected"] = "no"
        print("No speaker detected", error)
    else:
      # diarization returned no speaker turn at all
      df.at[index, "speaker_detected"] = "no"
      print("No speaker detected", )

    df.at[index, "wordcount"] = wordcount
    df.at[index, "unique_words"] = uniques
    df.at[index, "words_per_s"] = words_per_s
    df.at[index, "pitch"] = pitch
    df.at[index, "spectogram"] = str(spect)
    df.at[index, "duration_in_s"] = chunk_length
    df.at[index, "sample_rate"] = sample_rate
    df.at[index, "size_kB"] = file_size

  # export df
  df.to_csv(csv_out_path + csv_name + ".csv")

  # export emotion df
  df_emotion.to_csv(csv_out_path + csv_name + "_emotion" + ".csv")

  # export admentions df
  df_admentions.to_csv(csv_out_path + csv_name + "_admentions" + ".csv")

  # uber-df
  dfs = [df, df_emotion, df_admentions]
  # merge dfs iteratively
  # note: reduce reduces the iterable iteratively to a single variable/value, applying left -> right
  d_merged = reduce(lambda left, right: pd.merge(left, right, on=["filename"], how="outer"), dfs).fillna("na")
  # export uber-df
  d_merged.to_csv(csv_out_path + csv_name + "_aggregated_data" + ".csv")

# Tests

In [None]:
# time one run of chunk_audio on `path`, writing 30 s chunks to /content/chunks/
start = time.time()
chunk_len = 30.0
chunk_audio(file_path=path, output_path="/content/chunks/", output_type=".wav", chunks_in_seconds=chunk_len)
end = time.time()
# elapsed wall-clock time in seconds
print(round(end - start, 3))

In [None]:
# time one run of change_encoding on `path`, writing the result to /content/transformed_files/
start = time.time()

change_encoding(file_path=path, output_path="/content/transformed_files/", output_type=".wav")
end = time.time()
# elapsed wall-clock time in seconds
print(round(end - start, 3))

# Run

In [None]:
# full extraction run over everything in `path`, timed
start = time.time()

extract(
  path=path,
  path_chunks="/content/chunks/",
  csv_out_path="/content/csv_out/",
  csv_name="ASA_extraction_V1",
  sample_rate=16000,
  chunk_length=30.0,
  round_by=2,
  ads=[],
)

end = time.time()
# elapsed wall-clock time in seconds
print(round(end - start, 3))

In [None]:
# scratch check: lower-casing every entry of a list
l = ["A", "B"]
l2 = [entry.lower() for entry in l]
print(l2)