# Spoken language detection

## 1. Load and inspect a sample audio file

In [1]:
import os
import random
import numpy as np
import tensorflow as tf
import librosa
import matplotlib.pyplot as plt
import pandas as pd
import IPython

In [2]:
PL_LANG_FILE = os.path.join('languages_audio', 'languages_audio_wav', 'pl', 'clips', 'common_voice_pl_20547814.wav')
PL_LANG_FILE

In [3]:

file_content = tf.io.read_file(PL_LANG_FILE) # Load encoded wav file
type(file_content)
   

In [4]:
wav, sample_rate = tf.audio.decode_wav(file_content, desired_channels=1)

In [5]:
wav

In [6]:
sample_rate.numpy() # 48 kHz sampling rate; it needs to be changed (resampled)

In [7]:
wav = tf.squeeze(wav, axis=-1) # axis=-1 removes the last dimension, which has size 1 (single channel)

In [8]:
wav

In [9]:
wav = librosa.resample(wav.numpy(), orig_sr=sample_rate.numpy(), target_sr=16_000)

In [10]:
wav = tf.convert_to_tensor(wav, dtype=tf.float32)

In [11]:
wav_example = wav
wav_example

In [12]:
plt.plot(wav)

In [13]:
IPython.display.Audio(PL_LANG_FILE)

## 2. Create Dataset

In [14]:
PL_DIR = os.path.join('languages_audio', 'languages_audio_wav', 'pl', 'clips')
PL_DIR

In [15]:
# Create a TensorFlow dataset listing every .wav clip in PL_DIR.
# os.path.join keeps the glob pattern portable; the previous '\*.wav' literal
# hard-coded the Windows separator and relied on the invalid '\*' escape.
pl_dataset = tf.data.Dataset.list_files(os.path.join(PL_DIR, '*.wav'))
pl_dataset

In [16]:
pl_dataset.as_numpy_iterator().next()

In [17]:
pl_dataset.as_numpy_iterator().next()

In [18]:
df = pd.read_csv(os.path.join('languages_audio', 'languages_audio_mp3', 'pl', 'validated.tsv'), sep='\t')
df

In [19]:
df[df['gender'] == 'female_feminine']

In [20]:
df[df['gender'] == 'male_masculine']

In [21]:
df.groupby('sentence_id').count()

In [22]:
df['sentence_id'].value_counts()

In [23]:
df['sentence_id'].nunique()

In [24]:
df['client_id'].value_counts().sort_values(ascending=True)

In [25]:
df[['sentence_id', 'client_id']].groupby(by='sentence_id').count().sort_values(by='client_id', ascending=False)


In [26]:
df[['client_id', 'sentence_id']].sort_values(by='client_id', ascending=False)

In [27]:
df[['client_id', 'sentence_id']].sort_values(by='client_id', ascending=False)

In [28]:
df[df['client_id'] == 'ffffc00a5ed69f08a486837064ec2caeff21fe06264c3dd733f633fb6c2ae9aeb561a5a6f43e000554d4b3cc171644ba4ce971177af1cb54f9bd8cc153e71a5c']

In [29]:
# # Check whether client_id and sentence_id are duplicated (i.e. whether a single client has different sentences)
# 
# for client_id in df['client_id'].unique():
#     sentence_count = df['sentence_id'][df['client_id']  == client_id].count()
#     client_id_count = df['client_id'][df['client_id']  == client_id].value_counts().iloc[0]
#     if sentence_count != client_id_count:
#         print(True)


In [30]:
df['sentence_id'][df['client_id']  == 'ffffc00a5ed69f08a486837064ec2caeff21fe06264c3dd733f633fb6c2ae9aeb561a5a6f43e000554d4b3cc171644ba4ce971177af1cb54f9bd8cc153e71a5c'].count()

In [31]:
df['client_id'][df['client_id']  == 'ffffc00a5ed69f08a486837064ec2caeff21fe06264c3dd733f633fb6c2ae9aeb561a5a6f43e000554d4b3cc171644ba4ce971177af1cb54f9bd8cc153e71a5c'].value_counts()

In [32]:
# TODO: Take the validated.tsv file and split it into men and women (two separate datasets) [DONE].
# Then fill each of the sets (first test, then validation, then training) with client_ids,
# starting from the ones with the fewest occurrences. In both datasets, align the duration of
# every sample (first trim around the centre, then pad with zeros), convert it to a spectrogram
# and only then add it (end of the function).
# A second function will augment random data (it randomly picks a function that transforms the
# data) when the women's and men's sets do not reach the requested number of samples.
# The output should be a tensor of spectrogram samples.
# Finally, do this for every folder (language). Only then add the label as a one-hot encoding.

## Load data:

In [33]:
df = pd.read_csv(os.path.join('languages_audio', 'languages_audio_mp3', 'pl', 'validated.tsv'), sep='\t', usecols=['client_id', 'path', 'sentence_id', 'gender', 'locale'])

## Change file extensions from .mp3 to .wav

In [34]:
# Only the 'path' column holds file names, so rewrite just that column.
# regex=False treats '.mp3' as a literal string (as a regular expression the
# '.' would match any character), and limiting the replacement to 'path'
# avoids applying the .str accessor to unrelated columns.
df['path'] = df['path'].str.replace('.mp3', '.wav', regex=False)

## Split into women's and men's sets

In [35]:
df

In [36]:
woman_filter = df['gender'] == 'female_feminine'
man_filter = df['gender'] == 'male_masculine'

In [37]:
df_women = df[woman_filter]
df_women

In [38]:
df_men = df[man_filter]
df_men

## Set the number of samples in the set and split it into train, validation and test

In [39]:
# Target total number of samples across train + validation + test.
SET_SIZE = 50_000
# Upper bound on the number of rows taken from a single client_id.
MAX_NUMBER_OF_CLIENT_ID = 3000
MIN_CLIP_DURATION = 4000 #  minimum clip duration in [ms]

# 60 % of the set is used for training; the remainder is halved between
# validation and test, with test absorbing any rounding leftover so that the
# three sizes always add up to SET_SIZE.
TRAIN_SIZE = int(SET_SIZE * 0.6)
VAL_SIZE = int((SET_SIZE - TRAIN_SIZE) // 2)
TEST_SIZE = SET_SIZE - TRAIN_SIZE - VAL_SIZE

# Sanity check: print the sizes and confirm that they sum to SET_SIZE.
print(f'Train size: {TRAIN_SIZE}')
print(f'Validation size: {VAL_SIZE}')
print(f'Test size: {TEST_SIZE}')
print(f'Sum of sizes: {TRAIN_SIZE + VAL_SIZE + TEST_SIZE} is equal to SET_SIZE {TRAIN_SIZE + VAL_SIZE + TEST_SIZE == SET_SIZE}')

In [40]:
# Attach each clip's duration to the men's rows so that clips can later be
# selected by a minimum duration.
df_clips_duration = pd.read_csv(os.path.join('languages_audio', 'languages_audio_mp3', 'pl', 'clip_durations.tsv'), sep='\t')
# Rename .mp3 to .wav in the clip names so that they can be matched against df_men['path'].
df_clips_duration['clip'] = df_clips_duration['clip'].apply(lambda clip: clip.replace('.mp3', '.wav'))
# Default (inner) merge: rows of df_men without a matching clip entry are dropped.
df_men = df_men.merge(df_clips_duration, left_on='path', right_on='clip')
df_men

In [41]:

# 
# for i in range(len(df_clips_duration)):
#     print(i)
#     row_in_df = df_men[df_men['path']== df_clips_duration.iloc[i]['clip']] 
#     if (df_clips_duration.iloc[i]['duration[ms]'] >= MIN_CLIP_DURATION) and (len(row_in_df) >= 0):
#         rows_over_min_dur_time = pd.concat([rows_over_min_dur_time, row_in_df])
#     else:
#         rows_under_min_dur_time = pd.concat([rows_under_min_dur_time, row_in_df])
# 
# 

rows_over_min_dur_time = df_men[df_men['duration[ms]'] >= MIN_CLIP_DURATION]
rows_under_min_dur_time = df_men[df_men['duration[ms]'] < MIN_CLIP_DURATION]
rows_over_min_dur_time

In [42]:
rows_under_min_dur_time

In [43]:
# Prefer clips that are at least MIN_CLIP_DURATION long. When there are not
# enough of them, top up with the shorter clips (long clips first) and cap the
# result at SET_SIZE rows.
# NOTE(review): the first branch keeps all long clips without capping at
# SET_SIZE - confirm this is intended.
if len(rows_over_min_dur_time) >= SET_SIZE:
    df_men = rows_over_min_dur_time
else:
    df_men = pd.concat([rows_over_min_dur_time, rows_under_min_dur_time], ignore_index=True)[:SET_SIZE]

df_men

In [44]:
df_train = pd.DataFrame()
df_val = pd.DataFrame()
df_test = pd.DataFrame()

In [45]:
df_counted_id = df_men['client_id'].value_counts(ascending=True)
df_counted_id

In [46]:
clients_form_origin_df = df_men[df_men['client_id'] == df_counted_id.index[200]]
clients_form_origin_df1 = df_men[df_men['client_id'] == df_counted_id.index[201]]
clients_form_origin_df1['client_id'].iloc[0]

In [47]:
# df_test = pd.concat([df_test, clients_form_origin_df], ignore_index=True)
# df_test = pd.concat([df_test, clients_form_origin_df1], ignore_index=True)
# df_test

In [48]:
# Assign whole speakers (client_id) to a single split so that no speaker ends
# up in more than one of test / validation / train. Speakers are visited in
# the order of df_counted_id (fewest clips first); each one goes to the first
# split (test, then validation, then train) that still has room for all of
# its clips. Speakers that fit nowhere are skipped.
#
# The former `client_id not in df_test` check was removed: `in` on a DataFrame
# tests column labels, not values, so it was always true and guarded nothing.
# Each client_id is visited exactly once, so no duplicate check is needed.
for client_id, clip_count in df_counted_id.items():
    print(client_id, clip_count)
    # Cap the number of clips contributed by a single speaker.
    client_rows = df_men[df_men['client_id'] == client_id][:MAX_NUMBER_OF_CLIENT_ID]
    n_rows = len(client_rows)
    if n_rows <= TEST_SIZE - len(df_test):
        df_test = pd.concat([df_test, client_rows], ignore_index=True)
    elif n_rows <= VAL_SIZE - len(df_val):
        df_val = pd.concat([df_val, client_rows], ignore_index=True)
    elif n_rows <= TRAIN_SIZE - len(df_train):
        df_train = pd.concat([df_train, client_rows], ignore_index=True)
    

In [49]:
df_test

In [50]:
df_val

In [51]:
df_train

In [52]:
df_train['client_id'].value_counts(ascending=False)

In [53]:
df_val['client_id'].value_counts(ascending=False)

In [54]:
df_test['client_id'].value_counts(ascending=False)

## Audio Preprocessing

In [55]:
df_test_filenames = df_test['path']
df_val_filenames = df_val['path']
df_train_filenames = df_train['path']
df_test_filenames

In [56]:
df_val_filenames

In [57]:
PL_DIR = os.path.join('languages_audio', 'languages_audio_wav', 'pl', 'clips')
PL_DIR

In [58]:
# TEST
df_train_filenames = df_train_filenames.apply(lambda fn: os.path.join(PL_DIR, fn))
df_val_filenames = df_val_filenames.apply(lambda fn: os.path.join(PL_DIR, fn))
df_test_filenames = df_test_filenames.apply(lambda fn: os.path.join(PL_DIR, fn))
df_val_filenames

In [59]:
# Build one dataset of file paths per split.
# The *_filenames Series already contain full paths (PL_DIR was prepended when
# they were last reassigned), so they are used as-is. Joining PL_DIR a second
# time would produce non-existent 'PL_DIR/PL_DIR/<clip>' paths.
pl_dataset_train = tf.data.Dataset.from_tensor_slices(df_train_filenames)
pl_dataset_val = tf.data.Dataset.from_tensor_slices(df_val_filenames)
pl_dataset_test = tf.data.Dataset.from_tensor_slices(df_test_filenames)



In [60]:
# def load_and_align_probes(file_path):
#     wav = load_wav_16k_mono_and_resample(file_path)
#     expected_probes = int((MIN_CLIP_DURATION/1000) * sample_rate)
#     print(expected_probes)
#     current_probes = wav.shape[0]
#     print(current_probes)
#     if expected_probes > current_probes:
#         print("Add zeros")
#         wav = add_zeros(wav, sample_rate)
#     elif expected_probes < current_probes:
#         print("Cut wav")
#         wav = cut_wav(wav, sample_rate)
#     return tf.convert_to_tensor(wav, dtype=tf.float32)

In [61]:
# test_align_tensors = df_test_filenames[:5].apply(lambda filename: load_and_align_probes(filename))
# test_align_tensors

In [62]:
# test_align_tensors.to_list()

In [63]:
# test_align_dataset = tf.data.Dataset.from_tensor_slices(test_align_tensors.to_list())
# test_align_dataset

In [64]:
# test_align_dataset.as_numpy_iterator().next()

In [65]:
# pl_dataset_test.as_numpy_iterator().next()

In [66]:
sample_rate = 48_000

def load_wav_16k_mono_and_resample(filename, fin_sam_rate=16_000):
    """Load a WAV file as a mono, 1-D float32 waveform.

    NOTE: despite its name, this function does not resample. The waveform is
    returned at the sample rate stored in the file, and ``fin_sam_rate`` is
    accepted only for backward compatibility (it is currently unused). The
    librosa-based resampling step is disabled, presumably because it needs
    ``.numpy()``, which is unavailable when the function is traced by
    ``tf.data.Dataset.map``.

    Args:
        filename: Path to the WAV file (a string or a scalar string tensor).
        fin_sam_rate: Intended target sample rate in Hz; currently ignored.

    Returns:
        A 1-D ``tf.float32`` tensor holding the samples of the single channel.
    """
    file_content = tf.io.read_file(filename)
    # desired_channels=1 yields shape (samples, 1). The decoded sample rate is
    # not needed here, and discarding it avoids shadowing the notebook-level
    # `sample_rate` variable.
    wav, _ = tf.audio.decode_wav(file_content, desired_channels=1)
    # decode_wav already returns float32, so no further conversion is needed.
    return tf.squeeze(wav, axis=-1)

In [67]:

clip = load_wav_16k_mono_and_resample(df_train_filenames[0])
print(clip)



In [68]:
plt.plot(clip)

In [69]:
def cut_wav(wav, sample_rate):
    """Centre-crop a waveform to exactly MIN_CLIP_DURATION milliseconds.

    The same amount is removed from both ends; when the surplus is odd, the
    extra sample is removed from the end so that the result has exactly the
    target length (previously an odd surplus left one sample too many).
    Waveforms that are already short enough are returned unchanged.

    Only TensorFlow ops are used, so the function also works when traced
    inside ``tf.data.Dataset.map``.

    Args:
        wav: 1-D waveform (tensor or array-like of floats).
        sample_rate: Sample rate of ``wav`` in Hz.

    Returns:
        A 1-D ``tf.float32`` tensor with at most
        ``MIN_CLIP_DURATION / 1000 * sample_rate`` samples.
    """
    target_len = int(MIN_CLIP_DURATION / 1000 * sample_rate)
    wav = tf.convert_to_tensor(wav, dtype=tf.float32)
    # Clamp at 0 so that short inputs are not sliced with a negative offset.
    surplus = tf.maximum(tf.shape(wav)[0] - target_len, 0)
    start = surplus // 2
    return wav[start:start + target_len]

In [70]:
cut_clip = cut_wav(clip, sample_rate)
cut_clip

In [71]:
def add_zeros(wav, sample_rate):
    """Zero-pad a waveform symmetrically to MIN_CLIP_DURATION milliseconds.

    Padding is split evenly between both ends; when the number of missing
    samples is odd, the extra zero is appended at the end so that the result
    has exactly the target length (previously it came out one sample short).
    Waveforms that are already long enough are returned unchanged instead of
    triggering a negative-padding error.

    Only TensorFlow ops are used, so the function also works when traced
    inside ``tf.data.Dataset.map``.

    Args:
        wav: 1-D waveform (tensor or array-like of floats).
        sample_rate: Sample rate of ``wav`` in Hz.

    Returns:
        A 1-D ``tf.float32`` tensor with at least
        ``MIN_CLIP_DURATION / 1000 * sample_rate`` samples.
    """
    target_len = int(MIN_CLIP_DURATION / 1000 * sample_rate)
    wav = tf.convert_to_tensor(wav, dtype=tf.float32)
    # Clamp at 0: tf.pad rejects negative padding sizes.
    missing = tf.maximum(target_len - tf.shape(wav)[0], 0)
    pad_before = missing // 2
    pad_after = missing - pad_before
    return tf.pad(wav, [[pad_before, pad_after]])


In [72]:
add_zeros(wav[:30000], sample_rate)

In [73]:
def align_probes(wav, sample_rate):
    """Force a waveform to exactly MIN_CLIP_DURATION milliseconds.

    Longer waveforms are centre-cropped and shorter ones are zero-padded on
    both sides, so every returned waveform has the same number of samples.

    The implementation relies on TensorFlow ops only (no Python-level shape
    comparison and no ``.numpy()``), so it behaves the same in eager mode and
    when traced inside ``tf.data.Dataset.map``, where the static length of
    ``wav`` is unknown.

    Args:
        wav: 1-D waveform (tensor or array-like of floats).
        sample_rate: Sample rate of ``wav`` in Hz.

    Returns:
        A 1-D ``tf.float32`` tensor with exactly
        ``int(MIN_CLIP_DURATION / 1000 * sample_rate)`` samples.
    """
    expected_probes = int((MIN_CLIP_DURATION / 1000) * sample_rate)
    wav = tf.convert_to_tensor(wav, dtype=tf.float32)

    # Centre-crop: a no-op when the waveform is not longer than expected.
    surplus = tf.maximum(tf.shape(wav)[0] - expected_probes, 0)
    start = surplus // 2
    wav = wav[start:start + expected_probes]

    # Symmetric zero-padding: a no-op when the waveform is long enough. For an
    # odd number of missing samples the extra zero goes at the end.
    missing = tf.maximum(expected_probes - tf.shape(wav)[0], 0)
    pad_before = missing // 2
    wav = tf.pad(wav, [[pad_before, missing - pad_before]])

    # Record the now-known static length so that downstream ops (batching,
    # STFT) see a fully defined shape.
    return tf.ensure_shape(wav, [expected_probes])
    

In [74]:
# def align_probes(file_path):
#     wav = load_wav_16k_mono_and_resample(file_path)
#     expected_probes = (MIN_CLIP_DURATION/1000) * sample_rate
#     print(expected_probes)
#     current_probes = wav.shape[0]
#     print(current_probes)
#     if expected_probes > current_probes:
#         print("Add zeros")
#         return add_zeros(wav, sample_rate)
#     if expected_probes < current_probes:
#         print("Cut wav")
#         return cut_wav(wav, sample_rate)
    
    

In [75]:
a = align_probes(cut_clip, sample_rate)
a

In [76]:
plt.plot(a)

In [77]:
MIN_CLIP_DURATION/1000* sample_rate

In [78]:
def increase_amplitude(wav, min_increase=2.0, max_increase=5.0):
    """Amplify a waveform by a random gain.

    Args:
        wav: Waveform to amplify.
        min_increase: Lower bound of the random gain factor.
        max_increase: Upper bound of the random gain factor.

    Returns:
        The scaled waveform as a ``tf.float32`` tensor.
    """
    # A single gain factor is drawn per call and applied to the whole clip.
    gain = random.uniform(min_increase, max_increase)
    return tf.convert_to_tensor(wav * gain, dtype=tf.float32)

In [79]:
inc_a = increase_amplitude(a)

In [80]:
plt.plot(inc_a)

In [81]:
def normalize_audio(wav):
    """Peak-normalise a waveform to the range [-1, 1].

    Every sample is divided by the largest absolute amplitude. A completely
    silent (all-zero) waveform is returned as zeros instead of NaNs, because
    ``tf.math.divide_no_nan`` yields 0 wherever the divisor is 0.

    Args:
        wav: Waveform tensor.

    Returns:
        The normalised waveform, same shape as ``wav``.
    """
    max_amplitude = tf.reduce_max(tf.abs(wav))
    return tf.math.divide_no_nan(wav, max_amplitude)

In [82]:
norm_a = normalize_audio(inc_a)
norm_a

In [83]:
plt.plot(norm_a)

In [84]:
def add_noise(wav, noise_level=0.1):
    """Add zero-mean Gaussian noise to a waveform.

    Args:
        wav: Waveform tensor.
        noise_level: Standard deviation of the added noise.

    Returns:
        ``wav`` plus noise of the same shape.
    """
    gaussian = tf.random.normal(tf.shape(wav), mean=0.0, stddev=noise_level)
    noisy_wav = wav + gaussian
    return noisy_wav

In [85]:
noise_a = add_noise(inc_a)
noise_a

In [86]:
plt.plot(noise_a)

In [87]:
def time_masking(wav, max_mask_length=10000):
    """Silence one random contiguous segment of a waveform.

    The waveform is returned untouched when it is ``None``, is not 1-D, or
    has no more than ``max_mask_length`` samples.

    Args:
        wav: 1-D waveform tensor.
        max_mask_length: Exclusive upper bound on the masked segment length,
            in samples.

    Returns:
        The waveform with the selected segment multiplied by zero.
    """
    # Guard: nothing to mask for missing or non-1-D input.
    if wav is None or tf.rank(wav) != 1:
        return wav

    num_samples = tf.shape(wav)[0]
    # Guard: the clip must be longer than the largest possible mask.
    if num_samples <= max_mask_length:
        return wav

    # Draw the segment length first, then a start that keeps it in range.
    mask_length = tf.random.uniform([], maxval=max_mask_length, dtype=tf.int32)
    mask_start = tf.random.uniform([], maxval=num_samples - mask_length, dtype=tf.int32)

    # 1.0 outside [mask_start, mask_start + mask_length), 0.0 inside it.
    positions = tf.range(num_samples)
    inside_segment = tf.logical_and(
        positions >= mask_start,
        positions < mask_start + mask_length,
    )
    mask = tf.cast(tf.logical_not(inside_segment), tf.float32)

    return wav * mask


In [88]:
masked_a = time_masking(inc_a, max_mask_length=50000)
masked_a

In [89]:
plt.plot(masked_a)

In [90]:
def change_pitch(wav, sample_rate=48000, pitch_shift=2):
    """Shift the pitch of a waveform with librosa.

    Requires an eager tensor, because ``wav.numpy()`` is called.

    Args:
        wav: Waveform tensor.
        sample_rate: Sample rate of ``wav`` in Hz.
        pitch_shift: Value passed to librosa as ``n_steps``.

    Returns:
        The pitch-shifted waveform as a ``tf.float32`` tensor.
    """
    shifted = librosa.effects.pitch_shift(
        wav.numpy(),  # librosa works on NumPy arrays
        sr=sample_rate,
        n_steps=pitch_shift,
    )
    return tf.convert_to_tensor(shifted, dtype=tf.float32)


In [91]:
pitch_a = change_pitch(inc_a, sample_rate)
pitch_a

In [92]:
plt.plot(pitch_a)

In [93]:
def speed_up_audio(wav, speed_factor=2):
    """Time-stretch a waveform with librosa (faster for ``speed_factor`` > 1).

    Requires an eager tensor, because ``wav.numpy()`` is called. The result is
    returned as a ``tf.float32`` tensor, like the other augmentation helpers
    in this notebook (it used to be a raw NumPy array).

    Args:
        wav: Waveform tensor.
        speed_factor: Value passed to librosa as ``rate``.

    Returns:
        The time-stretched waveform as a ``tf.float32`` tensor.
    """
    # librosa works on NumPy arrays, so leave TensorFlow for the stretch.
    stretched_wav = librosa.effects.time_stretch(wav.numpy(), rate=speed_factor)
    return tf.convert_to_tensor(stretched_wav, dtype=tf.float32)

In [94]:
speed_a = speed_up_audio(inc_a)
speed_a

In [95]:
plt.plot(speed_a)

In [96]:
def slow_down_audio(wav, speed_factor=0.5):
    """Time-stretch a waveform with librosa (slower for ``speed_factor`` < 1).

    Requires an eager tensor, because ``wav.numpy()`` is called.

    Args:
        wav: Waveform tensor.
        speed_factor: Value passed to librosa as ``rate``.

    Returns:
        The time-stretched waveform as a ``tf.float32`` tensor.
    """
    # Tensor -> NumPy for librosa, then back to a TensorFlow tensor.
    samples = wav.numpy()
    stretched = librosa.effects.time_stretch(samples, rate=speed_factor)
    return tf.convert_to_tensor(stretched, dtype=tf.float32)


In [97]:
slow_a = slow_down_audio(wav)

In [98]:
plt.plot(slow_a)

In [99]:

# Augmentations that may be applied to a randomly chosen sample.
audio_processing_functions = [increase_amplitude, add_noise, time_masking, change_pitch]


def process_random_samples(dataset, num_samples_to_process):
    """Create augmented copies of randomly drawn samples from ``dataset``.

    The dataset is shuffled (buffer of 1000) and repeated, and exactly
    ``num_samples_to_process`` elements are drawn from that single stream.
    Previously a new shuffled iterator was created for every sample, which
    refilled the shuffle buffer each time.

    Args:
        dataset: ``tf.data.Dataset`` yielding waveform tensors.
        num_samples_to_process: Number of augmented samples to create. Zero
            or negative values produce an empty list.

    Returns:
        A list with one augmented waveform per drawn sample; each one is
        produced by a function picked at random from
        ``audio_processing_functions``.
    """
    # Guard explicitly: Dataset.take() treats a negative count as "take
    # everything", which would never end on a repeated dataset.
    if num_samples_to_process <= 0:
        return []

    drawn_samples = (
        dataset.shuffle(buffer_size=1000)
        .repeat()  # allows drawing more samples than the dataset contains
        .take(num_samples_to_process)
    )
    return [
        random.choice(audio_processing_functions)(sample)
        for sample in drawn_samples
    ]

In [100]:
# Map every file path of each split to its decoded waveform tensor.
pl_dataset_train_processed = pl_dataset_train.map(load_wav_16k_mono_and_resample)
pl_dataset_val_processed = pl_dataset_val.map(load_wav_16k_mono_and_resample)
pl_dataset_test_processed = pl_dataset_test.map(load_wav_16k_mono_and_resample)
# 
# pl_dataset_train_processed.as_numpy_iterator().next().shape[0]

In [101]:
processed_samples_train = process_random_samples(pl_dataset_train_processed, TRAIN_SIZE - len(pl_dataset_train))
processed_samples_val = process_random_samples(pl_dataset_val_processed, (VAL_SIZE - len(pl_dataset_val)))
processed_samples_test = process_random_samples(pl_dataset_test_processed, (TEST_SIZE - len(pl_dataset_test)))
# processed_samples_test

In [None]:
align_processed_train = [align_probes(tensor, sample_rate) for tensor in processed_samples_train]
align_processed_val = [align_probes(tensor, sample_rate) for tensor in processed_samples_val]
align_processed_test = [align_probes(tensor, sample_rate) for tensor in processed_samples_test]


In [None]:
processed_samples_train_dataset = tf.data.Dataset.from_tensor_slices(align_processed_train)
processed_samples_val_dataset = tf.data.Dataset.from_tensor_slices(align_processed_val)
processed_samples_test_dataset = tf.data.Dataset.from_tensor_slices(align_processed_test)

In [None]:
processed_samples_test_dataset.as_numpy_iterator()

In [None]:
align_pl_dataset_train_with_processed_samples = pl_dataset_train_processed.map(lambda audio: align_probes(audio, sample_rate))
align_pl_dataset_val_with_processed_samples = pl_dataset_val_processed.map(lambda audio: align_probes(audio, sample_rate))
align_pl_dataset_test_with_processed_samples = pl_dataset_test_processed.map(lambda audio: align_probes(audio, sample_rate))



In [None]:
# Peak-normalise every aligned waveform of each split.
align_pl_dataset_train_with_processed_samples_norm = align_pl_dataset_train_with_processed_samples.map(normalize_audio)
align_pl_dataset_val_with_processed_samples_norm = align_pl_dataset_val_with_processed_samples.map(normalize_audio)
align_pl_dataset_test_with_processed_samples_norm = align_pl_dataset_test_with_processed_samples.map(normalize_audio)

In [None]:
def create_spectrogram(wav):
    """Convert a waveform into a magnitude spectrogram with a channel axis.

    Args:
        wav: Waveform tensor.

    Returns:
        The absolute value of the STFT (frame length 320, frame step 32) with
        a new axis of size 1 inserted at position 2.
    """
    stft = tf.signal.stft(wav, frame_length=320, frame_step=32)
    magnitude = tf.abs(stft)  # drop the phase, keep the magnitude
    return tf.expand_dims(magnitude, axis=2)

In [None]:
# wav_example was resampled to 16 kHz when it was created, so the plot axes
# are derived from that rate and from the real clip length instead of the
# notebook-level sample_rate and a hard-coded 10 s duration.
example_sample_rate = 16_000
example_duration_s = wav_example.shape[0] / example_sample_rate

spectrogram = create_spectrogram(wav_example)
# Drop the channel axis: imshow expects a 2-D array.
spectrogram = tf.squeeze(spectrogram, axis=-1)

plt.figure(figsize=(10, 4))
# Transpose so that time runs along x and frequency along y; the frequency
# axis spans 0 Hz up to the Nyquist frequency.
plt.imshow(tf.transpose(spectrogram), aspect='auto', origin='lower', cmap='viridis', extent=[0, example_duration_s, 0, example_sample_rate / 2])
plt.colorbar(label='Magnitude')
plt.xlabel('Time [s]')
plt.ylabel('Frequency [Hz]')
plt.title('Spectrogram')
plt.show()

In [None]:
# pitch_a was produced at `sample_rate`, so the time axis is derived from the
# real clip length instead of a hard-coded 10 s duration.
pitch_duration_s = pitch_a.shape[0] / sample_rate

spectrogram = create_spectrogram(pitch_a)
print(spectrogram.shape)

# Drop the channel axis: imshow expects a 2-D array.
spectrogram = tf.squeeze(spectrogram, axis=-1)
print(spectrogram.shape)
plt.figure(figsize=(10, 4))
# Transpose so that time runs along x and frequency along y; the frequency
# axis spans 0 Hz up to the Nyquist frequency.
plt.imshow(tf.transpose(spectrogram), aspect='auto', origin='lower', cmap='viridis', extent=[0, pitch_duration_s, 0, sample_rate / 2])
plt.colorbar(label='Magnitude')
plt.xlabel('Time [s]')
plt.ylabel('Frequency [Hz]')
plt.title('Spectrogram')
plt.show()

In [None]:
# Turn every normalised waveform of each split into a magnitude spectrogram.
spectrogram_pl_dataset_train = align_pl_dataset_train_with_processed_samples_norm.map(create_spectrogram)
spectrogram_pl_dataset_val = align_pl_dataset_val_with_processed_samples_norm.map(create_spectrogram)
spectrogram_pl_dataset_test = align_pl_dataset_test_with_processed_samples_norm.map(create_spectrogram)