# Spoken language detection

## 1. 

In [72]:
import os
import random
import numpy as np
import tensorflow as tf
import librosa
import matplotlib.pyplot as plt
import pandas as pd
import IPython

In [85]:
# Directory of the 'pl' clips inside the WAV dataset tree.
PL_DIR = os.path.join('languages_audio', 'languages_audio_wav', 'pl', 'clips')
PL_DIR

In [86]:
# Build a dataset of the WAV file paths found in PL_DIR.
# os.path.join keeps the glob portable: the former '\*.wav' literal relied on
# an invalid escape sequence and only matched files with Windows separators.
pl_dataset = tf.data.Dataset.list_files(os.path.join(PL_DIR, '*.wav'))
pl_dataset

In [87]:
# Peek at one file path; a fresh iterator is created for this call.
pl_dataset.as_numpy_iterator().next()

In [88]:
# Peek again with another fresh iterator.
# NOTE(review): list_files presumably shuffles by default, so the path may differ — confirm.
pl_dataset.as_numpy_iterator().next()

In [89]:
# Load the tab-separated 'validated.tsv' metadata table for the 'pl' clips.
df = pd.read_csv(os.path.join('languages_audio', 'languages_audio_mp3', 'pl', 'validated.tsv'), sep='\t')
df

In [90]:
# Rows whose gender column equals 'female_feminine'.
df[df['gender'] == 'female_feminine']

In [91]:
# Rows whose gender column equals 'male_masculine'.
df[df['gender'] == 'male_masculine']

In [92]:
# Per sentence_id, the count of non-null values in every other column.
df.groupby('sentence_id').count()

In [93]:
# How many rows each sentence_id has.
df['sentence_id'].value_counts()

In [94]:
# Number of distinct sentence_id values.
df['sentence_id'].nunique()

In [95]:
# Rows per client_id, smallest counts first.
df['client_id'].value_counts().sort_values(ascending=True)

In [96]:
# Number of client_id entries recorded for each sentence_id, largest first.
df[['sentence_id', 'client_id']].groupby(by='sentence_id').count().sort_values(by='client_id', ascending=False)


In [97]:
# client_id / sentence_id pairs ordered by client_id, descending.
df[['client_id', 'sentence_id']].sort_values(by='client_id', ascending=False)

In [98]:
# Same view as the previous cell: client_id / sentence_id pairs, client_id descending.
df[['client_id', 'sentence_id']].sort_values(by='client_id', ascending=False)

In [99]:
# All rows belonging to one specific client_id.
df[df['client_id'] == 'ffffc00a5ed69f08a486837064ec2caeff21fe06264c3dd733f633fb6c2ae9aeb561a5a6f43e000554d4b3cc171644ba4ce971177af1cb54f9bd8cc153e71a5c']

In [100]:
# # Check that client_id and sentence_id are not duplicated (i.e. whether one client has different sentences)
# 
# for client_id in df['client_id'].unique():
#     sentence_count = df['sentence_id'][df['client_id']  == client_id].count()
#     client_id_count = df['client_id'][df['client_id']  == client_id].value_counts().iloc[0]
#     if sentence_count != client_id_count:
#         print(True)


In [101]:
# Number of non-null sentence_id values recorded for that client_id.
df['sentence_id'][df['client_id']  == 'ffffc00a5ed69f08a486837064ec2caeff21fe06264c3dd733f633fb6c2ae9aeb561a5a6f43e000554d4b3cc171644ba4ce971177af1cb54f9bd8cc153e71a5c'].count()

In [102]:
# Row count for the same client_id, obtained through value_counts.
df['client_id'][df['client_id']  == 'ffffc00a5ed69f08a486837064ec2caeff21fe06264c3dd733f633fb6c2ae9aeb561a5a6f43e000554d4b3cc171644ba4ce971177af1cb54f9bd8cc153e71a5c'].value_counts()

In [103]:
# TODO: Take validated.tsv and split it into men and women (two separate datasets). Then fill each set (first test, then validation, then training) with client_ids, starting from the ones with the fewest occurrences. In both datasets, align the duration of every sample (first trim from the middle, then pad with zeros), convert it to a spectrogram and only then add it (end of the function). A second function will augment random data (it randomly picks a function that transforms the data) if the women/men sets do not reach the requested number of samples. The output should be a tensor of spectrogram samples. Finally, do this for every folder (language). Only then add the label as a one-hot encoding.

## Load data:

In [104]:
# Reload the metadata, this time restricted to the listed columns.
df = pd.read_csv(os.path.join('languages_audio', 'languages_audio_mp3', 'pl', 'validated.tsv'), sep='\t', usecols=['client_id', 'path', 'sentence_id', 'gender', 'locale'])

## Rename the .mp3 file names to .wav

In [105]:
# Point the 'path' column at the .wav file names.
# Only 'path' holds file names, so the other columns are left untouched, and
# regex=False makes '.mp3' a literal match (as a regex, '.' would match any
# character on pandas versions where regex defaults to True).
df['path'] = df['path'].str.replace('.mp3', '.wav', regex=False)

## Split into female and male sets

In [106]:
df

In [107]:
# Boolean row masks, one per gender label of interest.
woman_filter = df['gender'].eq('female_feminine')
man_filter = df['gender'].eq('male_masculine')

In [108]:
# Rows selected by the female mask.
df_women = df.loc[woman_filter]
df_women

In [109]:
# Rows selected by the male mask.
df_men = df.loc[man_filter]
df_men

## Set the number of samples in the set and split it into train, validation and test

In [110]:
# Target number of samples in the final set.
SET_SIZE = 1_000
# Upper bound on the rows taken from a single client_id.
MAX_NUMBER_OF_CLIENT_ID = 3000
# Target clip length in milliseconds.
MIN_CLIP_DURATION = 4000

# 60 % for training; the remainder is halved between validation and test,
# with the test set absorbing any rounding leftover.
TRAIN_SIZE = int(0.6 * SET_SIZE)
VAL_SIZE = (SET_SIZE - TRAIN_SIZE) // 2
TEST_SIZE = SET_SIZE - (TRAIN_SIZE + VAL_SIZE)

print(f'Train size: {TRAIN_SIZE}')
print(f'Validation size: {VAL_SIZE}')
print(f'Test size: {TEST_SIZE}')
print(f'Sum of sizes: {TRAIN_SIZE + VAL_SIZE + TEST_SIZE} is equal to SET_SIZE {TRAIN_SIZE + VAL_SIZE + TEST_SIZE == SET_SIZE}')

In [111]:
# Attach every clip's duration to the male subset so the clips can be
# filtered by length afterwards.
df_clips_duration = pd.read_csv(
    os.path.join('languages_audio', 'languages_audio_mp3', 'pl', 'clip_durations.tsv'),
    sep='\t',
)
# Rename the .mp3 clip names to .wav before joining on df_men's 'path' column.
df_clips_duration['clip'] = df_clips_duration['clip'].map(
    lambda clip: clip.replace('.mp3', '.wav')
)
df_men = pd.merge(df_men, df_clips_duration, left_on='path', right_on='clip')
df_men

In [112]:

# Partition the clips by duration: at least MIN_CLIP_DURATION vs. shorter.
rows_over_min_dur_time = df_men.loc[df_men['duration[ms]'].ge(MIN_CLIP_DURATION)]
rows_under_min_dur_time = df_men.loc[df_men['duration[ms]'].lt(MIN_CLIP_DURATION)]
rows_over_min_dur_time

In [113]:
rows_under_min_dur_time

In [114]:
# Prefer the long-enough clips on their own when there are at least SET_SIZE
# of them; otherwise top up with shorter clips and cap the result at SET_SIZE.
df_men = (
    rows_over_min_dur_time
    if len(rows_over_min_dur_time) >= SET_SIZE
    else pd.concat(
        [rows_over_min_dur_time, rows_under_min_dur_time],
        ignore_index=True,
    ).iloc[:SET_SIZE]
)

df_men

In [115]:
# Start every split from its own empty frame.
df_train, df_val, df_test = (pd.DataFrame() for _ in range(3))

In [116]:
# Clips per client_id, rarest speakers first; the index holds the client ids.
df_counted_id = df_men['client_id'].value_counts(ascending=True)
df_counted_id

In [117]:
# Exploratory lookup of the rows for the clients at positions 200 and 201 of
# the counted index.
# NOTE(review): the hard-coded positions fail when there are fewer clients — confirm intent.
clients_form_origin_df = df_men[df_men['client_id'] == df_counted_id.index[200]]
clients_form_origin_df1 = df_men[df_men['client_id'] == df_counted_id.index[201]]
clients_form_origin_df1['client_id'].iloc[0]

In [119]:
# Greedy, speaker-disjoint split.
# Clients are visited from the fewest clips to the most. Each client's clips
# (capped at MAX_NUMBER_OF_CLIENT_ID) go, as a whole, to the first of
# test -> validation -> train that still has room; clients that fit nowhere
# are dropped. Every client_id appears exactly once in the value_counts index,
# so a speaker can never end up in two splits. The former
# `client_id not in df_test` test only inspected column labels and was
# therefore always true, so it has been removed.
clips_by_client = df_men.groupby('client_id', sort=False)  # group once, not per client
for client_id, clip_count in df_counted_id.items():
    print(client_id, clip_count)
    client_rows = clips_by_client.get_group(client_id).iloc[:MAX_NUMBER_OF_CLIENT_ID]
    rows_needed = len(client_rows)
    if rows_needed <= TEST_SIZE - len(df_test):
        df_test = pd.concat([df_test, client_rows], ignore_index=True)
    elif rows_needed <= VAL_SIZE - len(df_val):
        df_val = pd.concat([df_val, client_rows], ignore_index=True)
    elif rows_needed <= TRAIN_SIZE - len(df_train):
        df_train = pd.concat([df_train, client_rows], ignore_index=True)
    

In [120]:
df_test

In [121]:
df_val

In [122]:
df_train

In [123]:
# Clips per client_id in the training split, largest first.
df_train['client_id'].value_counts(ascending=False)

In [124]:
# Clips per client_id in the validation split, largest first.
df_val['client_id'].value_counts(ascending=False)

In [125]:
# Clips per client_id in the test split, largest first.
df_test['client_id'].value_counts(ascending=False)

## Audio Preprocessing

In [126]:
# Keep only the clip file names (the 'path' column) of each split.
df_test_filenames, df_val_filenames, df_train_filenames = (
    split['path'] for split in (df_test, df_val, df_train)
)
df_test_filenames

In [127]:
df_val_filenames

In [128]:
# Directory of the 'pl' WAV clips (re-assigned here with the same path components).
PL_DIR = os.path.join('languages_audio', 'languages_audio_wav', 'pl', 'clips')
PL_DIR

In [129]:
# Prefix every file name with PL_DIR to obtain the full clip path.
df_train_filenames, df_val_filenames, df_test_filenames = (
    names.apply(lambda fn: os.path.join(PL_DIR, fn))
    for names in (df_train_filenames, df_val_filenames, df_test_filenames)
)
df_val_filenames

In [130]:
# OLD
# pl_dataset_train = tf.data.Dataset.from_tensor_slices(df_train_filenames.apply(lambda fn: os.path.join(PL_DIR, fn)))
# pl_dataset_val = tf.data.Dataset.from_tensor_slices(df_val_filenames.apply(lambda fn: os.path.join(PL_DIR, fn)))
# pl_dataset_test = tf.data.Dataset.from_tensor_slices(df_test_filenames.apply(lambda fn: os.path.join(PL_DIR, fn)))



In [146]:
# Sample rate used to convert MIN_CLIP_DURATION into a number of samples.
# NOTE(review): presumably the rate of the WAV files on disk (48 kHz) — confirm.
sample_rate = 48_000

def add_zeros(wav, sample_rate):
    """Zero-pad ``wav`` on both sides up to the target clip length.

    The target length is ``MIN_CLIP_DURATION`` (milliseconds) converted to a
    sample count at ``sample_rate``.

    Args:
        wav: 1-D audio signal.
        sample_rate: Samples per second used to compute the target length.

    Returns:
        A float32 tensor of exactly the target length. Input that already
        reaches the target length is returned without padding.
    """
    target_probes = int(MIN_CLIP_DURATION / 1000 * sample_rate)
    # Never request negative padding, which tf.pad rejects.
    missing_probes = max(target_probes - int(wav.shape[0]), 0)
    # Split the padding as evenly as possible. With an odd deficit the extra
    # sample goes to the right; padding both sides by missing // 2 used to
    # leave the clip one sample short of the target.
    pad_left = missing_probes // 2
    pad_right = missing_probes - pad_left
    padded_tensor = tf.pad(wav, [[pad_left, pad_right]])
    return tf.convert_to_tensor(padded_tensor, dtype=tf.float32)

def cut_wav(wav, sample_rate):
    """Crop ``wav`` to the target clip length, keeping its central part.

    The target length is ``MIN_CLIP_DURATION`` (milliseconds) converted to a
    sample count at ``sample_rate``.

    Args:
        wav: 1-D audio signal.
        sample_rate: Samples per second used to compute the target length.

    Returns:
        A float32 tensor of exactly the target length. Input that is not
        longer than the target is returned uncropped.
    """
    target_probes = int(MIN_CLIP_DURATION / 1000 * sample_rate)
    surplus = int(wav.shape[0]) - target_probes
    if surplus <= 0:
        return tf.convert_to_tensor(wav, dtype=tf.float32)
    # Slice by start + length rather than trimming surplus / 2 from each end:
    # with an odd surplus the symmetric trim kept one sample too many.
    start = surplus // 2
    cut_clip = wav[start:start + target_probes]
    return tf.convert_to_tensor(cut_clip, dtype=tf.float32)

def load_wav_16k_mono_and_resample(filename, fin_sam_rate=16_000):
    """Read a WAV file and return its samples as a 1-D float32 tensor.

    NOTE: despite the name, no resampling is performed. ``fin_sam_rate`` is
    not used and the sample rate decoded from the file is discarded.

    Args:
        filename: Path of the WAV file to read.
        fin_sam_rate: Unused.

    Returns:
        The decoded single-channel signal with the channel axis removed.
    """
    raw_bytes = tf.io.read_file(filename)
    audio, _file_sample_rate = tf.audio.decode_wav(raw_bytes, desired_channels=1)
    # Drop the trailing channel axis left by the single-channel decode.
    mono = tf.squeeze(audio, axis=-1)
    return tf.convert_to_tensor(mono, dtype=tf.float32)

def load_and_align_probes(file_path):
    """Load a WAV file and bring it to the target clip length.

    The target length is ``MIN_CLIP_DURATION`` milliseconds at the
    module-level ``sample_rate``. Shorter clips are zero-padded via
    ``add_zeros`` and longer ones are cropped via ``cut_wav``. The target and
    actual lengths, and the action taken, are printed.

    Args:
        file_path: Path of the WAV file to load.

    Returns:
        The aligned signal as a float32 tensor.
    """
    wav = load_wav_16k_mono_and_resample(file_path)
    target_len = int((MIN_CLIP_DURATION / 1000) * sample_rate)
    print(target_len)
    actual_len = wav.shape[0]
    print(actual_len)

    if actual_len < target_len:
        print("Add zeros")
        return add_zeros(wav, sample_rate)
    if actual_len > target_len:
        print("Cut wav")
        return cut_wav(wav, sample_rate)
    # Already the right length.
    return tf.convert_to_tensor(wav, dtype=tf.float32)



In [147]:
train_align_tensors = df_train_filenames.apply(lambda filename: load_and_align_probes(filename))
val_align_tensors = df_test_filenames.apply(lambda filename: load_and_align_probes(filename))
test_align_tensors = df_val_filenames.apply(lambda filename: load_and_align_probes(filename))

type(test_align_tensors)

In [148]:
test_align_tensors.to_list()

In [149]:
train_align_dataset = tf.data.Dataset.from_tensor_slices(train_align_tensors.to_list())
test_align_dataset = tf.data.Dataset.from_tensor_slices(val_align_tensors.to_list())
val_align_dataset = tf.data.Dataset.from_tensor_slices(test_align_tensors.to_list())

test_align_dataset

In [261]:
# First aligned training clip as a NumPy array.
train_align_dataset.as_numpy_iterator().next()

In [196]:
def align_probes(wav, sample_rate):
    """Bring an already-loaded signal to the target clip length.

    The target length is ``MIN_CLIP_DURATION`` milliseconds at
    ``sample_rate``. Shorter signals are zero-padded via ``add_zeros`` and
    longer ones are cropped via ``cut_wav``. The target and actual lengths,
    and the action taken, are printed.

    Args:
        wav: 1-D audio signal.
        sample_rate: Samples per second used to compute the target length.

    Returns:
        The aligned signal, or ``wav`` itself when it already has the target
        length.
    """
    target_len = (MIN_CLIP_DURATION / 1000) * sample_rate
    print(target_len)
    actual_len = wav.shape[0]
    print(actual_len)

    if actual_len < target_len:
        print("Add zeros")
        return add_zeros(wav, sample_rate)
    if actual_len > target_len:
        print("Cut wav")
        return cut_wav(wav, sample_rate)
    return wav
    

In [199]:
# Target clip length expressed in samples (evaluates to a float).
MIN_CLIP_DURATION/1000* sample_rate

In [200]:
def increase_amplitude(wav, min_increase=2.0, max_increase=5.0):
    """Scale ``wav`` by a gain drawn uniformly from the given range.

    Args:
        wav: Audio signal to amplify.
        min_increase: Lower bound of the random gain.
        max_increase: Upper bound of the random gain.

    Returns:
        The amplified signal as a float32 tensor.
    """
    gain = random.uniform(min_increase, max_increase)
    return tf.convert_to_tensor(wav * gain, dtype=tf.float32)

In [203]:
def normalize_audio(wav):
    """Peak-normalise ``wav`` to the range [-1, 1].

    Args:
        wav: Audio signal tensor.

    Returns:
        ``wav`` divided by its largest absolute sample. An all-zero signal is
        returned as zeros instead of NaN.
    """
    max_amplitude = tf.reduce_max(tf.abs(wav))
    # divide_no_nan yields 0 where the divisor is 0, so silent (all-zero)
    # clips no longer turn into NaN.
    return tf.math.divide_no_nan(wav, max_amplitude)

In [206]:
def add_noise(wav, noise_level=0.1):
    """Add zero-mean Gaussian noise to ``wav``.

    Args:
        wav: Audio signal tensor.
        noise_level: Standard deviation of the noise.

    Returns:
        The noisy signal, same shape as ``wav``.
    """
    return wav + tf.random.normal(tf.shape(wav), mean=0.0, stddev=noise_level)

In [209]:
def time_masking(wav, max_mask_length=10000):
    """Zero out one random contiguous stretch of a 1-D signal.

    Args:
        wav: Audio signal tensor.
        max_mask_length: Exclusive upper bound of the mask length, in samples.

    Returns:
        The masked signal. ``wav`` is returned unchanged when it is ``None``,
        is not rank 1, or is not longer than ``max_mask_length``.
    """
    # Leave missing or non-1-D input untouched.
    if wav is None or tf.rank(wav) != 1:
        return wav

    num_probes = tf.shape(wav)[0]
    # Signals no longer than the maximum mask are left untouched as well.
    if num_probes <= max_mask_length:
        return wav

    # Random mask length, then a random start chosen so the mask ends inside
    # the signal.
    mask_length = tf.random.uniform([], maxval=max_mask_length, dtype=tf.int32)
    mask_start = tf.random.uniform([], maxval=num_probes - mask_length, dtype=tf.int32)

    # Ones before and after the masked stretch, zeros inside it.
    tail_length = num_probes - mask_start - mask_length
    keep = tf.concat(
        [tf.ones([mask_start]), tf.zeros([mask_length]), tf.ones([tail_length])],
        axis=0,
    )

    return wav * keep


In [212]:
def change_pitch(wav, sample_rate=48000, pitch_shift=2):
    """Shift the pitch of ``wav`` with librosa.

    Args:
        wav: Audio signal tensor (converted to NumPy for librosa).
        sample_rate: Sample rate passed to librosa as ``sr``.
        pitch_shift: Shift passed to librosa as ``n_steps``.

    Returns:
        The pitch-shifted signal as a float32 tensor.
    """
    shifted = librosa.effects.pitch_shift(wav.numpy(), sr=sample_rate, n_steps=pitch_shift)
    return tf.convert_to_tensor(shifted, dtype=tf.float32)


In [215]:
def speed_up_audio(wav, speed_factor=2):
    """Time-stretch ``wav`` with librosa to change its speed.

    Args:
        wav: Audio signal tensor (converted to NumPy for librosa).
        speed_factor: Stretch rate passed to librosa as ``rate``.

    Returns:
        The stretched signal as a float32 tensor, consistent with the other
        augmentation helpers (it used to return a bare NumPy array).
    """
    wav_np = wav.numpy()
    stretched_wav = librosa.effects.time_stretch(wav_np, rate=speed_factor)
    return tf.convert_to_tensor(stretched_wav, dtype=tf.float32)

In [218]:
def slow_down_audio(wav, speed_factor=0.5):
    """Time-stretch ``wav`` with librosa using a sub-unity default rate.

    Args:
        wav: Audio signal tensor (converted to NumPy for librosa).
        speed_factor: Stretch rate passed to librosa as ``rate``.

    Returns:
        The stretched signal as a float32 tensor.
    """
    stretched = librosa.effects.time_stretch(wav.numpy(), rate=speed_factor)
    return tf.convert_to_tensor(stretched, dtype=tf.float32)


In [221]:

# Augmentations that process_random_samples may pick from.
audio_processing_functions = [increase_amplitude, add_noise, time_masking, change_pitch]


def process_random_samples(dataset, num_samples_to_process):
    """Create augmented copies of randomly chosen dataset elements.

    Each output is produced by drawing one element of ``dataset`` at random
    and applying one randomly chosen function from
    ``audio_processing_functions`` to it.

    Args:
        dataset: Iterable dataset of audio tensors.
        num_samples_to_process: Number of augmented samples to create; zero
            or a negative value yields an empty list.

    Returns:
        A list with ``num_samples_to_process`` augmented samples.

    Raises:
        ValueError: If samples are requested from an empty dataset.
    """
    if num_samples_to_process <= 0:
        return []

    # Materialise the elements once. Building a fresh shuffle pipeline for
    # every draw was slow, and its fixed buffer of 1000 could only ever pick
    # from the beginning of a larger dataset.
    candidates = list(dataset)
    if not candidates:
        raise ValueError('cannot draw augmentation samples from an empty dataset')

    processed_samples = []
    for _ in range(num_samples_to_process):
        random_sample = random.choice(candidates)
        processing_function = random.choice(audio_processing_functions)
        processed_samples.append(processing_function(random_sample))

    return processed_samples

In [223]:
# Generate as many augmented samples as each split is missing to reach its
# target size.
processed_samples_train, processed_samples_val, processed_samples_test = (
    process_random_samples(split, target_size - len(split))
    for split, target_size in (
        (train_align_dataset, TRAIN_SIZE),
        (val_align_dataset, VAL_SIZE),
        (test_align_dataset, TEST_SIZE),
    )
)
processed_samples_test

In [224]:
# Re-align every augmented sample to the target clip length.
align_processed_train, align_processed_val, align_processed_test = (
    [align_probes(tensor, sample_rate) for tensor in samples]
    for samples in (processed_samples_train, processed_samples_val, processed_samples_test)
)
align_processed_train

In [225]:
# Wrap each list of aligned augmented samples in a dataset.
(
    processed_samples_train_dataset,
    processed_samples_val_dataset,
    processed_samples_test_dataset,
) = map(
    tf.data.Dataset.from_tensor_slices,
    (align_processed_train, align_processed_val, align_processed_test),
)

In [226]:
# Create a NumPy iterator over the augmented test samples (nothing is consumed here).
processed_samples_test_dataset.as_numpy_iterator()

In [227]:

# Append the augmented samples after the original clips of each split.
train_dataset_con, val_dataset_con, test_dataset_con = (
    originals.concatenate(augmented)
    for originals, augmented in (
        (train_align_dataset, processed_samples_train_dataset),
        (val_align_dataset, processed_samples_val_dataset),
        (test_align_dataset, processed_samples_test_dataset),
    )
)


In [228]:
# Apply normalize_audio to every clip of each split.
(
    align_pl_dataset_train_with_processed_samples_norm,
    align_pl_dataset_val_with_processed_samples_norm,
    align_pl_dataset_test_with_processed_samples_norm,
) = (
    split.map(normalize_audio)
    for split in (train_dataset_con, val_dataset_con, test_dataset_con)
)

In [229]:
def create_spectrogram(wav):
    """Turn a signal into a magnitude spectrogram with an extra axis.

    Args:
        wav: Audio signal tensor.

    Returns:
        The absolute value of the STFT (frame length 320, frame step 32)
        with a new axis inserted at position 2.
    """
    magnitudes = tf.abs(tf.signal.stft(wav, frame_length=320, frame_step=32))
    return tf.expand_dims(magnitudes, axis=2)

In [232]:
# Convert every normalised clip of each split into a spectrogram.
spectrogram_pl_dataset_train, spectrogram_pl_dataset_val, spectrogram_pl_dataset_test = (
    split.map(create_spectrogram)
    for split in (
        align_pl_dataset_train_with_processed_samples_norm,
        align_pl_dataset_val_with_processed_samples_norm,
        align_pl_dataset_test_with_processed_samples_norm,
    )
)

In [233]:
spectrogram_pl_dataset_train

In [238]:
spectrogram_pl_dataset_val

In [235]:
spectrogram_pl_dataset_test

In [243]:
# Shape of the first spectrogram in the test split.
spectrogram_pl_dataset_test.as_numpy_iterator().next().shape

In [257]:
# Pair every spectrogram with the label 1.0 (one label per dataset element).
labeled_train_dataset, labeled_val_dataset, labeled_test_dataset = (
    tf.data.Dataset.zip((
        spectrograms,
        tf.data.Dataset.from_tensor_slices(tf.ones(len(spectrograms))),
    ))
    for spectrograms in (
        spectrogram_pl_dataset_train,
        spectrogram_pl_dataset_val,
        spectrogram_pl_dataset_test,
    )
)

In [258]:
# First (spectrogram, label) pair of the training set.
labeled_train_dataset.as_numpy_iterator().next()

In [259]:
# First (spectrogram, label) pair of the validation set.
labeled_val_dataset.as_numpy_iterator().next()

In [260]:
# First (spectrogram, label) pair of the test set.
labeled_test_dataset.as_numpy_iterator().next()