In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
from scipy import signal
from scipy.io import wavfile
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
# import torchaudio.transforms
import librosa
import librosa.display
from skimage.util.shape import view_as_windows
from skimage.util.shape import view_as_blocks
import math
import joblib
from numpy import save, load

from joblib import Parallel, delayed

import time

import warnings
warnings.filterwarnings("ignore")

In [2]:
import torch
from IPython.lib.display import Audio

def plot_waveform(waveform, sample_rate, title):
    """
    Plot a single-channel waveform against time and show the figure.

    Parameters
    ----------
    waveform : array-like
        Samples to draw; ``waveform.shape[0]`` is taken as the number of frames.
    sample_rate : int or float
        Divides the frame index to build the time axis.
    title : str
        Figure title.
    """
    sample_count = waveform.shape[0]
    seconds = torch.arange(0, sample_count) / sample_rate

    # Only one channel is ever drawn, so a single Axes is enough.
    fig, ax = plt.subplots(1, 1)
    ax.plot(seconds, waveform, linewidth=1)
    ax.grid(True)
    fig.suptitle(title)

    plt.show(block=False)
    print("")

def play_audio(data, sample_rate):
    """Display an inline audio player for ``data`` at ``sample_rate``, without normalizing the samples."""
    player = Audio(data, rate=sample_rate, normalize=False)
    display(player)

def plot_spectrogram(spectrogram, sr=16000, n_fft=512):
    """
    Show ``spectrogram`` with a time x axis, a mel y axis and a dB colorbar.

    Parameters
    ----------
    spectrogram : array-like
        Matrix handed to ``librosa.display.specshow``.
    sr : int
        Sampling rate forwarded to ``specshow``.
    n_fft : int
        FFT size forwarded to ``specshow``.
    """
    axis_options = dict(sr=sr, n_fft=n_fft, x_axis='time', y_axis='mel')
    librosa.display.specshow(data=spectrogram, **axis_options)
    plt.colorbar(format='%+2.0f dB')
    plt.show()

# CONSTANT definition

In [3]:
# Coefficient passed to librosa.effects.preemphasis in the feature extractors.
PRE_EMPHASIS = 0.95
# top_db threshold passed to librosa.effects.trim when stripping silence.
TRIM_TOP_DB = 21
# STFT size: used as both n_fft and win_length, with hop_length = N_FTT // 4.
# NOTE(review): the name looks like a typo for N_FFT; it is kept because other cells reference it.
N_FTT = 256
# NOTE(review): not referenced in the visible cells (the MFCC call hard-codes n_mfcc=40) — confirm whether it is still needed.
N_MFCC = 30

In [4]:
# Load the development-set metadata table (the first CSV row is the header).
# NOTE(review): the backslash separator makes this relative path Windows-specific.
dev_df = pd.read_csv("dsl_data\\development.csv", header=0)
# Work on an independent copy so dev_df itself is left untouched.
test_dev_df = dev_df[:].copy()

# Disabled experiment: run on a subset of rows instead of the full table.
# test_dev_df = dev_df.iloc[randomlist].copy()
# test_dev_df = test_dev_df.reset_index().drop(columns=["index"])

# NOTE(review): only referenced from commented-out code in the visible cells.
spectrum_list = []

normalization

In [5]:
def normalization(data):
    """
    Min-max scale ``data`` into the range [0, 1].

    Parameters
    ----------
    data : numpy.ndarray or pandas.Series
        Numeric values to rescale; the minimum and maximum are taken over
        the whole object.

    Returns
    -------
    Same kind of object as ``data``
        ``(data - min) / (max - min)``. When every value is identical
        (``max == min``) the result is all zeros instead of the NaNs a 0/0
        division would produce.
    """
    low = data.min()
    span = data.max() - low
    shifted = data - low

    # Constant input: the shifted values are already all zero; multiplying by
    # 0.0 only promotes integer input to float, so the result type matches
    # what the division below would have produced.
    if span == 0:
        return shifted * 0.0

    return shifted / span

silence

In [6]:
def remove_silence(data, trim_top_db_par=55):
    """
    Cut leading and trailing low-energy samples from a waveform.

    The start index comes from ``librosa.effects.trim`` applied to the
    cumulative energy; the end index comes from the same call applied to the
    energy still remaining after each sample, with a threshold 15 dB higher.

    Parameters
    ----------
    data : numpy.ndarray
        Waveform samples.
    trim_top_db_par : int or float
        ``top_db`` used for the front trim; the back trim uses this value + 15.

    Returns
    -------
    numpy.ndarray
        ``data[front_index:back_index]``.
    """
    power = data**2
    running_power = np.cumsum(power)

    # Front edge: first index reported by trim on the cumulative energy.
    _, front_span = librosa.effects.trim(y=running_power, top_db=trim_top_db_par)

    # Back edge: last index reported by trim on the energy left to accumulate.
    remaining_power = np.abs(running_power - np.sum(power))
    _, back_span = librosa.effects.trim(y=remaining_power, top_db=trim_top_db_par + 15)

    start, stop = front_span[0], back_span[1]
    return data[start:stop]

mels

In [8]:
def mel_spectrogram_grid_search_PARALLEL(int_quale_silenzio, develop_eval_df, list_of_attributes):
    """
    Compute mel spectrograms for every row of ``develop_eval_df`` in parallel.

    Parameters
    ----------
    int_quale_silenzio : any
        Forwarded unchanged to ``single_mel_spec_grid_meow``.
    develop_eval_df : pandas.DataFrame
        Metadata table; only the columns in ``list_of_attributes`` are used.
    list_of_attributes : list of str
        Columns handed to each worker. Outlier removal is switched on when
        this list contains "action".

    Returns
    -------
    list
        One result of ``single_mel_spec_grid_meow`` per row, in row order.
    """
    outliers_removal = "action" in list_of_attributes

    selected_rows = develop_eval_df[list_of_attributes].iterrows()
    jobs = (
        delayed(single_mel_spec_grid_meow)(row, int_quale_silenzio, outliers_removal)
        for _, row in selected_rows
    )
    series_with_mel_spec_list = Parallel(n_jobs=-1, prefer="processes")(jobs)

    print("mel spec series list is done!")
    return series_with_mel_spec_list


def single_mel_spec_grid_meow(series, int_quale_silenzio, outliers_removal):
    """
    Load one recording and turn it into a standardized mel spectrogram in dB.

    Parameters
    ----------
    series : pandas.Series
        One metadata row. "path" is always read; "gender" is read only inside
        the augmentation branch.
    int_quale_silenzio : any
        Not used anywhere in this function body.
    outliers_removal : bool
        When true, clips whose trimmed duration is above 4 s or below 0.3 s
        get ``None`` as their spectrogram.

    Returns
    -------
    tuple of pandas.Series
        ``(original, augmented)``; each is ``series`` with a "spectrogram"
        entry appended. The augmented spectrogram is ``None`` unless both
        ``outliers_removal`` and the local ``DATA_AUG`` flag are true.
    """
    wav_path = series["path"]
    # Read the wav file; sr=None is passed so librosa does not apply its default resampling rate.
    data, sample_frequency = librosa.load(wav_path, sr=None)

    # Strip leading/trailing silence using the notebook-level threshold.
    data, _ = librosa.effects.trim(data, top_db=TRIM_TOP_DB)

    # Outlier removal: the duration is measured AFTER trimming.
    if outliers_removal:
        length = data.shape[0]/sample_frequency
        if length > 4 or length < 0.3:
            # Both tuple slots carry a None spectrogram so callers can filter them out.
            # return None, None
            return pd.concat([series, pd.Series({"spectrogram": None})]), pd.concat([series, pd.Series({"spectrogram": None})])

    # Pre-emphasis filter.
    data = librosa.effects.preemphasis(y=data, coef=PRE_EMPHASIS)

    # Min-max normalization of the waveform (see normalization()).
    data = normalization(data)

    # Short Term Fourier Transform
    sgram = librosa.stft(data,
                         n_fft=N_FTT,
                         win_length=N_FTT,
                         hop_length=N_FTT//4)
    # Keep the magnitude of the STFT and discard the phase.
    sgram_mag, _ = librosa.magphase(sgram)
    # Mel spectrogram computed from the magnitude.
    # NOTE(review): S is a magnitude (not squared) spectrogram and is later passed to power_to_db — confirm this is intended.
    mel_scale_sgram = librosa.feature.melspectrogram(S=sgram_mag,
                                                     sr=sample_frequency,
                                                     n_mels = 128)
    # Convert to dB, using the minimum of the matrix as the reference.
    mel_sgram = librosa.power_to_db(mel_scale_sgram, ref=np.min)

    # Standardize the whole spectrogram to zero mean and unit standard deviation.
    mel_sgram = (mel_sgram-np.mean(mel_sgram, axis=(0, 1)))/np.std(mel_sgram, axis=(0, 1))
    # mel_sgram = (mel_sgram-np.min(mel_sgram, axis=(0, 1)))/(np.min(mel_sgram, axis=(0, 1)) - np.max(mel_sgram, axis=(0, 1)))


    # Data augmentation by pitch shifting; disabled by the hard-coded flag below.
    DATA_AUG = False
    if outliers_removal and DATA_AUG:
        gender = series["gender"]
        # Shift "female" clips down and every other clip up by 1.2 steps.
        if gender == "female":
            n_steps = -1.2
        else:
            n_steps = 1.2
        augmented_data = librosa.effects.pitch_shift(y=data,
                                                    sr=sample_frequency,
                                                    n_steps=n_steps)
        # Same STFT -> magnitude -> mel -> dB chain for the augmented signal.
        # NOTE(review): unlike mel_sgram, the augmented spectrogram is not standardized.
        sgram_aug = librosa.stft(augmented_data,
                             n_fft=N_FTT,
                             win_length=N_FTT,
                             hop_length=N_FTT//4)
        # Taking MAGNITUDE of stft
        sgram_mag_aug, _ = librosa.magphase(sgram_aug)
        # MEL spectrogram of magnitude
        mel_scale_sgram_aug = librosa.feature.melspectrogram(S=sgram_mag_aug,
                                                         sr=sample_frequency,
                                                         n_mels = 128)
        # ADJUSTING SCALE to dB
        mel_sgram_aug = librosa.power_to_db(mel_scale_sgram_aug, ref=np.min)
    else:
        mel_sgram_aug = None

    # # PLOTTING spectrogram
    # plot_spectrogram(mel_sgram, sample_frequency)

    return pd.concat([series, pd.Series({"spectrogram": mel_sgram})]), pd.concat([series, pd.Series({"spectrogram": mel_sgram_aug})])



In [9]:
def mfcc_spectrogram_grid_search_PARALLEL(int_quale_silenzio, develop_eval_df, list_of_attributes):
    """
    Run ``single_mfcc_spec_grid_meow`` on every row of ``develop_eval_df`` in parallel.

    Parameters
    ----------
    int_quale_silenzio : any
        Forwarded unchanged to the per-row worker.
    develop_eval_df : pandas.DataFrame
        Metadata table; only the columns in ``list_of_attributes`` are used.
    list_of_attributes : list of str
        Columns handed to each worker. Outlier removal is switched on when
        this list contains "action".

    Returns
    -------
    list
        One worker result per row, in row order.
    """
    outliers_removal = "action" in list_of_attributes

    selected_rows = develop_eval_df[list_of_attributes].iterrows()
    jobs = (
        delayed(single_mfcc_spec_grid_meow)(row, int_quale_silenzio, outliers_removal)
        for _, row in selected_rows
    )
    series_with_mel_spec_list = Parallel(n_jobs=-1, prefer="processes")(jobs)

    print("mel spec series list is done!")
    return series_with_mel_spec_list


def single_mfcc_spec_grid_meow(series, int_quale_silenzio, outliers_removal):
    """
    Load one recording and compute its dB mel spectrogram and its MFCCs.

    Parameters
    ----------
    series : pandas.Series
        One metadata row. "path" is always read; "gender" is read only inside
        the augmentation branch.
    int_quale_silenzio : any
        Not used anywhere in this function body.
    outliers_removal : bool
        When true, clips whose trimmed duration is above 4 s or below 0.3 s
        get ``None`` as their spectrogram in both returned Series.

    Returns
    -------
    tuple of pandas.Series
        ``(mel, mfcc)``; each is ``series`` with a "spectrogram" entry
        appended. The first holds the dB mel spectrogram (not standardized
        here), the second holds the MFCC matrix.
    """
    wav_path = series["path"]
    # Read the wav file; sr=None is passed so librosa does not apply its default resampling rate.
    data, sample_frequency = librosa.load(wav_path, sr=None)

    # Strip leading/trailing silence using the notebook-level threshold.
    data, _ = librosa.effects.trim(data, top_db=TRIM_TOP_DB)

    # Outlier removal: the duration is measured AFTER trimming.
    if outliers_removal:
        length = data.shape[0]/sample_frequency
        if length > 4 or length < 0.3:
            # Both tuple slots carry a None spectrogram so callers can filter them out.
            # return None, None
            return pd.concat([series, pd.Series({"spectrogram": None})]), pd.concat([series, pd.Series({"spectrogram": None})])

    # Pre-emphasis filter.
    data = librosa.effects.preemphasis(y=data, coef=PRE_EMPHASIS)

    # Min-max normalization of the waveform (see normalization()).
    data = normalization(data)

    # Short Term Fourier Transform
    sgram = librosa.stft(data,
                         n_fft=N_FTT,
                         win_length=N_FTT,
                         hop_length=N_FTT//4)
    # Keep the magnitude of the STFT and discard the phase.
    sgram_mag, _ = librosa.magphase(sgram)
    # Mel spectrogram computed from the magnitude.
    # NOTE(review): S is a magnitude (not squared) spectrogram and is later passed to power_to_db — confirm this is intended.
    mel_scale_sgram = librosa.feature.melspectrogram(S=sgram_mag,
                                                     sr=sample_frequency,
                                                     n_mels = 128)
    # Convert to dB, using the minimum of the matrix as the reference.
    mel_sgram = librosa.power_to_db(mel_scale_sgram, ref=np.min)


    # MFCCs derived from the dB mel spectrogram.
    # NOTE(review): both y and S are supplied; presumably librosa uses S and ignores y — confirm.
    # NOTE(review): n_mfcc is hard-coded to 40 rather than using the N_MFCC constant.
    mfcc = librosa.feature.mfcc(y=data, sr=sample_frequency, S=mel_sgram, n_mfcc=40)

    # Data augmentation by pitch shifting; disabled by the hard-coded flag below.
    # NOTE(review): mel_sgram_aug is computed here but the active return statement never uses it.
    DATA_AUG = False
    if outliers_removal and DATA_AUG:
        gender = series["gender"]
        # Shift "female" clips down and every other clip up by 1.2 steps.
        if gender == "female":
            n_steps = -1.2
        else:
            n_steps = 1.2
        augmented_data = librosa.effects.pitch_shift(y=data,
                                                    sr=sample_frequency,
                                                    n_steps=n_steps)
        # Same STFT -> magnitude -> mel -> dB chain for the augmented signal.
        sgram_aug = librosa.stft(augmented_data,
                             n_fft=N_FTT,
                             win_length=N_FTT,
                             hop_length=N_FTT//4)
        # Taking MAGNITUDE of stft
        sgram_mag_aug, _ = librosa.magphase(sgram_aug)
        # MEL spectrogram of magnitude
        mel_scale_sgram_aug = librosa.feature.melspectrogram(S=sgram_mag_aug,
                                                         sr=sample_frequency,
                                                         n_mels = 128)
        # ADJUSTING SCALE to dB
        mel_sgram_aug = librosa.power_to_db(mel_scale_sgram_aug, ref=np.min)
    else:
        mel_sgram_aug = None

    # # PLOTTING spectrogram
    # plot_spectrogram(mel_sgram, sample_frequency)

    # First slot: mel spectrogram; second slot: MFCC matrix.
    return pd.concat([series, pd.Series({"spectrogram": mel_sgram})]), pd.concat([series, pd.Series({"spectrogram": mfcc})])

    # Previous variant (unreachable): returned the augmented mel spectrogram in the second slot.
    # return pd.concat([series, pd.Series({"spectrogram": mel_sgram})]), pd.concat([series, pd.Series({"spectrogram": mel_sgram_aug})])


characterize

In [10]:
def characterize_spectrum(spectrum_list, n_time, n_freq):
    """
    Summarize each spectrogram by per-block mean and standard deviation.

    Every spectrogram is cut into non-overlapping blocks of ``n_freq`` rows by
    ``t // n_time`` columns (trailing columns that do not fill a block are
    dropped). The feature vector is all block means followed by all block
    standard deviations.

    Parameters
    ----------
    spectrum_list : iterable of 2-D numpy.ndarray
        Spectrograms to summarize.
    n_time : int
        Number of blocks along the time axis.
    n_freq : int
        Block height along the frequency axis.

    Returns
    -------
    numpy.ndarray
        One feature vector per input spectrogram.
    """
    def _block_stats(spectrum):
        _, n_frames = spectrum.shape
        frames_per_block = n_frames // n_time
        usable_frames = n_frames - n_frames % n_time

        blocks = view_as_blocks(spectrum[:, :usable_frames], block_shape=(n_freq, frames_per_block))

        # Flatten the block grid, and each block, so statistics run along axis 1.
        grid_rows, grid_cols, block_h, block_w = blocks.shape
        flat_blocks = blocks.reshape(grid_rows * grid_cols, block_h * block_w)

        return np.concatenate([flat_blocks.mean(axis=1), flat_blocks.std(axis=1)])

    return np.array([_block_stats(spectrum) for spectrum in spectrum_list])


In [11]:
def characterize_spectrum_PARALLEL(series_with_mel_spec_list, n_time, n_freq):
    """
    Apply ``char_spec_Single_BLOCK_view`` to every Series in parallel.

    Parameters
    ----------
    series_with_mel_spec_list : iterable of pandas.Series
        Items passed one by one to the worker.
    n_time, n_freq : int
        Block grid parameters forwarded to the worker.

    Returns
    -------
    pandas.DataFrame
        Built from the list of Series returned by the workers.
    """
    worker_pool = Parallel(n_jobs=-1, prefer="processes")
    tasks = (
        delayed(char_spec_Single_BLOCK_view)(item, n_time, n_freq)
        for item in series_with_mel_spec_list
    )
    return pd.DataFrame(worker_pool(tasks))

def char_spec_Single_BLOCK_view(series, n_time, n_freq):
    """
    Replace a Series' spectrogram with the mean of each block of a fixed grid.

    The spectrogram is cut into non-overlapping blocks of ``n_freq`` rows by
    ``t // n_time`` columns; trailing columns that do not fill a block are
    dropped. Each block contributes its mean as one feature.

    Parameters
    ----------
    series : pandas.Series
        Must contain a 2-D array under "spectrogram". It is NOT modified.
    n_time : int
        Number of blocks along the time axis.
    n_freq : int
        Block height along the frequency axis.
        NOTE(review): ``view_as_blocks`` presumably requires the number of
        spectrogram rows to be a multiple of ``n_freq`` — confirm.

    Returns
    -------
    pandas.Series
        The entries of ``series`` without "spectrogram", followed by the
        block means under integer labels 0, 1, 2, ...
    """
    spectrum = series["spectrogram"]
    _, t = spectrum.shape
    magic_time = t // n_time

    # Blocks of fixed size over the columns that fill whole blocks.
    B_w = view_as_blocks(spectrum[:, :t - t % n_time], block_shape=(n_freq, magic_time))

    # Flatten the block grid, keeping each block 2-D.
    B_w = B_w.reshape(B_w.shape[0] * B_w.shape[1], B_w.shape[2], B_w.shape[3])

    # One mean per block.
    B_characterized = B_w.mean(axis=(1, 2))

    # Drop without inplace=True: mutating the caller's Series removed its
    # "spectrogram" entry, so a second in-process call on the same object
    # (e.g. re-running the cell) failed with a KeyError.
    metadata = series.drop(labels=["spectrogram"])

    return pd.concat([metadata, pd.Series(B_characterized)])

def char_spec_Single_WINDOW_view(series, n_time, n_freq):
    """
    Append the mean of each overlapping window of the spectrogram to ``series``.

    The window width starts at ``ceil(t / n_time)`` and grows until the excess
    ``width * n_time - t`` splits evenly over the ``n_time - 1`` gaps between
    windows; that even share is the overlap between neighbouring windows.
    Windows are ``n_freq`` rows high and do not overlap along frequency.

    Parameters
    ----------
    series : pandas.Series
        Must contain a 2-D array under "spectrogram"; that entry is kept in
        the result.
    n_time : int
        Number of windows along the time axis.
    n_freq : int
        Window height along the frequency axis.

    Returns
    -------
    pandas.Series
        ``series`` followed by one mean per window.
    """
    spectrum = series["spectrogram"]
    _, n_frames = spectrum.shape
    gaps = n_time - 1

    # Smallest window width whose total excess divides evenly among the gaps.
    win_frames = math.ceil(n_frames / n_time)
    while (win_frames * n_time - n_frames) % gaps != 0:
        win_frames += 1
    overlap = int((win_frames * n_time - n_frames) / gaps)

    windows = view_as_windows(
        spectrum,
        window_shape=(n_freq, win_frames),
        step=(n_freq, win_frames - overlap),
    )

    # Flatten the window grid, keeping each window 2-D.
    grid_rows, grid_cols, win_h, win_w = windows.shape
    stacked = windows.reshape(grid_rows * grid_cols, win_h, win_w)

    window_means = np.mean(stacked, axis=(1, 2))
    return pd.concat([series, pd.Series(window_means)])

boilerplate

In [12]:
# Attach the characterized spectra as string-named columns (mean and std features)
def rename_columns(final_spectra_arr, test_dev_df_copy):
    """
    Append the feature matrix to the metadata frame under columns "0", "1", ...

    Parameters
    ----------
    final_spectra_arr : 2-D array-like
        One feature vector per row; the length of the first row sets the
        number of feature columns.
    test_dev_df_copy : pandas.DataFrame
        Metadata frame placed to the left of the features.

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation of the metadata and the features.
    """
    n_features = final_spectra_arr[0].shape[0]
    feature_names = list(map(str, range(n_features)))
    features_df = pd.DataFrame(final_spectra_arr, columns=feature_names)
    return pd.concat([test_dev_df_copy, features_df], axis=1)

def rename_columns_PARALLEL(CHAR_SPEC_dataframe):
    """
    Convert integer column labels to strings, in place.

    Parameters
    ----------
    CHAR_SPEC_dataframe : pandas.DataFrame
        Frame whose ``columns`` attribute is overwritten.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    CHAR_SPEC_dataframe.columns = [
        str(label) if isinstance(label, int) else label
        for label in CHAR_SPEC_dataframe.columns
    ]
    return CHAR_SPEC_dataframe


# Split the columns into numerical and categorical attributes
def num_cat_attributes(test_dev_df):
    """
    Return the numerical and categorical column names of a feature frame.

    Numerical attributes are every column from the one named '0' onwards;
    categorical attributes are always 'gender' and 'ageRange' (the language
    related columns are deliberately left out).

    Parameters
    ----------
    test_dev_df : pandas.DataFrame
        Frame that must contain a column named '0'.

    Returns
    -------
    tuple of (list, list)
        ``(numerical_attributes, categorical_attributes)``.

    Raises
    ------
    ValueError
        If there is no column named '0'.
    """
    column_names = list(test_dev_df.columns)
    first_feature_pos = column_names.index('0')

    numerical_attributes = column_names[first_feature_pos:]
    categorical_attributes = ['gender', 'ageRange']

    return numerical_attributes, categorical_attributes


# PIPELINE definition
def pipeline_definition(numerical_attributes, categorical_attributes, model):
    """
    Build the preprocessing + model pipeline.

    Numerical columns go through a median imputer; categorical columns are
    one-hot encoded with unknown categories ignored. The result feeds ``model``.

    Parameters
    ----------
    numerical_attributes : list of str
        Columns routed to the imputer.
    categorical_attributes : list of str
        Columns routed to the one-hot encoder.
    model : estimator
        Final step of the pipeline, registered under the name "model".

    Returns
    -------
    sklearn.pipeline.Pipeline
        Steps "preprocessor" and "model".
    """
    median_imputer = SimpleImputer(strategy="median")
    numeric_branch = Pipeline(steps=[("imputer", median_imputer)])
    categorical_branch = OneHotEncoder(handle_unknown="ignore")

    column_routes = [
        ("num", numeric_branch, numerical_attributes),
        ("cat", categorical_branch, categorical_attributes),
    ]
    preprocessor = ColumnTransformer(transformers=column_routes)

    return Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])


outliers removal function

In [13]:
def remove_outliers(series_with_mel_spec_list):
    """
    Flatten the per-recording result groups and drop entries without a spectrogram.

    Parameters
    ----------
    series_with_mel_spec_list : iterable of iterables
        Each inner iterable holds mapping-like items (e.g. pandas Series)
        with a "spectrogram" entry, which is ``None`` for discarded items.

    Returns
    -------
    list
        Every inner item whose "spectrogram" is not ``None``, in the original
        order.
    """
    # Build a new list instead of popping while iterating: removing an element
    # mid-loop shifts the rest left, so the item right after each removal was
    # skipped and consecutive None entries survived the filter.
    return [
        series_with_mel_spec
        for group in series_with_mel_spec_list
        for series_with_mel_spec in group
        if series_with_mel_spec["spectrogram"] is not None
    ]

model selection

In [14]:
from sklearn.svm import SVC

# Pick exactly one estimator; the commented lines are the alternatives that were tried.
# model = RandomForestClassifier()
# model = KNeighborsClassifier()
model = SVC(cache_size=16000)

# Class name of the chosen estimator; a later cell uses it to look up the tuned hyper-parameters.
model_name = type(model).__name__

In [18]:
# Tuned pipeline hyper-parameters (best_par) and block grid (n_f_best rows per
# block, n_t_best blocks along time) for each supported estimator.
if model_name == "KNeighborsClassifier":
    best_par = {'model__n_jobs': -1, 'model__n_neighbors': 9, 'model__p': 1, 'model__weights': 'distance'}
    n_f_best, n_t_best = 8, 25
elif model_name == "SVC":
    best_par = {'model__C': 200, 'model__degree': 4, 'model__kernel': 'rbf', 'model__tol': 0.01}
    n_f_best, n_t_best = 8, 20
else:
    # Fail here rather than later: without this branch the three names stay
    # undefined, or silently keep values from a previous run of this cell.
    raise ValueError(f"No tuned hyper-parameters recorded for model {model_name!r}")

Using mel spectrogram

In [19]:

# Columns handed to each worker; including "action" switches outlier removal on.
list_of_attributes = ["path", "Id", "gender", "ageRange", "action", "object"]
# One (original, augmented) pair of Series per development recording.
series_with_mel_spec_list = mel_spectrogram_grid_search_PARALLEL(int_quale_silenzio=2, develop_eval_df=test_dev_df.copy(), list_of_attributes=list_of_attributes)

# Flatten the pairs and discard the entries whose spectrogram is None.
series_with_mel_spec_list = remove_outliers(series_with_mel_spec_list)

mel spec series list is done!
before outliers:  19708
after outliers:  9823


outliers removal

In [20]:
# NOTE(review): compute_sample_weight is only referenced from commented-out code below and
# StandardScaler is not referenced in the visible cells — confirm whether they are still needed.
from sklearn.utils import compute_sample_weight
from sklearn.preprocessing import StandardScaler

# FINAL SPECTRA: block-wise features for every retained development recording.
# final_spectra_arr = characterize_spectrum(spectrum_list.copy(), n_time=n_t_best, n_freq=n_f_best)
CHAR_SPEC_dataframe = characterize_spectrum_PARALLEL(series_with_mel_spec_list.copy(), n_time=n_t_best, n_freq=n_f_best)

# Turn the integer feature labels into strings ("0", "1", ...).
CHAR_SPEC_dataframe = rename_columns_PARALLEL(CHAR_SPEC_dataframe)
# Split the column names into numerical and categorical groups.
numerical_attributes, categorical_attributes = num_cat_attributes(CHAR_SPEC_dataframe)
# Build the preprocessing + model pipeline.
full_pipeline = pipeline_definition(numerical_attributes, categorical_attributes, model)

# Apply the tuned hyper-parameters to the pipeline.
full_pipeline.set_params(**best_par)

# print(final_spectra_arr.shape)

# Target: element-wise sum of the "action" and "object" columns
# (string concatenation if both hold strings — TODO confirm dtypes).
y = CHAR_SPEC_dataframe['action'] + CHAR_SPEC_dataframe['object']
# Model inputs: every numerical feature followed by the categorical attributes.
columns_of_interest = numerical_attributes + categorical_attributes

# sample_weight = compute_sample_weight(class_weight='balanced', y=y)

# # training test split
# X_train, X_test, y_train, y_test = train_test_split(test_dev_df[columns_of_interest], y, test_size=0.2, random_state=42)


# Confusion Matrix

In [21]:
from sklearn.metrics import confusion_matrix


def plot_confusion_matrix(y_true,y_pred,label_names,clf_name):
    """
    Draw the confusion matrix of ``y_pred`` against ``y_true``.

    Rows (y axis) are the true labels and columns (x axis) are the predicted
    labels, matching the layout returned by ``sklearn.metrics.confusion_matrix``.

    Parameters
    ----------
    y_true : array-like
        Actual class of each test sample.
    y_pred : array-like
        Class predicted for each test sample.
    label_names : list
        Class labels; fixes both the row/column order and the tick labels.
    clf_name : str
        Classifier name used as the title prefix.
    """
    # Confusion matrix with rows/columns in the order given by label_names.
    confusion_mat = confusion_matrix(y_true=np.array(y_true),y_pred=np.array(y_pred),labels=label_names)

    # Show the matrix as an image.
    plt.figure(figsize=(12,12))
    plt.imshow(confusion_mat, cmap=plt.cm.Blues, interpolation='nearest')

    # Columns hold predictions, so the x axis is the predicted class
    # (it used to be labelled "Expected Outputes").
    plt.xlabel("Predicted Outputs", fontsize=10)
    plt.ylabel("Actual Outputs", fontsize=10)
    plt.title(clf_name + " Confusion Matrix",fontsize=12)

    # Put the class names on both axes.
    tick_positions = np.arange(len(label_names))
    plt.xticks(tick_positions, label_names, rotation='horizontal')
    plt.yticks(tick_positions, label_names)
    plt.tick_params(axis='both', labelsize=10)
    plt.tight_layout()

    # Write the count inside every non-empty cell (row -> y, column -> x).
    for (row, col), count in np.ndenumerate(confusion_mat):
        if count != 0:
            plt.text(col, row, count, ha='center', va='center', size=12)

    # Show the plot
    plt.show()

# Optional hold-out evaluation; switched off by default.
CONFUSION_MATRIX = False
if CONFUSION_MATRIX:
    # 80/20 train/test split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(CHAR_SPEC_dataframe[columns_of_interest], y, test_size=0.2, random_state=42)
    # Fit the pipeline on the training part only.
    full_pipeline.fit(X_train, y_train)
    # Predict the held-out part.
    y_pred_test = full_pipeline.predict(X_test)
    # Confusion matrix over the distinct labels.
    # NOTE(review): the labels come from a set, so the axis order is not sorted — confirm whether a fixed order is wanted.
    labels = list(set(CHAR_SPEC_dataframe['action'] + CHAR_SPEC_dataframe['object']))
    plot_confusion_matrix(y_test,y_pred_test,labels, "meow")

    # Report the hold-out accuracy as well.
    # print(f"boh score is: {full_pipeline.score(X_test, y_test)}")
    print("ACCURACY score of model is: %.3f" % accuracy_score(y_test, y_pred_test))

    # Rebuild the pipeline object before the final fit in a later cell.
    full_pipeline = pipeline_definition(numerical_attributes, categorical_attributes, model)
    # Re-apply the tuned hyper-parameters.
    full_pipeline.set_params(**best_par);


# MODEL TRAINING

In [22]:
# Fit the final pipeline on every retained development sample (no hold-out split here).
full_pipeline.fit(CHAR_SPEC_dataframe[columns_of_interest], y);

# TEST

In [23]:
# Guards the loading of the evaluation CSV in the next cell.
EVAL = True

In [24]:
# TEST SET: load the evaluation metadata table (the first CSV row is the header).
# NOTE(review): the backslash separator makes this relative path Windows-specific.
if EVAL:
    eval_df = pd.read_csv("dsl_data\\evaluation.csv", header=0)
# NOTE(review): eval_df is read here even when EVAL is false, which would raise a NameError
# on a fresh kernel — confirm whether this line belongs inside the guard.
id_eval = eval_df["Id"]

# NOTE(review): only referenced from commented-out code in the visible cells.
spectrum_list_eval = []

# MEL OF TEST

In [25]:
# No "action" column here, so the workers do not apply outlier removal to evaluation clips.
list_of_attributes = ["path", "Id", "gender", "ageRange"]
# One (original, augmented) pair of Series per evaluation recording.
series_with_mel_spec_list_EVAL = mel_spectrogram_grid_search_PARALLEL(int_quale_silenzio=2, develop_eval_df=eval_df, list_of_attributes=list_of_attributes)

mel spec series list is done!


In [26]:
# Flatten the (original, augmented) pairs and keep only the entries that carry a
# spectrogram. A single filtering pass replaces three copies of a loop that
# popped from the list while iterating over it, which skips the element right
# after every removal and therefore needed repeating.
series_with_mel_spec_list_EVAL = [
    series_with_mel_spec
    for pair in series_with_mel_spec_list_EVAL
    for series_with_mel_spec in pair
    if series_with_mel_spec["spectrogram"] is not None
]

In [27]:
# FINAL SPECTRA: block-wise features for the evaluation recordings, using the same grid as training.
# final_spectra_arr = CHAR_SPEC_window_view(spectrum_list_eval, n_time=n_t_best, n_freq=n_f_best)
# final_spectra_arr = characterize_spectrum(spectrum_list_eval.copy(), n_time=n_t_best, n_freq=n_f_best)


CHAR_SPEC_dataframe_EVAL = characterize_spectrum_PARALLEL(series_with_mel_spec_list_EVAL.copy(), n_time=n_t_best, n_freq=n_f_best)
# Turn the integer feature labels into strings ("0", "1", ...).
CHAR_SPEC_dataframe_EVAL = rename_columns_PARALLEL(CHAR_SPEC_dataframe_EVAL)
# NOTE(review): this overwrites the attribute lists derived from the training frame; the next
# cell selects columns with columns_of_interest, which was built from the training lists.
numerical_attributes, categorical_attributes = num_cat_attributes(CHAR_SPEC_dataframe_EVAL)


In [28]:
# Keep exactly the columns (and column order) the pipeline was fitted on.
# NOTE(review): this rebinds CHAR_SPEC_dataframe_EVAL, so the cell is not idempotent with respect to the dropped columns.
CHAR_SPEC_dataframe_EVAL = CHAR_SPEC_dataframe_EVAL[columns_of_interest]

# Predict the label of every evaluation recording.
y_predict = full_pipeline.predict(CHAR_SPEC_dataframe_EVAL)

In [29]:
# WRITE PREDICTIONS TO FILE

out_df = pd.DataFrame(data=y_predict, columns=["Predicted"])
# Index the predictions by the evaluation Ids.
# NOTE(review): assumes y_predict has one row per evaluation Id, in the same order — confirm no rows were dropped upstream.
out_df.index = id_eval
out_df.index.name = "Id"
# df['Id'] = df.index
# print(out_df)
# NOTE(review): the backslash separator makes this relative path Windows-specific.
out_df.to_csv(path_or_buf="dsl_data\\prediction.csv", header=True, index=True)