In [3]:
import pandas as pd
import numpy as np
from scipy import signal
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn import svm
import random
from sklearn import preprocessing


In [4]:
data_folder = './data/'

#CSV filenames
deformed01 = 'A_DEFORMED1_C0'
rx01 = 'A_RX1_C0'

df_def = pd.read_csv(data_folder + deformed01 + '.csv', header = None)
df_rx = pd.read_csv(data_folder + rx01 + '.csv', header = None)

In [5]:

#initialisation of clock variables
clock = 3048780.48780488
delay_in_samples = 304878
g_T = 1 / clock
g_N = len(df_def)

print("the frequency of the signal is", clock * 1e-6, "MHz")
print("the total duration of the signal is", g_N / clock, "seconds")
print("number of samples :", len(df_def))


the frequency of the signal is 3.04878048780488 MHz
the total duration of the signal is 10.495999999999993 seconds
number of samples : 32000000


In [6]:
def clean_data(df):
    """Removes the silence at the end of the audio

    Args:
        df (array[][]): the data from the csv

    Returns:
        array[][]: the data without the silence at the end
    """
    epsilon =  1e-1
    #remove the silence at the end: 
    for i in range(len(df) - 1, 0, -1):
        if(df[0][i] > epsilon):
            print("deleted", (len(df) - i) / clock, "samples from the end because we assume it is silence from", len(df) / clock, "samples")
            return df[:i]

def split_csv(filename):
    """Function that divides the data set in samples of 126ms 

    Args:
        filename (csv): The csv containing the data

    Returns:
        array[][]: the signal split in samples of 126ms
    """
    df = pd.read_csv(data_folder + filename + '.csv', header = None)
    df = clean_data(df)
    df.rename(columns = {0:'amplitude'}, inplace = True)
    point_per_sample = int(160e-3 * clock)
    
    nb_of_samples = int(len(df)/point_per_sample)
    data = df[:nb_of_samples * point_per_sample]
    return np.array_split(data, nb_of_samples)



In [7]:


def wavelet_transform(data,widths):  
    """Computes the Continuous Wavelet Transform of the signal

    Args:
        data (2 dimensional array): The signal on which we want to perform the Continuous Wavelet Transform
        widths ([type]): [description]

    Returns:
        (M,) sequence: The width used for the transform
    """
    cwt = []
    for i in data:
        cwt.append(np.transpose(signal.cwt(i['amplitude'], signal.ricker, widths)))

    return cwt



In [8]:
def flatten_data(cwt_def,cwt_rx):
    """ Aggregates the slices of 126ms into one signal

    Args:
        cwt_def (3-dim array): array we want to flatten to 2-dim
        cwt_rx (3-dim array): array we want to flatten to 2-dim
    Returns:
        (array): 2-dim array
        (array): 2-dim array
    """
    deformed = []
    for sublist in cwt_def:
        for item in sublist:
            deformed.append(item)

    rx = []
    for sublist in cwt_rx:
        for item in sublist:
            rx.append(item)
    return deformed,rx


In [9]:

def generate_test_train(deformed_data,rx_data):
    """Generates the train and test sets by selecting random portions of the signal with no overlap
        at a 70-30% ratio

    Args:
        train_len (int): length of the train set
        test_len (int): length of the test set

    Returns:
       (array,array): train set and test set and corresponding targets
    """
    # Threshold at which the svm runs in an acceptable time
    max_tresh = 70000
    train_len= max_tresh//2
    test_len = int(((train_len*.3)/.7)//2)
    rand_train = random.randrange(0,len(rx))
    rand_test =  random.randrange(0,len(rx))
    # The test and train set do not overlap
    while (rand_test in range(rand_train, rand_train + train_len)):
         rand_test =  random.randrange(0,len(rx), train_len)
    return  np.concatenate((deformed_data[rand_train:rand_train + train_len],rx_data[rand_train:rand_train + train_len])) ,np.concatenate((deformed_data[rand_test:rand_test+test_len],rx_data[rand_test:rand_test+test_len])), np.concatenate((np.ones(train_len),np.zeros(train_len))),np.concatenate((np.ones(test_len),np.zeros(test_len)))


In [10]:
# Standarize the data before feeding it to the PCA algorithm
def standardize(train_set, test_set):
    """Standarize the data before feeding it to the PCA algorithm

    Args:
        train_set (array like (n-samples, n_features): The train set
        test_set (array like (n-samples, n_features)): The test set

    Returns:
       (array (n-samples, n_features)): the standardized train set  
       (array (n-samples, n_features)): the standardized test set
    """
    scaler = StandardScaler()
    # Fit on training set only
    scaler.fit(train_set)
    # Apply transform to both the training set and the test set
    return scaler.transform(train_set), scaler.transform(test_set)


In [11]:
def feature_extraction(train,test):
    """PCA for feature extraction on the standardized data

    Args:
        train (array (n-samples, n_features)): train set we want to extract features from
        test (array (n-samples, n_features)): test set we want to extract features from

    Returns:
        (array (n-samples, n_features)): dimension-reduced train set 
        (array (n-samples, n_features)): dimension-reduced test-set 
    """
    #We chose the minimal number of prinicpal component such that 95% of the variance is retained
    pca = PCA(.95)
    #fit the data onto the vectors computed by the algorithm
    pca.fit(train)
    print(pca.n_components_ ,"components were used to capture the data instead of", train.shape[1])
    #does the projection
    return pca.transform(train),pca.transform(test)


In [12]:

def runSVM(train,test,target,kernel):
    """SVM 

    Args:
        train ((array (n-samples, n_features)): train set
        test ((array (n-samples, n_features)): test set
        target ((array (n-samples,)): target
        kernel (string): kernel to use for the SVM

    Returns:
        (array (n-samples,): prediction given by the SVM
    """
    #Create a svm Classifier
    clf = svm.SVC(kernel=kernel)
    #Train the model using the training sets
    clf.fit(train, target)
    #Predict the response for test dataset
    return clf.predict(test)


In [13]:

def getStatistics(y_test, y_pred):
    """Function to compute accuracy,recall,f1-score and more

    Args:
        y_test ((array (n-samples,)): test set target
        y_pred ((array (n-samples,)): classification given by the model
    """
    # Model Accuracy: how often is the classifier correct?
    print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    print("Precision:",metrics.precision_score(y_test, y_pred))
    # Model Recall: what percentage of positive tuples are labelled as such?
    print("Recall:", metrics.recall_score(y_test, y_pred))
    print("Confusion matrix", confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

In [14]:
#Get the data split in 126 ms slices
data_deformed = split_csv(deformed01)
data_rx = split_csv(rx01)
#Compute the wavelet transform of each slice
cwt_def = wavelet_transform(data_deformed,np.arange(1,30))
cwt_rx = wavelet_transform(data_rx,np.arange(1,30))
#Agregate the slices into one signal
deformed, rx = flatten_data(cwt_def,cwt_rx)
#Compute train and test set
X_train, X_test,y_train,y_test = generate_test_train(deformed,rx)
#Standardize the data
stand_train,stand_test = standardize(X_train,X_test)
# Perform dimensionality reduction on the data
pca_train,pca_test= feature_extraction(stand_test,stand_train)
#Run the SVM
prediction = runSVM(stand_train,stand_test,y_train,'rbf')
getStatistics(y_test,prediction)


deleted 1.3556525359999991 samples from the end because we assume it is silence from 10.495999999999993 samples
deleted 1.2826049679999991 samples from the end because we assume it is silence from 10.495999999999993 samples
4 components were used to capture the data instead of 29
Accuracy: 0.5633333333333334
Precision: 0.5616242864556306
Recall: 0.5772
Confusion matrix [[4121 3379]
 [3171 4329]]
              precision    recall  f1-score   support

         0.0       0.57      0.55      0.56      7500
         1.0       0.56      0.58      0.57      7500

    accuracy                           0.56     15000
   macro avg       0.56      0.56      0.56     15000
weighted avg       0.56      0.56      0.56     15000



Running the SVM with the neural net features

In [16]:
def get_Librosa_features(filename):
    """Gets the data and features generated by Librosa, a python package for music and audio analysis and standardizes it
       The features have been generated in the SequentialNN.ipynb file
    Args:
        filename (string): file name where the data is contained

    Returns:
        array: standardized data
    """
    data = pd.read_csv(filename)
    data = data.drop(['filename'],axis=1)
    labels = data.iloc[:, -1]
    encoder = preprocessing.LabelEncoder()
    scaler = StandardScaler()
    return scaler.fit_transform(np.array(data.iloc[:, :-1], dtype = float)),encoder.fit_transform(labels)


def train_test_set(data):
    """Splits the data into 70-30% train-test sets

    Args:
        data (array (n-samples, n_features)): data with the features from

    Returns:
        [array]: the train and test set and thei corresponding targets
    """
    train_len = int((len(data)//2)*0.7)
    test_len = (len(data) //2 )- train_len
    mid = len(data)//2
   

    X_train = np.concatenate((data[:train_len],data[mid:mid+train_len]))
    X_test = np.concatenate((data[train_len:mid],data[mid+train_len:]))
    y_train=np.concatenate((np.ones(train_len),np.zeros(train_len)))
    y_test=np.concatenate((np.ones(test_len),np.zeros(test_len)))
    return X_train,X_test,y_train,y_test


X,y = get_Librosa_features("data.csv")
x_train,x_test,y_train,y_test= train_test_set(X)
pred_nn = runSVM(x_train,x_test,y_train,"rbf")
getStatistics(pred_nn,y_test)


Accuracy: 0.9903846153846154
Precision: 0.9807692307692307
Recall: 1.0
Confusion matrix [[52  1]
 [ 0 51]]
              precision    recall  f1-score   support

         0.0       1.00      0.98      0.99        53
         1.0       0.98      1.00      0.99        51

    accuracy                           0.99       104
   macro avg       0.99      0.99      0.99       104
weighted avg       0.99      0.99      0.99       104

