In [1]:
# initialize
import glob
import parselmouth
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# filepath for the dataset
# root = "/Users/leochoo/dev/GP2-dev/SVD"
root = "./testSVD"

my_data_path = root + "/my_data"
healthy_path = root + "/healthy"
patho_path = root + "/pathological"



In [2]:
from tqdm import tqdm
from time import sleep

def get_voice_data(_path):
    # select .wav files only
    wav_files = glob.glob(_path + "/*.wav")

    n_list = []
    tone_list = []
    syllab_list = []

    j_list = []
    s_list = []
    h_list = []

    # for wav_file in wav_files:
    for wav_file in tqdm(wav_files): # tqdm shows the progress bar
        sound = parselmouth.Sound(wav_file) # sound object from wav file
        pitch = sound.to_pitch()
        pulses = parselmouth.praat.call([sound, pitch], "To PointProcess (cc)")

        # name analysis
        name = os.path.basename(wav_file).split(".")[0]  
        
        ## tone
        if "l" in name:
            tone_list.append("l")
        elif "n" in name:
            tone_list.append("n")
        elif "h" in name:
            tone_list.append("h")

        ## syllable
        if "a" in name:
            syllab_list.append("a")
        elif "i" in name:
            syllab_list.append("i")
        elif "u" in name:
            syllab_list.append("u")
        # jitter
        jitter_local = parselmouth.praat.call(pulses, "Get jitter (local)", 0.0, 0.0, 0.0001, 0.02, 1.3) * 100

        # shimmer
        shimmer_local = parselmouth.praat.call([sound, pulses], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6)

        # HNR
        harmonicity = parselmouth.praat.call(sound, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)
        hnr = parselmouth.praat.call(harmonicity, "Get mean", 0, 0)

        # Append to dataframe
        n_list.append(name)
        j_list.append(jitter_local)
        s_list.append(shimmer_local)
        h_list.append(hnr)


    # create dataframe
    df = pd.DataFrame({"Name":pd.Series(n_list),
                        "Type": np.nan,
                        "Tone": pd.Series(tone_list),
                        "Syllab": pd.Series(syllab_list),
                           "Jitter":pd.Series(j_list),
                           "Shimmer":pd.Series(s_list),
                           "HNR":pd.Series(h_list)})
    df["Type"]= _path.split("/")[-1] # identify type: my_data, healthy, functional etc...
    df = df.dropna()
    return df


In [3]:
# get data and show dataframe
my_data_df = get_voice_data(my_data_path)
my_data_df.head()

healthy_df = get_voice_data(healthy_path)
healthy_df.head()

functional_df = get_voice_data(patho_path + "/functional")
functional_df.head()

hyperfunctional_df = get_voice_data(patho_path + "/hyperfunctional")
hyperfunctional_df.head()

organic_df = get_voice_data(patho_path + "/organic")
organic_df.head()

psychogenic_df = get_voice_data(patho_path + "/psychogenic")

100%|██████████| 9/9 [00:01<00:00,  4.56it/s]
100%|██████████| 18/18 [00:01<00:00, 10.35it/s]
100%|██████████| 18/18 [00:01<00:00, 14.20it/s]
100%|██████████| 18/18 [00:01<00:00,  9.72it/s]
100%|██████████| 18/18 [00:02<00:00,  8.91it/s]
100%|██████████| 27/27 [00:02<00:00, 10.69it/s]


In [4]:
# Combine to one dataframe
frames = [my_data_df, healthy_df, functional_df, hyperfunctional_df, organic_df, psychogenic_df]
new_df = pd.concat(frames)

df2 = new_df.loc[ (new_df["Type"].isin(["healthy", "psychogenic"])) & (new_df["Tone"] == "l") & (new_df["Syllab"] == "a")]

df2 = df2.drop(columns=["Name", "Syllab", "Tone"])
print(df2)
df2.to_csv ("./SVM-test.csv", index = False, header=True)

Type    Jitter   Shimmer        HNR
13      healthy  0.500103  0.024036  25.397455
15      healthy  0.191396  0.010283  29.784809
6   psychogenic  0.508194  0.076313  20.207709
19  psychogenic  0.254183  0.030982  27.394054
22  psychogenic  0.100698  0.010602  32.236174


In [5]:
df2

Unnamed: 0,Type,Jitter,Shimmer,HNR
13,healthy,0.500103,0.024036,25.397455
15,healthy,0.191396,0.010283,29.784809
6,psychogenic,0.508194,0.076313,20.207709
19,psychogenic,0.254183,0.030982,27.394054
22,psychogenic,0.100698,0.010602,32.236174


In [6]:
from sklearn import preprocessing

In [7]:
# DataFrame
df = df2
df

Unnamed: 0,Type,Jitter,Shimmer,HNR
13,healthy,0.500103,0.024036,25.397455
15,healthy,0.191396,0.010283,29.784809
6,psychogenic,0.508194,0.076313,20.207709
19,psychogenic,0.254183,0.030982,27.394054
22,psychogenic,0.100698,0.010602,32.236174


In [23]:
# Create a label encoder object 
le = preprocessing.LabelEncoder()
# Fit the encoder object (le) to a pandas field with categorical data
le.fit(df['Type'])
# Apply the label encoder object to a pandas column
dysphonia_target = le.transform(df['Type']) 
print(dysphonia_target)

[0 0 1 1 1]


In [12]:
import sklearn
from sklearn import datasets
from sklearn import svm
from sklearn import metrics

In [13]:
x = cancer.data
y = dysphonia_target

NameError: name 'cancer' is not defined

In [14]:
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(x, y, test_size=0.2)

NameError: name 'x' is not defined

In [15]:
print(x_train, y_train)

NameError: name 'x_train' is not defined

In [16]:
classes = ['malignant', 'benign']

In [17]:
clf = svm.SVC(kernel="linear", C=7)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print(acc)

NameError: name 'x_train' is not defined

In [18]:
clf = svm.SVC(kernel="poly", C=13)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print(acc)

NameError: name 'x_train' is not defined