In [1]:
import pandas as pd
import numpy as np
from scipy.io import wavfile
from scipy.signal import stft
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
df_dev = pd.read_csv('development.csv', index_col=0)
print('The shape of development dataset is ', df_dev.shape)
df_dev.head()

The shape of development dataset is  (9854, 9)


Unnamed: 0_level_0,path,speakerId,action,object,Self-reported fluency level,First Language spoken,Current language used for work/school,gender,ageRange
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,dsl_data/audio/speakers/2BqVo8kVB2Skwgyb/0a312...,2BqVo8kVB2Skwgyb,change language,none,advanced,English (United States),English (United States),female,22-40
1,dsl_data/audio/speakers/2BqVo8kVB2Skwgyb/0ee42...,2BqVo8kVB2Skwgyb,activate,music,advanced,English (United States),English (United States),female,22-40
2,dsl_data/audio/speakers/2BqVo8kVB2Skwgyb/1d9f3...,2BqVo8kVB2Skwgyb,deactivate,lights,advanced,English (United States),English (United States),female,22-40
3,dsl_data/audio/speakers/2BqVo8kVB2Skwgyb/269fc...,2BqVo8kVB2Skwgyb,increase,volume,advanced,English (United States),English (United States),female,22-40
4,dsl_data/audio/speakers/2BqVo8kVB2Skwgyb/5bbda...,2BqVo8kVB2Skwgyb,increase,volume,advanced,English (United States),English (United States),female,22-40


In [3]:
df_dev.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9854 entries, 0 to 9853
Data columns (total 9 columns):
 #   Column                                 Non-Null Count  Dtype 
---  ------                                 --------------  ----- 
 0   path                                   9854 non-null   object
 1   speakerId                              9854 non-null   object
 2   action                                 9854 non-null   object
 3   object                                 9854 non-null   object
 4   Self-reported fluency level            9854 non-null   object
 5   First Language spoken                  9854 non-null   object
 6   Current language used for work/school  9854 non-null   object
 7   gender                                 9854 non-null   object
 8   ageRange                               9854 non-null   object
dtypes: object(9)
memory usage: 769.8+ KB


In [4]:
df_dev.isna().sum()

path                                     0
speakerId                                0
action                                   0
object                                   0
Self-reported fluency level              0
First Language spoken                    0
Current language used for work/school    0
gender                                   0
ageRange                                 0
dtype: int64

In [5]:
def preprocess(df):
    
    def read_wavfile(path, spilitter):
        '''This is a helper function for preprocess function to
        read content of wav file of which path are provided in the data frame'''
        
        sample_rate, data = wavfile.read( path.split(spilitter)[1] )
        channel = data.ndim
    
        if channel == 1:
            no_sample = data.shape[0]
            length = data.shape[0]/sample_rate
        elif channel == 2:
            no_sample = data.shape[1]
            length = data.shape[1]/sample_rate
        
        return sample_rate, length, no_sample, channel, data

    
    df['class'] = df['action'] + df['object']
    
    df[['sample_rate', 'length', 'no_sample', 'channel', 'data']] = df.apply(lambda r: read_wavfile(r['path'], 'dsl_data/'), axis=1, result_type='expand')
    
    df.drop(['path', 'speakerId', 'action', 'object', 'channel'], axis=1, inplace=True)
    
    df = pd.get_dummies(df, columns=['Self-reported fluency level ', 'First Language spoken', 'Current language used for work/school',
                                     'gender', 'ageRange'], drop_first=True)
    
    return df

In [6]:
df_dev = preprocess(df_dev)

In [7]:
df_dev.head()

Unnamed: 0_level_0,class,sample_rate,length,no_sample,data,Self-reported fluency level _basic,Self-reported fluency level _intermediate,Self-reported fluency level _native,First Language spoken_English (United States),First Language spoken_French (Canada),First Language spoken_Spanish (Venezuela),First Language spoken_Telugu,Current language used for work/school_English (Canada),Current language used for work/school_English (United States),Current language used for work/school_Spanish (Venezuela),gender_male,ageRange_41-65,ageRange_65+
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,change languagenone,16000,1.857625,29722,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,1,0,0,0,0,1,0,0,0,0
1,activatemusic,16000,1.393188,22291,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,1,0,0,0,0,1,0,0,0,0
2,deactivatelights,16000,1.9505,31208,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,1,0,0,0,0,1,0,0,0,0
3,increasevolume,16000,1.764687,28235,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,1,0,0,0,0,1,0,0,0,0
4,increasevolume,16000,2.414875,38638,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,1,0,0,0,0,1,0,0,0,0


In [10]:
def cal_mean_std_blocks(wav_file_info, n, m):
    
    _, _, zxx = stft(wav_file_info)
    zxx = abs(zxx)
    zxx = zxx.T
    
    means = []
    stds = []
        
    for t_segment in np.array_split(zxx, n):
        for f_segment in np.array_split(t_segment, m):
            means.append(np.mean(f_segment))
            stds.append(np.std(f_segment))
            
    return means, stds

In [11]:
no_time_split = 8
no_freq_split = 8
df_dev[['bls_mean', 'bls_std']] = df_dev.apply(lambda r: cal_mean_std_blocks(r['data'], no_time_split, no_freq_split), axis=1, result_type='expand')

In [12]:
df_dev['bls_mean'].isna().sum()

0

In [13]:
df_dev['bls_std'].isna().sum()

0

In [14]:
mean_cols = [f'mean_block{i}' for i in range(no_time_split*no_freq_split)]
df_dev[mean_cols] = pd.DataFrame(df_dev.bls_mean.tolist(), index=df_dev.index)

In [16]:
std_cols = [f'std_block{i}' for i in range(no_time_split*no_freq_split)]
df_dev[std_cols] = pd.DataFrame(df_dev.bls_std.tolist(), index=df_dev.index)

In [17]:
df_dev.head()

Unnamed: 0_level_0,class,sample_rate,length,no_sample,data,Self-reported fluency level _basic,Self-reported fluency level _intermediate,Self-reported fluency level _native,First Language spoken_English (United States),First Language spoken_French (Canada),...,std_block54,std_block55,std_block56,std_block57,std_block58,std_block59,std_block60,std_block61,std_block62,std_block63
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,change languagenone,16000,1.857625,29722,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,1,0,...,3.314595,2.448896,1.225626,0.638054,0.150714,0.077349,0.079379,0.102718,0.136272,0.075969
1,activatemusic,16000,1.393188,22291,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,1,0,...,0.255409,0.116773,0.068444,0.090957,0.095419,0.069118,0.088884,0.046469,0.044967,0.02346
2,deactivatelights,16000,1.9505,31208,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,1,0,...,4.059565,3.676854,1.760384,0.96167,0.231983,0.332496,0.386195,0.53808,1.325023,0.826567
3,increasevolume,16000,1.764687,28235,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,1,0,...,6.821736,6.426651,2.931334,0.209694,0.091654,0.141463,0.100805,0.067995,0.096515,0.128362
4,increasevolume,16000,2.414875,38638,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,1,0,...,52.013561,6.094761,1.475085,0.333232,0.84441,0.424691,0.168112,0.051177,0.085487,0.091179


In [19]:
y = df_dev['class']
X = df_dev.drop(['class', 'data', 'bls_mean', 'bls_std'], axis=1)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [25]:
rf = RandomForestClassifier(n_estimators=200).fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)

0.4058853373921867

In [None]:
from sklearn.svm import SVC

svc = SVC().fit(X_train, y_train)
y_pred = svc.predict(X_test)
accuracy_score(y_test, y_pred)

0.2663622526636225