In [352]:
import pandas as pd
import numpy as np
from scipy.io import wavfile
from scipy.signal import stft
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
df_dev = pd.read_csv('development.csv', index_col=0)
print('The shape of development dataset is ', df_dev.shape)
df_dev.head()

The shape of development dataset is  (9854, 9)


Unnamed: 0_level_0,path,speakerId,action,object,Self-reported fluency level,First Language spoken,Current language used for work/school,gender,ageRange
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,dsl_data/audio/speakers/2BqVo8kVB2Skwgyb/0a312...,2BqVo8kVB2Skwgyb,change language,none,advanced,English (United States),English (United States),female,22-40
1,dsl_data/audio/speakers/2BqVo8kVB2Skwgyb/0ee42...,2BqVo8kVB2Skwgyb,activate,music,advanced,English (United States),English (United States),female,22-40
2,dsl_data/audio/speakers/2BqVo8kVB2Skwgyb/1d9f3...,2BqVo8kVB2Skwgyb,deactivate,lights,advanced,English (United States),English (United States),female,22-40
3,dsl_data/audio/speakers/2BqVo8kVB2Skwgyb/269fc...,2BqVo8kVB2Skwgyb,increase,volume,advanced,English (United States),English (United States),female,22-40
4,dsl_data/audio/speakers/2BqVo8kVB2Skwgyb/5bbda...,2BqVo8kVB2Skwgyb,increase,volume,advanced,English (United States),English (United States),female,22-40


In [3]:
df_dev.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9854 entries, 0 to 9853
Data columns (total 9 columns):
 #   Column                                 Non-Null Count  Dtype 
---  ------                                 --------------  ----- 
 0   path                                   9854 non-null   object
 1   speakerId                              9854 non-null   object
 2   action                                 9854 non-null   object
 3   object                                 9854 non-null   object
 4   Self-reported fluency level            9854 non-null   object
 5   First Language spoken                  9854 non-null   object
 6   Current language used for work/school  9854 non-null   object
 7   gender                                 9854 non-null   object
 8   ageRange                               9854 non-null   object
dtypes: object(9)
memory usage: 769.8+ KB


In [4]:
df_dev.isna().sum()

path                                     0
speakerId                                0
action                                   0
object                                   0
Self-reported fluency level              0
First Language spoken                    0
Current language used for work/school    0
gender                                   0
ageRange                                 0
dtype: int64

In [5]:
def read_wavfile(path, spilitter):
    
    sample_rate, data = wavfile.read( path.split(spilitter)[1] )
    channel = data.ndim
    
    if channel == 1:
        no_sample = data.shape[0]
        length = data.shape[0]/sample_rate
    elif channel == 2:
        no_sample = data.shape[1]
        length = data.shape[1]/sample_rate
        
    return sample_rate, length, no_sample, channel, data

In [6]:
df_dev[['sample_rate', 'length', 'no_sample', 'channel', 'data']] = df_dev.apply(lambda r: read_wavfile(r['path'], 'dsl_data/'), axis=1, result_type='expand')

In [7]:
df_dev.head()

Unnamed: 0_level_0,path,speakerId,action,object,Self-reported fluency level,First Language spoken,Current language used for work/school,gender,ageRange,sample_rate,length,no_sample,channel,data
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,dsl_data/audio/speakers/2BqVo8kVB2Skwgyb/0a312...,2BqVo8kVB2Skwgyb,change language,none,advanced,English (United States),English (United States),female,22-40,16000,1.857625,29722,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,dsl_data/audio/speakers/2BqVo8kVB2Skwgyb/0ee42...,2BqVo8kVB2Skwgyb,activate,music,advanced,English (United States),English (United States),female,22-40,16000,1.393188,22291,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,dsl_data/audio/speakers/2BqVo8kVB2Skwgyb/1d9f3...,2BqVo8kVB2Skwgyb,deactivate,lights,advanced,English (United States),English (United States),female,22-40,16000,1.9505,31208,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,dsl_data/audio/speakers/2BqVo8kVB2Skwgyb/269fc...,2BqVo8kVB2Skwgyb,increase,volume,advanced,English (United States),English (United States),female,22-40,16000,1.764687,28235,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,dsl_data/audio/speakers/2BqVo8kVB2Skwgyb/5bbda...,2BqVo8kVB2Skwgyb,increase,volume,advanced,English (United States),English (United States),female,22-40,16000,2.414875,38638,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [8]:
df_dev['class'] = df_dev['action'] + df_dev['object'].str.replace('none','')

In [9]:
df_dev.drop(['path', 'action', 'object'], axis=1, inplace=True)

In [10]:
df_dev.head()

Unnamed: 0_level_0,speakerId,Self-reported fluency level,First Language spoken,Current language used for work/school,gender,ageRange,sample_rate,length,no_sample,channel,data,class
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,2BqVo8kVB2Skwgyb,advanced,English (United States),English (United States),female,22-40,16000,1.857625,29722,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",change language
1,2BqVo8kVB2Skwgyb,advanced,English (United States),English (United States),female,22-40,16000,1.393188,22291,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",activatemusic
2,2BqVo8kVB2Skwgyb,advanced,English (United States),English (United States),female,22-40,16000,1.9505,31208,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",deactivatelights
3,2BqVo8kVB2Skwgyb,advanced,English (United States),English (United States),female,22-40,16000,1.764687,28235,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",increasevolume
4,2BqVo8kVB2Skwgyb,advanced,English (United States),English (United States),female,22-40,16000,2.414875,38638,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",increasevolume


In [11]:
df_dev['channel'].unique()

array([1], dtype=int64)

In [12]:
df_dev.drop('channel', axis=1, inplace=True)

In [None]:
df_dev.head()

Unnamed: 0_level_0,speakerId,Self-reported fluency level,First Language spoken,Current language used for work/school,gender,ageRange,sample_rate,length,no_sample,data,class
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,2BqVo8kVB2Skwgyb,advanced,English (United States),English (United States),female,22-40,16000,1.857625,29722,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",change language
1,2BqVo8kVB2Skwgyb,advanced,English (United States),English (United States),female,22-40,16000,1.393188,22291,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",activatemusic
2,2BqVo8kVB2Skwgyb,advanced,English (United States),English (United States),female,22-40,16000,1.9505,31208,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",deactivatelights
3,2BqVo8kVB2Skwgyb,advanced,English (United States),English (United States),female,22-40,16000,1.764687,28235,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",increasevolume
4,2BqVo8kVB2Skwgyb,advanced,English (United States),English (United States),female,22-40,16000,2.414875,38638,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",increasevolume


In [194]:
def cal_mean_std_blocks(wav_file_info, n, m):
    
    _, _, zxx = stft(wav_file_info)
    zxx = abs(zxx)
    zxx = zxx.T
    
    means = []
    stds = []
    
    for t_segment in np.array_split(zxx, n):
        for f_segment in np.array_split(t_segment, m):
            means.append(np.mean(f_segment))
            stds.append(np.std(f_segment))
            
    return means, stds

In [346]:
no_time_split = 8
no_freq_split = 8
df_dev[['bls_mean', 'bls_std']] = df_dev.apply(lambda r: cal_mean_std_blocks(r['data'], no_time_split, no_freq_split), axis=1, result_type='expand')

In [347]:
mean_cols = [f'mean_block{i}' for i in range(no_time_split*no_freq_split)]
df_means = pd.DataFrame(df_dev['bls_mean'].to_list(), columns=mean_cols)

std_cols = [f'std_block{i}' for i in range(no_time_split*no_freq_split)]
df_stds = pd.DataFrame(df_dev['bls_std'].to_list(), columns=std_cols)

In [348]:
df_mean_std.head()

Unnamed: 0,mean_block0,mean_block1,mean_block2,mean_block3,mean_block4,mean_block5,mean_block6,mean_block7,mean_block8,mean_block9,...,std_block16,std_block17,std_block18,std_block19,std_block20,std_block21,std_block22,std_block23,std_block24,class
0,0.100435,0.070277,0.062315,0.130807,1.970491,21.76763,50.42664,32.568481,15.80205,20.866478,...,48.59708,41.620243,20.223986,8.963957,6.459994,2.684572,0.747801,0.076364,0.108688,change language
1,0.044593,0.225358,0.128988,0.047846,2.718322,18.060852,16.116852,11.583518,13.868074,20.262104,...,7.217708,2.315052,1.973218,1.487104,1.82388,0.440556,0.083418,0.08284,0.043447,activatemusic
2,0.047823,0.059638,0.049946,0.056581,0.06003,0.069523,3.700343,2.000903,0.849942,0.267601,...,17.327511,7.293402,1.367586,12.28073,5.015636,3.765346,1.203516,0.357674,0.96825,deactivatelights
3,0.047141,0.071675,0.054661,0.049124,0.053018,0.470069,9.034513,33.770851,42.587677,18.373556,...,0.219264,140.90094,300.181702,31.719004,0.478694,5.698944,1.057382,0.111497,0.10455,increasevolume
4,0.05462,0.141274,0.041251,0.042108,0.031819,0.032132,0.057714,2.333204,11.196895,7.592543,...,27.33699,22.748116,0.144054,0.143901,0.240463,51.050827,0.93361,0.530071,0.078218,increasevolume


In [349]:
y = df_mean_std['class']
X = df_mean_std.drop(['class'], axis=1)

In [350]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [351]:
rf = RandomForestClassifier(n_estimators=200).fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)

0.4165398274987316