In [None]:
import pandas as pd
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

<h1> Loading Development Dataset </h1>

In [5]:
df_dev = pd.read_csv('development.csv', index_col=0)

In [6]:
df_dev.head()

Unnamed: 0_level_0,path,speakerId,action,object,Self-reported fluency level,First Language spoken,Current language used for work/school,gender,ageRange
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,dsl_data/audio/speakers/2BqVo8kVB2Skwgyb/0a312...,2BqVo8kVB2Skwgyb,change language,none,advanced,English (United States),English (United States),female,22-40
1,dsl_data/audio/speakers/2BqVo8kVB2Skwgyb/0ee42...,2BqVo8kVB2Skwgyb,activate,music,advanced,English (United States),English (United States),female,22-40
2,dsl_data/audio/speakers/2BqVo8kVB2Skwgyb/1d9f3...,2BqVo8kVB2Skwgyb,deactivate,lights,advanced,English (United States),English (United States),female,22-40
3,dsl_data/audio/speakers/2BqVo8kVB2Skwgyb/269fc...,2BqVo8kVB2Skwgyb,increase,volume,advanced,English (United States),English (United States),female,22-40
4,dsl_data/audio/speakers/2BqVo8kVB2Skwgyb/5bbda...,2BqVo8kVB2Skwgyb,increase,volume,advanced,English (United States),English (United States),female,22-40


<h1> Preprocess </h1>

In [7]:
df_dev.drop(['speakerId', 'Self-reported fluency level ', 'First Language spoken', 'Current language used for work/school',
            'gender', 'ageRange'], axis=1, inplace=True)

In [8]:
def load_file(path):

    y, sr = librosa.load(path)
    y, _ = librosa.effects.trim(y=y, top_db=10, hop_length=10)
    duration = librosa.get_duration(y=y, sr=sr)
        
    return y, duration

In [9]:
df_dev[['info', 'duration']] = df_dev.apply(lambda r: load_file(r['path']), axis=1, result_type='expand')

In [11]:
df_dev.drop('path', axis=1, inplace=True)

In [12]:
def map_duration(duration):
    
    if 0 <= duration < 1:
        return '0-1'
    elif 1 <= duration < 2:
        return '1-2'
    elif 2 <= duration < 3:
        return '2-3'
    elif 3 <= duration < 4:
        return '3-4'
    elif 4 <= duration < 5:
        return '4-5'
    else:
        return '>=5'

In [13]:
df_dev['duration_interval'] = df_dev.apply(lambda r: map_duration(r['duration']), axis=1)

In [14]:
df_dev['duration_interval'].value_counts()

0-1    7168
1-2    2451
2-3     208
3-4      25
>=5       2
Name: duration_interval, dtype: int64

In [15]:
print('Mean of duration -> ', df_dev['duration'].mean())
print('Standard Deviation of duration -> ', df_dev['duration'].std())
print('Max of duration -> ', df_dev['duration'].max())
print('Min of duration -> ', df_dev['duration'].min())

Mean of duration ->  0.8714597937138457
Standard Deviation of duration ->  0.41307107664575854
Max of duration ->  6.834920634920635
Min of duration ->  0.11156462585034013


In [16]:
df_dev.drop('duration_interval', axis=1, inplace=True)

In [17]:
def remove_outliers(df, threshold):
    
    print("Old Shape: ", df.shape)

    upper = np.where(df['duration'] >= threshold) 
    df.drop(upper[0], inplace = True)

    print("New Shape: ", df.shape)
    
    return df

In [18]:
df_dev = remove_outliers(df_dev, 3)

Old Shape:  (9854, 4)
New Shape:  (9827, 4)


In [19]:
df_dev.drop('duration', axis=1, inplace=True)

In [20]:
def add_padding(info, upper_bound, forTest=False):
    
    if forTest: # for test set
      
      if len(info) > upper_bound: # crop the file
        info = info[:upper_bound]
      else:
        pad = np.zeros(upper_bound - len(info), dtype=np.float32)
        info = np.concatenate([info, pad],0)
      
    else: # for train set
      pad = np.zeros(upper_bound - len(info), dtype=np.float32)
      info = np.concatenate([info, pad],0)
    
    return info

In [21]:
info_length = [ len(info) for info in df_dev['info'] ]
max_length = max(info_length)

In [22]:
df_dev['info'] = df_dev.apply(lambda r: add_padding(r['info'], max_length), axis=1)

In [23]:
df_dev['mfcc'] = df_dev.apply(lambda r: librosa.feature.mfcc(y=r['info'], sr=22050), axis=1)

In [25]:
#def_dev_backup = df_dev.copy()

In [26]:
#df_dev = def_dev_backup.copy()

In [27]:
df_dev.drop('info', axis=1, inplace=True)

In [29]:
df_dev['mfcc'] = df_dev.apply(lambda r: r['mfcc'].flatten(), axis=1)

In [30]:
df_dev.iloc[0]['mfcc'].shape

(2600,)

In [31]:
col_names = [f'mfcc{i}' for i in range(2600)]
df_new_features = pd.DataFrame(df_dev['mfcc'].tolist(), index=df_dev.index, columns=col_names)
df_dev = pd.concat((df_dev, df_new_features), axis=1)

In [32]:
df_dev.drop(['mfcc'], axis=1, inplace=True)

In [125]:
y = df_dev['action'] + df_dev['object']
X = df_dev.drop(['action', 'object'], axis=1)

In [126]:
pca_scaler = PCA(n_components=50).fit(X)

In [127]:
X_PCA = pca_scaler.transform(X)

In [128]:
scaler = StandardScaler().fit(X_PCA)

In [129]:
X_transformed = scaler.transform(X_PCA)

In [133]:
X_train, X_val, y_train, y_val = train_test_split(X_transformed, y, test_size=0.2, shuffle=True, stratify=y)

<h1> Simple Model </h1>

In [134]:
svc = SVC(kernel='rbf', C=4, gamma='scale', class_weight='balanced').fit(X_train, y_train)
y_pred = svc.predict(X_val)
accuracy_score(y_val, y_pred)

0.8894098179812465

<h1> Train on Whole the Dataset </h1>

In [135]:
final_svc = SVC(kernel='rbf', C=4, gamma='scale', class_weight='balanced').fit(X_transformed, y)
y_pred = svc.predict(X_transformed)
accuracy_score(y, y_pred)

0.968450082735797

<h1> Submission </h1>

In [55]:
df_test = pd.read_csv('evaluation.csv', index_col=0)

In [56]:
df_test[['info', 'duration']] = df_test.apply(lambda r: load_file(r['path']), axis=1, result_type='expand')

In [57]:
df_test = df_test[['info', 'duration']]

In [58]:
df_test['info'] = df_test.apply(lambda r: add_padding(r['info'], max_length, forTest=True), axis=1)

In [59]:
df_test['mfcc'] = df_test.apply(lambda r: librosa.feature.mfcc(y=r['info'], sr=22050), axis=1)

In [60]:
df_test.drop('duration', axis=1, inplace=True)

In [61]:
df_test['mfcc'] = df_test.apply(lambda r: r['mfcc'].flatten(), axis=1)

In [62]:
df_test.iloc[0]['mfcc'].shape

(2600,)

In [63]:
col_names = [f'mfcc{i}' for i in range(2600)]
df_new_features = pd.DataFrame(df_test['mfcc'].tolist(), index=df_test.index, columns=col_names)
df_test = pd.concat((df_test, df_new_features), axis=1)

In [64]:
df_test.drop(['info', 'mfcc'], axis=1, inplace=True)

In [65]:
X_test = pca_scaler.transform(df_test)

In [66]:
X_test = scaler.transform(X_test)

In [136]:
y_test = final_svc.predict(X_test)

In [137]:
test_index = [i for i in range(len(y_test))]
submit = pd.DataFrame(y_test, index=test_index, columns=['Predicted'])

In [138]:
submit.to_csv('submit.csv')