In [25]:
!pip install librosa
!pip install transformers==4.28.0 datasets evaluate
!pip install xgboost
!pip install cmake



Running `brew update --auto-update`...
[34m==>[0m [1mAuto-updated Homebrew![0m
Updated 2 taps (homebrew/core and homebrew/cask).
[34m==>[0m [1mNew Formulae[0m
ansible@7                                melange
apko                                     mods
aws-amplify                              nexttrace
bashate                                  openfga
boolector                                procps@3
clive                                    roblox-ts
fastgron                                 rojo
git-credential-oauth                     rye
joshuto                                  shodan
libabigail                               spotify_player
libecpint                                swift-outdated
libfastjson                              tailwindcss-language-server
libint                                   tern
libmediainfo                             votca
libomemo-c                               wzprof
libpaho-mqtt                             yamlfmt
libzen
[34m==>[0m [1mNew

In [27]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
import evaluate
import librosa
import random
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from datasets import load_dataset, Audio
from sklearn.metrics import accuracy_score
from xgboost.sklearn import XGBClassifier
from lightgbm import LGBMClassifier

ModuleNotFoundError: No module named 'lightgbm'

In [2]:
CFG = {
    'SR':16000,
    'N_MFCC':32, # Melspectrogram 벡터를 추출할 개수
    'SEED':42
}

In [3]:
train_df = pd.read_csv('./train.csv')
print(train_df.info())
print()
test_df = pd.read_csv('./test.csv')
print(test_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      5001 non-null   object
 1   path    5001 non-null   object
 2   label   5001 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 117.3+ KB
None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1881 entries, 0 to 1880
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      1881 non-null   object
 1   path    1881 non-null   object
dtypes: object(2)
memory usage: 29.5+ KB
None


In [4]:
# Folder Locations
dataset = "./"

In [5]:
def get_mfcc_feature(df):
    features = []
    for path in tqdm(df['path']):
        # librosa패키지를 사용하여 wav 파일 load
        y, sr = librosa.load(path, sr=CFG['SR'])
        # librosa패키지를 사용하여 mfcc 추출
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CFG['N_MFCC'])
        y_feature = []
        # 추출된 MFCC들의 평균을 Feature로 사용
        for e in mfcc:
            y_feature.append(np.mean(e))
        features.append(y_feature)

    mfcc_df = pd.DataFrame(features, columns=['mfcc_'+str(x) for x in range(1,CFG['N_MFCC']+1)])
    return mfcc_df


In [6]:
train_x = get_mfcc_feature(train_df)
print(train_x)
test_x = get_mfcc_feature(test_df)
print(test_x)

  0%|          | 0/5001 [00:00<?, ?it/s]

          mfcc_1      mfcc_2     mfcc_3     mfcc_4     mfcc_5    mfcc_6  \
0    -414.755737  110.100639  46.699074  23.939814  14.766221  4.820827   
1    -399.769531   83.051300  55.473316  31.782587  22.040754  0.985082   
2    -341.145081   97.399071  38.274349  19.811539   0.731027  0.838704   
3    -376.963715  118.961670  34.490349  24.178417  -1.065604 -1.613391   
4    -352.863220  117.553337  29.948687  31.094315   5.406391 -5.591998   
...          ...         ...        ...        ...        ...       ...   
4996 -416.181305  112.938484  47.294231  23.111433  16.048231  6.795750   
4997 -237.811432   72.207787   3.371584  14.646128 -10.546066  7.671333   
4998 -368.228119  103.887871  40.081417  21.763754  14.120455  5.778781   
4999 -407.114288  103.880676  44.980690  23.265039  13.279220  6.904113   
5000 -359.754608   89.241714  20.517361  24.043827  16.926319 -7.636394   

        mfcc_7     mfcc_8    mfcc_9    mfcc_10  ...   mfcc_23   mfcc_24  \
0     1.445079  -0.92615

  0%|          | 0/1881 [00:00<?, ?it/s]

          mfcc_1      mfcc_2     mfcc_3     mfcc_4     mfcc_5    mfcc_6  \
0    -335.757324  125.215431  22.145767  14.351713  -1.045251  0.567860   
1    -295.973053   92.839684  24.976181  22.831310 -10.278670  7.804742   
2    -444.395996  109.385201  55.236771  26.486050  12.487712  9.069915   
3    -384.600220  112.463974  47.454556  17.336460  13.556947  4.677102   
4    -273.304077   97.817047  12.370095  24.591480   3.248469 -9.987856   
...          ...         ...        ...        ...        ...       ...   
1876 -250.254913   69.372955  17.328987  12.005389  -2.463175 -7.652928   
1877 -348.593842  107.067047  27.683287  17.207047   2.634121  1.812742   
1878 -295.658112   99.606911  -9.669126   4.676853   4.752311 -5.689676   
1879 -450.677094  122.865677  46.413559  21.616255  13.785479  5.433998   
1880 -366.914154  113.387276  43.263287  15.806440   8.695131  6.917760   

        mfcc_7     mfcc_8    mfcc_9    mfcc_10  ...   mfcc_23   mfcc_24  \
0    -0.666742  -8.11483

In [8]:
print(train_x.info())
print('\n \n \n')
print(test_x.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 32 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   mfcc_1   5001 non-null   float32
 1   mfcc_2   5001 non-null   float32
 2   mfcc_3   5001 non-null   float32
 3   mfcc_4   5001 non-null   float32
 4   mfcc_5   5001 non-null   float32
 5   mfcc_6   5001 non-null   float32
 6   mfcc_7   5001 non-null   float32
 7   mfcc_8   5001 non-null   float32
 8   mfcc_9   5001 non-null   float32
 9   mfcc_10  5001 non-null   float32
 10  mfcc_11  5001 non-null   float32
 11  mfcc_12  5001 non-null   float32
 12  mfcc_13  5001 non-null   float32
 13  mfcc_14  5001 non-null   float32
 14  mfcc_15  5001 non-null   float32
 15  mfcc_16  5001 non-null   float32
 16  mfcc_17  5001 non-null   float32
 17  mfcc_18  5001 non-null   float32
 18  mfcc_19  5001 non-null   float32
 19  mfcc_20  5001 non-null   float32
 20  mfcc_21  5001 non-null   float32
 21  mfcc_22  5001 

In [9]:
train_y = train_df['label']
print(train_y)

0       1
1       2
2       4
3       5
4       4
       ..
4996    5
4997    0
4998    1
4999    1
5000    4
Name: label, Length: 5001, dtype: int64


In [None]:
model = DecisionTreeClassifier(random_state=CFG['SEED'])
model.fit(train_x, train_y)

In [None]:
preds = model.predict(test_x)

In [None]:
submission = pd.read_csv(dataset + 'sample_submission.csv')
submission['label'] = preds
submission.to_csv(dataset + "baseline_submission.csv", index=False)

In [11]:
rf = RandomForestClassifier(n_estimators=50,
                           max_depth=4,
                           min_samples_split=2,
                           max_features=0.85,
                           n_jobs=-1,
                           random_state=CFG['SEED'])

In [16]:
X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, test_size=0.2, random_state=CFG['SEED'])
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(4000, 32) (1001, 32) (4000,) (1001,)


In [12]:
rf.fit(train_x, train_y)

In [17]:
print("-- Random Forest --")
print("Train ACC : %.3f" % accuracy_score(y_train, rf.predict(X_train)))
print("Val ACC : %.3f" % accuracy_score(y_val, rf.predict(X_val)))


-- Random Forest --
Train ACC : 0.423
Val ACC : 0.431


In [19]:
X_test = pd.get_dummies(data=test_x)
pred = rf.predict(X_test)
pred

array([5, 0, 2, ..., 0, 2, 4])

In [20]:
submission = pd.read_csv(dataset + 'sample_submission.csv')
submission['label'] = pred
submission.to_csv(dataset + "baseline_submission.csv", index=False)