In [1]:
import os
import glob
import librosa
import matplotlib.pyplot as plt
import numpy as np
import sys
from tqdm import tqdm
from scipy.spatial.distance import cosine
import pandas as pd

In [2]:
audio_path_unlabeled = glob.glob(os.path.join('C:./dataset/audioonly/unlabeled', '*.wav'))
dataset_maindir = os.path.join(os.getcwd(), 'dataset')
sub_dir_name = next(os.walk(dataset_maindir))[1]
sub_dir_name

['.ipynb_checkpoints', 'audioonly', 'spectrograms']

In [3]:
sub_dir_path = os.path.join(dataset_maindir, sub_dir_name[1])
sub_dir_subpath = next(os.walk(sub_dir_path))[1]
sub_dir_subpath

['labeled', 'unlabeled']

In [4]:
class_folder_path = os.path.join(sub_dir_path, sub_dir_subpath[0])
class_lst = next(os.walk(class_folder_path))[1]
class_path = [os.path.join(class_folder_path, type) for type in class_lst]
class_path

['C:\\Users\\dave\\aiffel\\EUANGGG\\maincode\\data\\dataset\\audioonly\\labeled\\belly_pain',
 'C:\\Users\\dave\\aiffel\\EUANGGG\\maincode\\data\\dataset\\audioonly\\labeled\\burping',
 'C:\\Users\\dave\\aiffel\\EUANGGG\\maincode\\data\\dataset\\audioonly\\labeled\\discomfort',
 'C:\\Users\\dave\\aiffel\\EUANGGG\\maincode\\data\\dataset\\audioonly\\labeled\\hungry',
 'C:\\Users\\dave\\aiffel\\EUANGGG\\maincode\\data\\dataset\\audioonly\\labeled\\tired']

In [5]:
audio_path_labeled = [glob.glob(os.path.join(folder,'*.wav')) for folder in class_path]

In [6]:
bellypain_path = audio_path_labeled[0]
burping_path = audio_path_labeled[1]
discomfort_path = audio_path_labeled[2]
hungry_path = audio_path_labeled[3]
tired_path = audio_path_labeled[4]

In [7]:
# Bellypain audio to MFCCs
spectro_belly = []

for path in bellypain_path:
    y, sr = librosa.load(path, sr = 44100)
    mfccs = librosa.feature.mfcc(y=y, sr=sr)
    spectro_belly.append(mfccs)

In [8]:
# burping audio to MFCCs
spectro_burp = []

for path in burping_path:
    y, sr = librosa.load(path, sr = 44100)
    mfccs = librosa.feature.mfcc(y=y, sr=sr)
    spectro_burp.append(mfccs)

In [9]:
# discomfort audio to MFCCs
spectro_discomfort = []

for path in discomfort_path:
    y, sr = librosa.load(path, sr = 44100)
    mfccs = librosa.feature.mfcc(y=y, sr=sr)
    spectro_discomfort.append(mfccs)

In [10]:
# hungry audio to MFCCs
spectro_hungry = []

for path in hungry_path:
    y, sr = librosa.load(path, sr = 44100)
    mfccs = librosa.feature.mfcc(y=y, sr=sr)
    spectro_hungry.append(mfccs)

In [11]:
# tired audio to MFCCs
spectro_tired = []

for path in tired_path:
    y, sr = librosa.load(path, sr = 44100)
    mfccs = librosa.feature.mfcc(y=y, sr=sr)
    spectro_tired.append(mfccs)

In [12]:
# tired audio to MFCCs
spectro_unlabeled = []

for path in tqdm(audio_path_unlabeled):
    y, sr = librosa.load(path, sr = 44100)
    mfccs = librosa.feature.mfcc(y=y, sr=sr)
    spectro_unlabeled.append(mfccs)

100%|███████████████████████████████████████████████████████████████████████████| 18190/18190 [01:42<00:00, 176.90it/s]


In [13]:
# class 별 shape 체크
shape_count = {}
total_mfcc = [spectro_belly,
              spectro_burp,
              spectro_discomfort,
              spectro_tired,
              spectro_hungry,
              spectro_unlabeled]

for mfcc_lst in total_mfcc:
    for mfcc in mfcc_lst:
        shape = mfcc.shape
        shape_count[shape] = shape_count.get(shape, 0) + 1

print(shape_count)

{(20, 591): 23, (20, 603): 182, (20, 595): 32, (20, 600): 24, (20, 588): 7, (20, 590): 9, (20, 574): 15, (20, 564): 3, (20, 576): 8, (20, 609): 18, (20, 562): 2, (20, 597): 40, (20, 584): 20, (20, 598): 27, (20, 593): 12, (20, 602): 2, (20, 579): 6, (20, 607): 5, (20, 581): 6, (20, 583): 4, (20, 571): 3, (20, 572): 8, (20, 605): 2, (20, 74): 201, (20, 90): 132, (20, 96): 100, (20, 153): 22, (20, 83): 177, (20, 150): 27, (20, 64): 266, (20, 67): 246, (20, 51): 240, (20, 55): 266, (20, 61): 281, (20, 70): 242, (20, 147): 23, (20, 77): 179, (20, 48): 233, (20, 58): 273, (20, 93): 142, (20, 69): 232, (20, 63): 261, (20, 68): 267, (20, 42): 169, (20, 40): 158, (20, 71): 243, (20, 46): 197, (20, 103): 107, (20, 39): 130, (20, 49): 201, (20, 53): 271, (20, 22): 80, (20, 36): 115, (20, 72): 218, (20, 52): 267, (20, 87): 139, (20, 38): 130, (20, 54): 288, (20, 44): 209, (20, 76): 212, (20, 33): 109, (20, 187): 5, (20, 105): 81, (20, 85): 167, (20, 73): 202, (20, 75): 183, (20, 106): 90, (20, 84

In [28]:
shape_lst = list(shape_count.keys())

min_value = 20 * 8

for x, y in shape_lst:
    product = x * y
    if product < min_value:
        min_value = product
        print(x, y)

## **MFCC 리사이즈 및 벡터화**

In [29]:
# labeled 와 unlabeled 리사이징 및 백터화
spectro_belly_resized = [np.array(mfcc).flatten()[:160] for mfcc in spectro_belly]
spectro_burp_resized = [np.array(mfcc).flatten()[:160] for mfcc in spectro_burp]
spectro_discomfort_resized = [np.array(mfcc).flatten()[:160] for mfcc in spectro_discomfort]
spectro_hungry_resized = [np.array(mfcc).flatten()[:160] for mfcc in spectro_hungry]
spectro_tired_resized = [np.array(mfcc).flatten()[:160] for mfcc in spectro_tired]
spectro_unlabeled_resized = [np.array(mfcc).flatten()[:160] for mfcc in spectro_unlabeled]

## **코사인 유사도 계산**

In [30]:
# 평균벡터 추출
def mean_vector(vectors):
    return np.mean(vectors, axis=0)

mean_bellypain = mean_vector(spectro_belly_resized)
mean_burp = mean_vector(spectro_burp_resized)
mean_discomfort = mean_vector(spectro_discomfort_resized)
mean_hungry = mean_vector(spectro_hungry_resized)
mean_tired = mean_vector(spectro_tired_resized)

In [31]:
# 코사인 유사도 계산
def cosine_similarity(vecA, vecB):
    return 1 - cosine(vecA, vecB)
    
similarity_result = []

for idx, unlabeled_vector in enumerate(spectro_unlabeled_resized):
    similarity_bellypain = cosine_similarity(unlabeled_vector, mean_bellypain)
    similarity_burp = cosine_similarity(unlabeled_vector, mean_burp)
    similarity_discomfort = cosine_similarity(unlabeled_vector, mean_discomfort)
    similarity_hungry = cosine_similarity(unlabeled_vector, mean_hungry)
    similarity_tired = cosine_similarity(unlabeled_vector, mean_tired)
    
    # 결과 값을 딕셔너리로 저장
    similarity_result.append({
        'Index': f'unlabeled_audio{idx}',
        'Bellypain': similarity_bellypain,
        'Burp': similarity_burp,
        'Discomfort': similarity_discomfort,
        'Hungry': similarity_hungry,
        'Tired': similarity_tired
    })

In [32]:
sim_df = pd.DataFrame(similarity_result)
sim_df

Unnamed: 0,Index,Bellypain,Burp,Discomfort,Hungry,Tired
0,unlabeled_audio0,0.545146,0.593167,0.553062,0.570133,0.538193
1,unlabeled_audio1,0.623908,0.651351,0.614243,0.638407,0.613302
2,unlabeled_audio2,0.657007,0.680377,0.642860,0.667123,0.644799
3,unlabeled_audio3,0.962085,0.957416,0.961671,0.962727,0.962675
4,unlabeled_audio4,0.529136,0.564362,0.526292,0.547657,0.519655
...,...,...,...,...,...,...
18185,unlabeled_audio18185,0.636217,0.655606,0.618666,0.643155,0.623578
18186,unlabeled_audio18186,0.537855,0.564344,0.523123,0.549603,0.524333
18187,unlabeled_audio18187,0.609334,0.629066,0.592047,0.616463,0.597207
18188,unlabeled_audio18188,0.610398,0.636664,0.596575,0.620599,0.598786


In [41]:
cond = (sim_df['Bellypain'] > 0.90) & (sim_df['Burp'] > 0.90) & (sim_df['Discomfort'] > 0.90) & (sim_df['Hungry'] > 0.90) & (sim_df['Tired'] > 0.90)
filtered_df = sim_df[cond]

In [42]:
filtered_df

Unnamed: 0,Index,Bellypain,Burp,Discomfort,Hungry,Tired
3,unlabeled_audio3,0.962085,0.957416,0.961671,0.962727,0.962675
5,unlabeled_audio5,0.944832,0.927460,0.939932,0.938659,0.945168
18,unlabeled_audio18,0.932029,0.919998,0.923365,0.927707,0.931157
54,unlabeled_audio54,0.952810,0.950004,0.947300,0.953258,0.950698
70,unlabeled_audio70,0.977436,0.957842,0.976314,0.970411,0.980005
...,...,...,...,...,...,...
18045,unlabeled_audio18045,0.996839,0.986044,0.994445,0.993346,0.996736
18046,unlabeled_audio18046,0.923675,0.915510,0.915523,0.921099,0.922427
18138,unlabeled_audio18138,0.998924,0.994205,0.996667,0.998290,0.998214
18152,unlabeled_audio18152,0.937940,0.926883,0.934944,0.934607,0.938041


In [43]:
# Select only numeric columns for the operation
numeric_df = filtered_df.select_dtypes(include=[np.number])

# Find the column name with the maximum value for each row
filtered_df['Max_Column'] = numeric_df.idxmax(axis=1)

# Find the maximum value for each row
filtered_df['Max_Value'] = numeric_df.max(axis=1)
filtered_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Max_Column'] = numeric_df.idxmax(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Max_Value'] = numeric_df.max(axis=1)


Unnamed: 0,Index,Bellypain,Burp,Discomfort,Hungry,Tired,Max_Column,Max_Value
3,unlabeled_audio3,0.962085,0.957416,0.961671,0.962727,0.962675,Hungry,0.962727
5,unlabeled_audio5,0.944832,0.927460,0.939932,0.938659,0.945168,Tired,0.945168
18,unlabeled_audio18,0.932029,0.919998,0.923365,0.927707,0.931157,Bellypain,0.932029
54,unlabeled_audio54,0.952810,0.950004,0.947300,0.953258,0.950698,Hungry,0.953258
70,unlabeled_audio70,0.977436,0.957842,0.976314,0.970411,0.980005,Tired,0.980005
...,...,...,...,...,...,...,...,...
18045,unlabeled_audio18045,0.996839,0.986044,0.994445,0.993346,0.996736,Bellypain,0.996839
18046,unlabeled_audio18046,0.923675,0.915510,0.915523,0.921099,0.922427,Bellypain,0.923675
18138,unlabeled_audio18138,0.998924,0.994205,0.996667,0.998290,0.998214,Bellypain,0.998924
18152,unlabeled_audio18152,0.937940,0.926883,0.934944,0.934607,0.938041,Tired,0.938041


In [44]:
counts = filtered_df['Max_Column'].value_counts()

In [45]:
counts

Max_Column
Bellypain     647
Tired         382
Hungry        145
Discomfort     96
Burp           26
Name: count, dtype: int64

In [46]:
prev_count = {
    '복통': len(bellypain_path),
    '트림': len(burping_path),
    '불편함': len(discomfort_path),
    '배고픔': len(hungry_path),
    '피곤함': len(tired_path)
}

prev_count_df = pd.DataFrame(list(prev_count.items()), columns=['종류', 'Count'])
prev_count_df

Unnamed: 0,종류,Count
0,복통,16
1,트림,8
2,불편함,27
3,배고픔,382
4,피곤함,24


In [47]:
data_counts = {
    '복통': len(bellypain_path) + counts['Bellypain'],
    '트림': len(burping_path) + counts['Burp'],
    '불편함': len(discomfort_path) + counts['Discomfort'],
    '배고픔': len(hungry_path) + counts['Hungry'],
    '피곤함': len(tired_path) + counts['Tired']
}

datacount_df = pd.DataFrame(list(data_counts.items()), columns=['종류', 'Count'])
datacount_df

Unnamed: 0,종류,Count
0,복통,663
1,트림,34
2,불편함,123
3,배고픔,527
4,피곤함,406


In [48]:
print("유사도 검사 전 데이터 수: {}".format(prev_count_df['Count'].sum()))
print("데이터 편입했을 경우 데이터 수: {}".format(datacount_df['Count'].sum()))

유사도 검사 전 데이터 수: 457
데이터 편입했을 경우 데이터 수: 1753
