### Кластеризация дыхания по активностям

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import librosa
import glob
from itertools import chain


### Загрузка файлов

In [4]:
activities = ['Dyspnea', 'Running', 'Sitting', 'Squats', 'Standing', 'Walking']
wavs = {}
for activity in activities:
    wavs[activity] = glob.glob(f'../web_recordings/{activity}/audio/*.wav')

wavs_list = list(chain(*wavs.values()))
wavs_list

['../web_recordings/Dyspnea/audio/Dyspnea_exhale_3.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_exhale_2.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_exhale_1.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_exhale_5.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_exhale_4.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_exhale_6.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_exhale_7.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_inhale_6.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_inhale_7.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_inhale_5.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_inhale_4.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_inhale_1.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_inhale_3.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_inhale_2.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_inhale_9.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_inhale_8.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_exhale_9.wav',
 '../web_recordings/Dyspnea/aud

In [7]:
from tools import get_features

df = get_features.get_features_frame(wavs_list, 2)
df = df.transpose()
df

../web_recordings/Dyspnea/audio/Dyspnea_exhale_3.wav duration = 2.04 seconds
40 frames, 68 short-term features
../web_recordings/Dyspnea/audio/Dyspnea_exhale_2.wav duration = 2.1 seconds
42 frames, 68 short-term features
../web_recordings/Dyspnea/audio/Dyspnea_exhale_1.wav duration = 2.22 seconds
44 frames, 68 short-term features
../web_recordings/Dyspnea/audio/Dyspnea_exhale_5.wav duration = 2.4 seconds
48 frames, 68 short-term features
../web_recordings/Dyspnea/audio/Dyspnea_exhale_4.wav duration = 2.28 seconds
45 frames, 68 short-term features
../web_recordings/Dyspnea/audio/Dyspnea_exhale_6.wav duration = 1.98 seconds
39 frames, 68 short-term features
../web_recordings/Dyspnea/audio/Dyspnea_exhale_7.wav duration = 2.4 seconds
48 frames, 68 short-term features
../web_recordings/Dyspnea/audio/Dyspnea_inhale_6.wav duration = 2.04 seconds
40 frames, 68 short-term features
../web_recordings/Dyspnea/audio/Dyspnea_inhale_7.wav duration = 2.1 seconds
42 frames, 68 short-term features
../we

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,126,127,128,129,130,131,132,133,134,135
Dyspnea_exhale_3.wav,0.0,0.012806,3.317673,0.002670,0.029585,0.000113,0.0,0.0,-45.334891,0.490578,...,-6.866427e-11,1.390560e-08,3.490652e-12,6.858661e-10,1.125015e-09,5.993740e-11,6.627394e-11,8.946676e-11,-4.669298e-11,3.794662e-09
Dyspnea_exhale_2.wav,0.0,0.000804,3.317594,0.007379,0.062256,0.000188,0.0,0.0,-46.882612,-0.084901,...,1.275051e-08,-1.102144e-05,1.612212e-08,1.566443e-05,6.884011e-08,6.282312e-08,7.346283e-07,2.736374e-06,1.169237e-09,-3.734947e-06
Dyspnea_exhale_1.wav,0.0,0.010515,3.317643,0.002303,0.028619,0.000002,0.0,0.0,-46.813120,-0.120158,...,-5.702810e-11,-3.742263e-08,-1.501705e-11,2.361751e-10,2.869218e-10,3.179822e-11,7.285976e-11,8.526325e-11,-7.739882e-11,-1.036263e-08
Dyspnea_exhale_5.wav,0.0,0.012740,3.317729,0.002582,0.029792,0.000094,0.0,0.0,-47.010755,-0.054928,...,-3.329055e-11,-1.533664e-08,-2.336839e-11,-3.190830e-10,-2.435295e-10,-1.918749e-11,1.722668e-11,5.328989e-12,2.177929e-11,-4.223393e-09
Dyspnea_exhale_4.wav,0.0,0.000469,3.317633,0.006905,0.056993,0.001146,0.0,0.0,-48.923292,-0.788085,...,2.846411e-10,-1.914558e-07,1.725791e-11,8.306831e-10,-1.147676e-09,-3.202150e-10,1.135248e-10,2.381244e-11,-6.104102e-11,-5.291137e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Walking_exhale_10.wav,0.0,0.002698,3.317677,0.003698,0.039918,0.000023,0.0,0.0,-46.017261,0.186515,...,3.020963e-06,-2.703819e-05,3.153482e-06,4.461459e-05,-7.424396e-09,7.283267e-07,2.797163e-07,1.916602e-06,1.442800e-08,-9.399886e-06
Walking_inhale_9.wav,0.0,0.008702,3.317718,0.002517,0.030538,0.000007,0.0,0.0,-46.547248,0.684925,...,-1.148940e-10,-5.683385e-08,-1.174590e-10,-2.165971e-09,-3.093970e-10,1.544577e-11,-7.935396e-11,-4.620005e-11,-6.834915e-11,-1.558203e-08
Walking_inhale_8.wav,0.0,0.011694,3.317635,0.002565,0.029961,0.000010,0.0,0.0,-43.101846,1.685298,...,3.755436e-09,-1.988819e-07,-1.492684e-09,6.078296e-08,7.596586e-09,1.855473e-09,8.679270e-09,2.853362e-08,1.984137e-09,-6.056441e-08
Walking_exhale_9.wav,0.0,0.006850,3.317664,0.003172,0.035909,0.000034,0.0,0.0,-45.430178,0.337116,...,-5.669973e-10,9.515602e-09,3.274330e-10,5.941010e-09,4.434255e-09,2.412263e-10,1.516937e-11,4.115886e-11,1.528110e-10,2.354592e-09


In [8]:
df_clusterize = df.copy()
for dropped_columns in [0, 6, 7]:
    df_clusterize = df_clusterize.drop(dropped_columns, axis=1)

for dropped_columns in range(21, 33):
    df_clusterize = df_clusterize.drop(dropped_columns, axis=1)

# for dropped_columns in range(21 + 68, 33 + 68):
#     df_classification = df_classification.drop(dropped_columns, axis=1)

for dropped_columns in range(34, 136):
    df_clusterize = df_clusterize.drop(dropped_columns, axis=1)

# df_clusterize

df_clusterize['transcript'] = ""

In [10]:
logs = {}
for activity in activities:
    filename = f'../web_recordings/{activity}/{activity}_transcript_log.txt'
    data = open(filename)
    for line in data:
        transcript_line = line.split(' ')
        transcript_filename = f'{transcript_line[0]}_{transcript_line[1]}_{transcript_line[2][:-1]}.wav'
        # print(transcript_line[3][:-1])
        df_clusterize['transcript'].loc[transcript_filename] = transcript_line[3][:-1]


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_clusterize['transcript'].loc[transcript_filename] = transcript_line[3][:-1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-

In [11]:
df_clusterize

Unnamed: 0,1,2,3,4,5,8,9,10,11,12,13,14,15,16,17,18,19,20,33,transcript
Dyspnea_exhale_3.wav,0.012806,3.317673,0.002670,0.029585,0.000113,-45.334891,0.490578,-1.832355,-1.054599,1.278921,0.150162,-0.494087,0.757571,-0.928969,0.152278,-0.180459,0.947806,-0.989287,0.046064,щщффтфффффч
Dyspnea_exhale_2.wav,0.000804,3.317594,0.007379,0.062256,0.000188,-46.882612,-0.084901,-1.458478,-1.207348,0.871057,0.167427,-0.354102,0.632070,-0.892782,0.172083,-0.113934,0.632333,-0.737913,0.046063,фттфтттфтфф
Dyspnea_exhale_1.wav,0.010515,3.317643,0.002303,0.028619,0.000002,-46.813120,-0.120158,-2.209376,-0.860556,1.260459,-0.182079,-0.293283,0.283785,-0.577972,-0.182297,0.079415,0.546806,-0.700873,0.046064,ффффтфтфффшщ
Dyspnea_exhale_5.wav,0.012740,3.317729,0.002582,0.029792,0.000094,-47.010755,-0.054928,-1.432415,-1.125739,1.349913,0.250898,-0.540844,0.771387,-0.715710,0.600385,-0.102779,1.103075,-0.892812,0.046064,щщщфитфффффщ
Dyspnea_exhale_4.wav,0.000469,3.317633,0.006905,0.056993,0.001146,-48.923292,-0.788085,-1.497788,-0.691126,1.498575,-0.004970,-1.031787,0.462356,-0.861208,0.154169,-0.440761,1.232708,-1.158310,0.046059,щщчффффттффч
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Walking_exhale_10.wav,0.002698,3.317677,0.003698,0.039918,0.000023,-46.017261,0.186515,-2.056355,-1.170519,1.051926,0.230227,-0.392596,0.314208,-0.438443,0.291220,-0.029049,0.770478,-0.469380,0.046064,фффтфттффффффщ
Walking_inhale_9.wav,0.008702,3.317718,0.002517,0.030538,0.000007,-46.547248,0.684925,-1.229023,-0.283696,1.018980,-0.341183,0.121417,1.037860,-0.634573,-0.166995,-0.066688,0.899792,-0.708631,0.046064,фффффтттфффффтффщ
Walking_inhale_8.wav,0.011694,3.317635,0.002565,0.029961,0.000010,-43.101846,1.685298,-1.252784,-0.589215,0.707981,0.639632,-0.273160,0.345471,-0.703134,0.105194,0.233209,0.748748,-0.685037,0.046064,фщттттфтттффф
Walking_exhale_9.wav,0.006850,3.317664,0.003172,0.035909,0.000034,-45.430178,0.337116,-1.715689,-1.243755,0.807236,0.448597,-0.203917,0.411391,-0.769462,0.155357,0.138954,0.952452,-0.682581,0.046064,фффффттттффффтфщ


In [12]:
from scipy.cluster.hierarchy import ward, dendrogram, leaves_list
from scipy.spatial.distance import pdist

df_ward = df_clusterize.transpose()
df_ward.fillna(0, inplace=True)
feature_matrix = df_ward.T.values # n dimensional, m observations.
Z = ward(pdist(feature_matrix))
ll = list(leaves_list(Z))
breathe_moment = df_ward.columns[ll]
list_of_dir_wavs = [x for x in df_ward.columns]
dn = dendrogram(Z, labels = list_of_dir_wavs, orientation = 'left')
plt.savefig('dendrogram_ward_transcript.png')
plt.close()

ValueError: Unsupported dtype object

In [14]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from scipy.sparse import hstack

vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 5))
tfidf_features = vectorizer.fit_transform(df_clusterize['transcript'])

combined_features = hstack([tfidf_features])

kmeans = KMeans(n_clusters=2, random_state=42)
clusters = kmeans.fit_predict(combined_features)
df_clusterize['cluster'] = clusters

df_clusterize

Unnamed: 0,1,2,3,4,5,8,9,10,11,12,...,14,15,16,17,18,19,20,33,transcript,cluster
Dyspnea_exhale_3.wav,0.012806,3.317673,0.002670,0.029585,0.000113,-45.334891,0.490578,-1.832355,-1.054599,1.278921,...,-0.494087,0.757571,-0.928969,0.152278,-0.180459,0.947806,-0.989287,0.046064,щщффтфффффч,1
Dyspnea_exhale_2.wav,0.000804,3.317594,0.007379,0.062256,0.000188,-46.882612,-0.084901,-1.458478,-1.207348,0.871057,...,-0.354102,0.632070,-0.892782,0.172083,-0.113934,0.632333,-0.737913,0.046063,фттфтттфтфф,0
Dyspnea_exhale_1.wav,0.010515,3.317643,0.002303,0.028619,0.000002,-46.813120,-0.120158,-2.209376,-0.860556,1.260459,...,-0.293283,0.283785,-0.577972,-0.182297,0.079415,0.546806,-0.700873,0.046064,ффффтфтфффшщ,1
Dyspnea_exhale_5.wav,0.012740,3.317729,0.002582,0.029792,0.000094,-47.010755,-0.054928,-1.432415,-1.125739,1.349913,...,-0.540844,0.771387,-0.715710,0.600385,-0.102779,1.103075,-0.892812,0.046064,щщщфитфффффщ,1
Dyspnea_exhale_4.wav,0.000469,3.317633,0.006905,0.056993,0.001146,-48.923292,-0.788085,-1.497788,-0.691126,1.498575,...,-1.031787,0.462356,-0.861208,0.154169,-0.440761,1.232708,-1.158310,0.046059,щщчффффттффч,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Walking_exhale_10.wav,0.002698,3.317677,0.003698,0.039918,0.000023,-46.017261,0.186515,-2.056355,-1.170519,1.051926,...,-0.392596,0.314208,-0.438443,0.291220,-0.029049,0.770478,-0.469380,0.046064,фффтфттффффффщ,1
Walking_inhale_9.wav,0.008702,3.317718,0.002517,0.030538,0.000007,-46.547248,0.684925,-1.229023,-0.283696,1.018980,...,0.121417,1.037860,-0.634573,-0.166995,-0.066688,0.899792,-0.708631,0.046064,фффффтттфффффтффщ,1
Walking_inhale_8.wav,0.011694,3.317635,0.002565,0.029961,0.000010,-43.101846,1.685298,-1.252784,-0.589215,0.707981,...,-0.273160,0.345471,-0.703134,0.105194,0.233209,0.748748,-0.685037,0.046064,фщттттфтттффф,0
Walking_exhale_9.wav,0.006850,3.317664,0.003172,0.035909,0.000034,-45.430178,0.337116,-1.715689,-1.243755,0.807236,...,-0.203917,0.411391,-0.769462,0.155357,0.138954,0.952452,-0.682581,0.046064,фффффттттффффтфщ,1


In [17]:
hash_vectorizer = HashingVectorizer(analyzer='char_wb', ngram_range=(3, 5), n_features=50)
hashed_features = hash_vectorizer.fit_transform(df_clusterize['transcript'])

combined_features_hash = hstack([hashed_features]) # 

kmeans_hash = KMeans(n_clusters=2, random_state=42)
clusters_hash = kmeans_hash.fit_predict(combined_features_hash)
df_clusterize['cluster_hash'] = clusters_hash

df_clusterize

Unnamed: 0,1,2,3,4,5,8,9,10,11,12,...,15,16,17,18,19,20,33,transcript,cluster,cluster_hash
Dyspnea_exhale_3.wav,0.012806,3.317673,0.002670,0.029585,0.000113,-45.334891,0.490578,-1.832355,-1.054599,1.278921,...,0.757571,-0.928969,0.152278,-0.180459,0.947806,-0.989287,0.046064,щщффтфффффч,1,1
Dyspnea_exhale_2.wav,0.000804,3.317594,0.007379,0.062256,0.000188,-46.882612,-0.084901,-1.458478,-1.207348,0.871057,...,0.632070,-0.892782,0.172083,-0.113934,0.632333,-0.737913,0.046063,фттфтттфтфф,0,0
Dyspnea_exhale_1.wav,0.010515,3.317643,0.002303,0.028619,0.000002,-46.813120,-0.120158,-2.209376,-0.860556,1.260459,...,0.283785,-0.577972,-0.182297,0.079415,0.546806,-0.700873,0.046064,ффффтфтфффшщ,1,1
Dyspnea_exhale_5.wav,0.012740,3.317729,0.002582,0.029792,0.000094,-47.010755,-0.054928,-1.432415,-1.125739,1.349913,...,0.771387,-0.715710,0.600385,-0.102779,1.103075,-0.892812,0.046064,щщщфитфффффщ,1,1
Dyspnea_exhale_4.wav,0.000469,3.317633,0.006905,0.056993,0.001146,-48.923292,-0.788085,-1.497788,-0.691126,1.498575,...,0.462356,-0.861208,0.154169,-0.440761,1.232708,-1.158310,0.046059,щщчффффттффч,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Walking_exhale_10.wav,0.002698,3.317677,0.003698,0.039918,0.000023,-46.017261,0.186515,-2.056355,-1.170519,1.051926,...,0.314208,-0.438443,0.291220,-0.029049,0.770478,-0.469380,0.046064,фффтфттффффффщ,1,1
Walking_inhale_9.wav,0.008702,3.317718,0.002517,0.030538,0.000007,-46.547248,0.684925,-1.229023,-0.283696,1.018980,...,1.037860,-0.634573,-0.166995,-0.066688,0.899792,-0.708631,0.046064,фффффтттфффффтффщ,1,1
Walking_inhale_8.wav,0.011694,3.317635,0.002565,0.029961,0.000010,-43.101846,1.685298,-1.252784,-0.589215,0.707981,...,0.345471,-0.703134,0.105194,0.233209,0.748748,-0.685037,0.046064,фщттттфтттффф,0,0
Walking_exhale_9.wav,0.006850,3.317664,0.003172,0.035909,0.000034,-45.430178,0.337116,-1.715689,-1.243755,0.807236,...,0.411391,-0.769462,0.155357,0.138954,0.952452,-0.682581,0.046064,фффффттттффффтфщ,1,1


In [18]:
df_clusterize[['cluster', 'cluster_hash']].to_csv("df_clusterize.csv")

In [19]:
import joblib

joblib.dump(kmeans_hash, "model_transcript_breath.pkl")

['model_transcript_breath.pkl']