In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import librosa
import glob
from itertools import chain


Initial files

In [2]:
activities = ['Dyspnea', 'Running', 'Sitting', 'Squats', 'Standing', 'Walking']
wavs = {}
for activity in activities:
    wavs[activity] = glob.glob(f'../web_recordings/{activity}/audio/*.wav')

wavs_list = list(chain(*wavs.values()))
wavs_list

['../web_recordings/Dyspnea/audio/Dyspnea_exhale_3.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_exhale_2.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_exhale_1.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_exhale_5.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_exhale_4.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_exhale_6.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_exhale_7.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_inhale_6.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_inhale_7.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_inhale_5.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_inhale_4.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_inhale_1.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_inhale_3.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_inhale_2.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_inhale_9.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_inhale_8.wav',
 '../web_recordings/Dyspnea/audio/Dyspnea_exhale_9.wav',
 '../web_recordings/Dyspnea/aud

In [9]:
df = pd.DataFrame(wavs_list, columns=['file_name'])
df["breath_part"] = df["file_name"].str.contains("inhale").replace({True: "inhale", False: "exhale"})
df

Unnamed: 0,file_name,breath_part
0,../web_recordings/Dyspnea/audio/Dyspnea_exhale...,exhale
1,../web_recordings/Dyspnea/audio/Dyspnea_exhale...,exhale
2,../web_recordings/Dyspnea/audio/Dyspnea_exhale...,exhale
3,../web_recordings/Dyspnea/audio/Dyspnea_exhale...,exhale
4,../web_recordings/Dyspnea/audio/Dyspnea_exhale...,exhale
...,...,...
115,../web_recordings/Walking/audio/Walking_exhale...,exhale
116,../web_recordings/Walking/audio/Walking_inhale...,inhale
117,../web_recordings/Walking/audio/Walking_inhale...,inhale
118,../web_recordings/Walking/audio/Walking_exhale...,exhale


In [12]:
from tools import audio_fingerprinting

df["transcript"] = df["file_name"].apply(lambda x: audio_fingerprinting.translate_breath(x))
df



Unnamed: 0,file_name,breath_part,transcript
0,../web_recordings/Dyspnea/audio/Dyspnea_exhale...,exhale,щглшлсмчфщщ
1,../web_recordings/Dyspnea/audio/Dyspnea_exhale...,exhale,щтссшщзчффк
2,../web_recordings/Dyspnea/audio/Dyspnea_exhale...,exhale,зфлфщсзфсзшс
3,../web_recordings/Dyspnea/audio/Dyspnea_exhale...,exhale,чгщфшзфщссзч
4,../web_recordings/Dyspnea/audio/Dyspnea_exhale...,exhale,ччгбсцфзлззч
...,...,...,...
115,../web_recordings/Walking/audio/Walking_exhale...,exhale,щзлшшлфшсзфшлцз
116,../web_recordings/Walking/audio/Walking_inhale...,inhale,зщсшфтффззлщссфщщ
117,../web_recordings/Walking/audio/Walking_inhale...,inhale,зчзтнлтщфрчсщ
118,../web_recordings/Walking/audio/Walking_exhale...,exhale,фшщшщлшзбфтсфсшч


In [14]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from scipy.sparse import hstack

vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 5))
tfidf_features = vectorizer.fit_transform(df['transcript'])

combined_features = hstack([tfidf_features])

kmeans = KMeans(n_clusters=2, random_state=42)
clusters = kmeans.fit_predict(combined_features)
df['cluster'] = clusters

df

Unnamed: 0,file_name,breath_part,transcript,cluster_hash,cluster
0,../web_recordings/Dyspnea/audio/Dyspnea_exhale...,exhale,щглшлсмчфщщ,0,1
1,../web_recordings/Dyspnea/audio/Dyspnea_exhale...,exhale,щтссшщзчффк,0,1
2,../web_recordings/Dyspnea/audio/Dyspnea_exhale...,exhale,зфлфщсзфсзшс,0,1
3,../web_recordings/Dyspnea/audio/Dyspnea_exhale...,exhale,чгщфшзфщссзч,0,1
4,../web_recordings/Dyspnea/audio/Dyspnea_exhale...,exhale,ччгбсцфзлззч,0,1
...,...,...,...,...,...
115,../web_recordings/Walking/audio/Walking_exhale...,exhale,щзлшшлфшсзфшлцз,1,1
116,../web_recordings/Walking/audio/Walking_inhale...,inhale,зщсшфтффззлщссфщщ,0,1
117,../web_recordings/Walking/audio/Walking_inhale...,inhale,зчзтнлтщфрчсщ,0,1
118,../web_recordings/Walking/audio/Walking_exhale...,exhale,фшщшщлшзбфтсфсшч,0,1


In [25]:
from sklearn.cluster import MiniBatchKMeans

hash_vectorizer = HashingVectorizer(analyzer='char_wb', ngram_range=(3, 5), n_features=50)
hashed_features = hash_vectorizer.fit_transform(df['transcript'])

# combined_features_hash = hstack([hashed_features]) # 

kmeans_hash = MiniBatchKMeans(n_clusters=2, random_state=42)
clusters_hash = kmeans_hash.fit_predict(hashed_features)
df['cluster_hash'] = clusters_hash

df

Unnamed: 0,file_name,breath_part,transcript,cluster_hash,cluster
0,../web_recordings/Dyspnea/audio/Dyspnea_exhale...,exhale,щглшлсмчфщщ,0,1
1,../web_recordings/Dyspnea/audio/Dyspnea_exhale...,exhale,щтссшщзчффк,1,1
2,../web_recordings/Dyspnea/audio/Dyspnea_exhale...,exhale,зфлфщсзфсзшс,0,1
3,../web_recordings/Dyspnea/audio/Dyspnea_exhale...,exhale,чгщфшзфщссзч,1,1
4,../web_recordings/Dyspnea/audio/Dyspnea_exhale...,exhale,ччгбсцфзлззч,0,1
...,...,...,...,...,...
115,../web_recordings/Walking/audio/Walking_exhale...,exhale,щзлшшлфшсзфшлцз,1,1
116,../web_recordings/Walking/audio/Walking_inhale...,inhale,зщсшфтффззлщссфщщ,0,1
117,../web_recordings/Walking/audio/Walking_inhale...,inhale,зчзтнлтщфрчсщ,0,1
118,../web_recordings/Walking/audio/Walking_exhale...,exhale,фшщшщлшзбфтсфсшч,0,1


In [17]:
df[['breath_part', 'cluster', 'cluster_hash']].to_csv("df_clusterize.csv")

In [26]:
import joblib

joblib.dump(kmeans_hash, "model_transcript_fingerprint.pkl")

['model_transcript_fingerprint.pkl']