In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import csv
import io
import os
import glob
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
import PIL
from PIL import Image
import librosa

In [2]:
temp_path = r"C:./dataset/spectrograms"
class_name = next(os.walk(temp_path))[1]

In [3]:
class_name

['.ipynb_checkpoints',
 'bellypain',
 'burping',
 'discomfort',
 'hungry',
 'tired',
 'unlabeled']

In [4]:
labeled_fold_path = [os.path.join(temp_path, name) for name in class_name[1:6]]
labeled_fold_path

['C:./dataset/spectrograms\\bellypain',
 'C:./dataset/spectrograms\\burping',
 'C:./dataset/spectrograms\\discomfort',
 'C:./dataset/spectrograms\\hungry',
 'C:./dataset/spectrograms\\tired']

In [5]:
lab_img_path = [glob.glob(os.path.join(folder,'*.png')) for folder in labeled_fold_path]
len(lab_img_path[0])
print(lab_img_path)

[['C:./dataset/spectrograms\\bellypain\\bellypain_spectrogram_1.png', 'C:./dataset/spectrograms\\bellypain\\bellypain_spectrogram_10.png', 'C:./dataset/spectrograms\\bellypain\\bellypain_spectrogram_11.png', 'C:./dataset/spectrograms\\bellypain\\bellypain_spectrogram_12.png', 'C:./dataset/spectrograms\\bellypain\\bellypain_spectrogram_13.png', 'C:./dataset/spectrograms\\bellypain\\bellypain_spectrogram_14.png', 'C:./dataset/spectrograms\\bellypain\\bellypain_spectrogram_15.png', 'C:./dataset/spectrograms\\bellypain\\bellypain_spectrogram_16.png', 'C:./dataset/spectrograms\\bellypain\\bellypain_spectrogram_2.png', 'C:./dataset/spectrograms\\bellypain\\bellypain_spectrogram_3.png', 'C:./dataset/spectrograms\\bellypain\\bellypain_spectrogram_4.png', 'C:./dataset/spectrograms\\bellypain\\bellypain_spectrogram_5.png', 'C:./dataset/spectrograms\\bellypain\\bellypain_spectrogram_6.png', 'C:./dataset/spectrograms\\bellypain\\bellypain_spectrogram_7.png', 'C:./dataset/spectrograms\\bellypain\\b

In [6]:
unlab_foldpath = r"C:./dataset/spectrograms/unlabeled"
unlab_im_path = glob.glob(os.path.join(unlab_foldpath, '*.png'))
len(unlab_im_path)

18190

In [7]:
def resize_and_normalize_image(image_path, new_size):
    with Image.open(image_path) as img:
        resized_image = img.resize(new_size).convert('RGB')
        image_array = np.array(resized_image)
        normalized_image = image_array / 255.0

    return normalized_image

In [8]:
# bellypain resize
bpain_img = [resize_and_normalize_image(path, (200, 100)) for path in lab_img_path[0]]
bpain_img_arr = np.array(bpain_img)

In [9]:
# burp resize
burp_img = [resize_and_normalize_image(path, (200, 100)) for path in lab_img_path[1]]
burp_img_arr = np.array(burp_img)

In [10]:
# discomfort resize
discomf_img = [resize_and_normalize_image(path, (200, 100)) for path in lab_img_path[2]]
discomf_img_arr = np.array(discomf_img)

In [11]:
# hungry resize
hgry_img = [resize_and_normalize_image(path, (200, 100)) for path in lab_img_path[3]]
hgry_img_arr = np.array(hgry_img)

In [12]:
# tired resize
tired_img = [resize_and_normalize_image(path, (200, 100)) for path in lab_img_path[4]]
tired_img_arr = np.array(tired_img)

In [13]:
# unlabeled resize
unlab_img = [resize_and_normalize_image(path, (200, 100)) for path in unlab_im_path]
unlab_img_arr = np.array(unlab_img)

In [14]:
print(bpain_img_arr.shape)
print(burp_img_arr.shape)
print(discomf_img_arr.shape)
print(hgry_img_arr.shape)
print(tired_img_arr.shape)
print(unlab_img_arr.shape)

(16, 100, 200, 3)
(8, 100, 200, 3)
(27, 100, 200, 3)
(382, 100, 200, 3)
(24, 100, 200, 3)
(18190, 100, 200, 3)


In [15]:
from tensorflow.keras.applications.vgg16 import VGG16

model = VGG16(include_top=False, weights='imagenet', pooling='avg')

# Function to extract features from a 4D image array
def extract_features(img_array):
    # Ensure the image array is 4D (batch_size, height, width, channels)
    assert img_array.ndim == 4, "Input must be a 4D array"

    # Extract features
    features = model.predict(img_array)
    return features

In [16]:
# labeled data only
bpain_features = extract_features(bpain_img_arr)
burp_feature = extract_features(burp_img_arr)
discomf_feature = extract_features(discomf_img_arr)
hgry_feature = extract_features(hgry_img_arr)
tired_feature = extract_features(tired_img_arr)



In [17]:
# unlabeled data feature extract
unlab_features = extract_features(unlab_img_arr)



In [18]:
from sklearn.cluster import KMeans

features_by_class = {
    'bellypain': bpain_features,
    'burp': burp_feature,
    'discomfort': discomf_feature,
    'hungry': hgry_feature,
    'tired': tired_feature
}

centroids = {}

for class_label, features in features_by_class.items():
    kmeans = KMeans(n_clusters=1, random_state=0).fit(features)
    centroids[class_label] = kmeans.cluster_centers_[0]

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [19]:
from scipy.spatial import distance

# Assuming centroids is a dictionary with class labels as keys and centroid vectors as values
# Assuming unlabeled_features is a list or array of feature vectors for unlabeled data

distances = {class_label: [] for class_label in centroids.keys()}

for i, feature_vector in enumerate(unlab_features):
    for class_label, centroid in centroids.items():
        dist = distance.euclidean(feature_vector, centroid)
        distances[class_label].append((i, dist))

In [20]:
similarities = {class_label: [] for class_label in centroids.keys()}

for i, feature_vector in enumerate(unlab_features):
    for class_label, centroid in centroids.items():
        similarity = 1 - distance.cosine(feature_vector, centroid)
        similarities[class_label].append((i, similarity))

In [21]:
import pandas as pd

# Initialize an empty list to store the data
data = []

# Iterate over each unlabeled data point
for i in range(len(unlab_features)):
    row = {}
    for class_label in centroids.keys():
        # Extracting distance and similarity for each class label
        distance = next(item for item in distances[class_label] if item[0] == i)[1]
        similarity = next(item for item in similarities[class_label] if item[0] == i)[1]

        # Creating column names and adding them to the row
        row[f'Distance_to_{class_label}'] = distance
        row[f'Similarity_to_{class_label}'] = similarity

    # Add the row to the data list
    data.append(row)

# Create the DataFrame
feat_sim_df = pd.DataFrame(data)

In [22]:
feat_sim_df

Unnamed: 0,Distance_to_bellypain,Similarity_to_bellypain,Distance_to_burp,Similarity_to_burp,Distance_to_discomfort,Similarity_to_discomfort,Distance_to_hungry,Similarity_to_hungry,Distance_to_tired,Similarity_to_tired
0,3.663705,0.950008,3.798076,0.946446,3.710215,0.949197,3.588126,0.952264,3.590718,0.952092
1,4.095547,0.938271,4.240916,0.933983,4.190486,0.935860,4.065168,0.939407,4.040025,0.940049
2,4.086757,0.937741,4.200683,0.934433,4.178236,0.935513,4.088264,0.937975,4.038696,0.939339
3,3.986636,0.940542,4.164264,0.935346,4.139020,0.936523,3.999614,0.940440,3.952981,0.941688
4,3.446669,0.956050,3.588214,0.952498,3.535270,0.954132,3.432418,0.956582,3.386961,0.957645
...,...,...,...,...,...,...,...,...,...,...
18185,3.472385,0.955549,3.684698,0.950074,3.683007,0.950366,3.524577,0.954368,3.447475,0.956267
18186,3.977731,0.941649,4.089080,0.938502,4.040771,0.940248,3.958749,0.942422,3.908700,0.943768
18187,3.794546,0.946202,3.959738,0.941613,3.932314,0.942774,3.819585,0.945747,3.761813,0.947257
18188,4.119009,0.936315,4.248916,0.932475,4.203336,0.934344,4.132038,0.936231,4.069079,0.938013


In [23]:
dist_only_df = feat_sim_df[['Distance_to_bellypain', 'Distance_to_burp', 'Distance_to_discomfort', 'Distance_to_hungry', 'Distance_to_tired']]
dist_only_df

Unnamed: 0,Distance_to_bellypain,Distance_to_burp,Distance_to_discomfort,Distance_to_hungry,Distance_to_tired
0,3.663705,3.798076,3.710215,3.588126,3.590718
1,4.095547,4.240916,4.190486,4.065168,4.040025
2,4.086757,4.200683,4.178236,4.088264,4.038696
3,3.986636,4.164264,4.139020,3.999614,3.952981
4,3.446669,3.588214,3.535270,3.432418,3.386961
...,...,...,...,...,...
18185,3.472385,3.684698,3.683007,3.524577,3.447475
18186,3.977731,4.089080,4.040771,3.958749,3.908700
18187,3.794546,3.959738,3.932314,3.819585,3.761813
18188,4.119009,4.248916,4.203336,4.132038,4.069079


In [24]:
sim_only_df = feat_sim_df[['Similarity_to_bellypain', 'Similarity_to_burp', 'Similarity_to_discomfort', 'Similarity_to_hungry', 'Similarity_to_tired']]
sim_only_df

Unnamed: 0,Similarity_to_bellypain,Similarity_to_burp,Similarity_to_discomfort,Similarity_to_hungry,Similarity_to_tired
0,0.950008,0.946446,0.949197,0.952264,0.952092
1,0.938271,0.933983,0.935860,0.939407,0.940049
2,0.937741,0.934433,0.935513,0.937975,0.939339
3,0.940542,0.935346,0.936523,0.940440,0.941688
4,0.956050,0.952498,0.954132,0.956582,0.957645
...,...,...,...,...,...
18185,0.955549,0.950074,0.950366,0.954368,0.956267
18186,0.941649,0.938502,0.940248,0.942422,0.943768
18187,0.946202,0.941613,0.942774,0.945747,0.947257
18188,0.936315,0.932475,0.934344,0.936231,0.938013


In [25]:
dist_only_df.describe()

Unnamed: 0,Distance_to_bellypain,Distance_to_burp,Distance_to_discomfort,Distance_to_hungry,Distance_to_tired
count,18190.0,18190.0,18190.0,18190.0,18190.0
mean,3.798452,3.937073,3.913924,3.796752,3.759854
std,0.406094,0.411557,0.409031,0.412606,0.402381
min,1.726593,1.900339,1.771504,1.785135,1.738932
25%,3.546027,3.679101,3.661476,3.539818,3.507643
50%,3.818288,3.968929,3.954596,3.830903,3.789957
75%,4.066256,4.20694,4.188848,4.072177,4.026163
max,5.549594,5.708679,5.531877,5.452256,5.445051


In [26]:
sim_only_df.describe()

Unnamed: 0,Similarity_to_bellypain,Similarity_to_burp,Similarity_to_discomfort,Similarity_to_hungry,Similarity_to_tired
count,18190.0,18190.0,18190.0,18190.0,18190.0
mean,0.945736,0.941907,0.942921,0.945998,0.946951
std,0.011156,0.011669,0.011477,0.011295,0.010931
min,0.897705,0.893844,0.897951,0.899397,0.901254
25%,0.938513,0.934272,0.935185,0.938525,0.939793
50%,0.945553,0.941458,0.942249,0.945517,0.946548
75%,0.9532,0.949682,0.950526,0.953562,0.954255
max,0.988851,0.986554,0.988441,0.988164,0.988731


In [27]:
def inv_norm(column):
    return 1 - ((column - column.min()) / (column.max() - column.min()))

inv_norm_df = dist_only_df.apply(inv_norm)
inv_norm_df

Unnamed: 0,Distance_to_bellypain,Distance_to_burp,Distance_to_discomfort,Distance_to_hungry,Distance_to_tired
0,0.493301,0.501689,0.484436,0.508336,0.500344
1,0.380342,0.385408,0.356718,0.378250,0.379110
2,0.382641,0.395972,0.359975,0.371952,0.379468
3,0.408830,0.405535,0.370404,0.396126,0.402596
4,0.550072,0.556795,0.530960,0.550797,0.555322
...,...,...,...,...,...
18185,0.543345,0.531460,0.491672,0.525665,0.538994
18186,0.411159,0.425277,0.396532,0.407270,0.414544
18187,0.459076,0.459240,0.425373,0.445219,0.454178
18188,0.374205,0.383307,0.353300,0.360015,0.371270


In [28]:
# row_variances = inv_only_df.var(axis=1)
# inv_norm_df['row_variance'] = row_variances

In [29]:
row_variances = sim_only_df.var(axis=1)
sim_only_df['row_variance'] = row_variances

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sim_only_df['row_variance'] = row_variances


In [30]:
sim_only_df

Unnamed: 0,Similarity_to_bellypain,Similarity_to_burp,Similarity_to_discomfort,Similarity_to_hungry,Similarity_to_tired,row_variance
0,0.950008,0.946446,0.949197,0.952264,0.952092,0.000006
1,0.938271,0.933983,0.935860,0.939407,0.940049,0.000006
2,0.937741,0.934433,0.935513,0.937975,0.939339,0.000004
3,0.940542,0.935346,0.936523,0.940440,0.941688,0.000008
4,0.956050,0.952498,0.954132,0.956582,0.957645,0.000004
...,...,...,...,...,...,...
18185,0.955549,0.950074,0.950366,0.954368,0.956267,0.000009
18186,0.941649,0.938502,0.940248,0.942422,0.943768,0.000004
18187,0.946202,0.941613,0.942774,0.945747,0.947257,0.000006
18188,0.936315,0.932475,0.934344,0.936231,0.938013,0.000005


In [31]:
inv_norm_df

Unnamed: 0,Distance_to_bellypain,Distance_to_burp,Distance_to_discomfort,Distance_to_hungry,Distance_to_tired
0,0.493301,0.501689,0.484436,0.508336,0.500344
1,0.380342,0.385408,0.356718,0.378250,0.379110
2,0.382641,0.395972,0.359975,0.371952,0.379468
3,0.408830,0.405535,0.370404,0.396126,0.402596
4,0.550072,0.556795,0.530960,0.550797,0.555322
...,...,...,...,...,...
18185,0.543345,0.531460,0.491672,0.525665,0.538994
18186,0.411159,0.425277,0.396532,0.407270,0.414544
18187,0.459076,0.459240,0.425373,0.445219,0.454178
18188,0.374205,0.383307,0.353300,0.360015,0.371270


In [32]:
sim_only_df.describe()

Unnamed: 0,Similarity_to_bellypain,Similarity_to_burp,Similarity_to_discomfort,Similarity_to_hungry,Similarity_to_tired,row_variance
count,18190.0,18190.0,18190.0,18190.0,18190.0,18190.0
mean,0.945736,0.941907,0.942921,0.945998,0.946951,5.455249e-06
std,0.011156,0.011669,0.011477,0.011295,0.010931,1.964912e-06
min,0.897705,0.893844,0.897951,0.899397,0.901254,2.37131e-07
25%,0.938513,0.934272,0.935185,0.938525,0.939793,4.092423e-06
50%,0.945553,0.941458,0.942249,0.945517,0.946548,5.494172e-06
75%,0.9532,0.949682,0.950526,0.953562,0.954255,6.818504e-06
max,0.988851,0.986554,0.988441,0.988164,0.988731,1.594601e-05


In [33]:
inv_norm_df.describe()

Unnamed: 0,Distance_to_bellypain,Distance_to_burp,Distance_to_discomfort,Distance_to_hungry,Distance_to_tired
count,18190.0,18190.0,18190.0,18190.0,18190.0
mean,0.458054,0.465191,0.430264,0.451445,0.454707
std,0.106224,0.108067,0.108774,0.112515,0.108572
min,0.0,0.0,0.0,0.0,0.0
25%,0.388004,0.394329,0.357153,0.376339,0.38285
50%,0.452866,0.456826,0.419448,0.442132,0.446584
75%,0.524082,0.53293,0.497398,0.521509,0.522759
max,1.0,1.0,1.0,1.0,1.0


In [34]:
sim_final_df = sim_only_df.drop(columns = ['row_variance'], axis = 1)
dist_final_df = inv_norm_df.drop(columns = ['row_variance'], axis = 1)

KeyError: "['row_variance'] not found in axis"

In [None]:
sim_final_df.to_csv('cos_sim_cnnver.csv')
dist_final_df.to_csv('dist_cnnver.csv')