In [5]:
import pandas as pd
import os
import plotting
import importlib
importlib.reload(plotting)
import math
from typing import List

In [6]:
def load_data(data_src):
    """Read every data file directly inside `data_src` into a DataFrame.

    Entries are processed in sorted filename order so the two returned lists
    are reproducible across runs and platforms (`os.listdir` makes no ordering
    guarantee). Sub-directories and hidden entries (names starting with '.',
    e.g. `.ipynb_checkpoints` or `.DS_Store`) are skipped because they are not
    data files and would make `pd.read_csv` fail.

    Parameters
    ----------
    data_src : str or path-like
        Directory containing the CSV files to load.

    Returns
    -------
    (list of pd.DataFrame, list of str)
        The loaded frames and their filenames, index-aligned with each other.
    """
    filename_list = sorted(
        entry
        for entry in os.listdir(data_src)
        if not entry.startswith('.')
        and os.path.isfile(os.path.join(data_src, entry))
    )
    depth_resist_curve_df_list = [  # used for plotting
        pd.read_csv(os.path.join(data_src, filename))
        for filename in filename_list
    ]
    return depth_resist_curve_df_list, filename_list

# Directory the curve files are read from.
data_src = "data/cleaned_data"
depth_resist_curve_df_list, filename_list = load_data(data_src)
# Look up each loaded DataFrame by the name of the file it came from.
filename_to_depth_resist = {
    filename: curve_df
    for filename, curve_df in zip(filename_list, depth_resist_curve_df_list)
}

In [7]:
data_features = pd.read_csv("data/features.csv")
clustering_features = plotting.extract_numerical_features(data_features)

ylabel_name = 'marions_ylabels_2'
# Rows whose label column is NaN are unlabeled; keep only the labeled ones
# (dropna returns a new frame, so data_features itself is left untouched).
marions_labeled_data = data_features.dropna(subset=[ylabel_name])

In [10]:
def find_labels_centroids(labeled_data: pd.DataFrame, label_to_cluster_num:dict[str,int], ylabel_name:str) -> pd.DataFrame:
    """Compute the mean feature vector (centroid) of each label.

    Parameters
    ----------
    labeled_data : pd.DataFrame
        Rows to average; must contain the `ylabel_name` column.
    label_to_cluster_num : dict[str, int]
        Labels to build centroids for, mapped to the number stored with each.
    ylabel_name : str
        Name of the column holding the labels.

    Returns
    -------
    pd.DataFrame
        One row per entry of `label_to_cluster_num` (in dict order): the
        per-column means of the numerical features, followed by the label in
        `ylabel_name` and its number in `f'{ylabel_name}_nums'`. A label with
        no matching rows yields NaN means.
    """
    # Work on a copy so the caller's frame is never handed to the helper.
    frame = labeled_data.copy()
    features = plotting.extract_numerical_features(frame)
    labels = frame[ylabel_name]

    centroid_rows = [
        # feature means first, then the label and its number appended at the end
        [*features[labels == label].mean(axis=0).tolist(), label, cluster_num]
        for label, cluster_num in label_to_cluster_num.items()
    ]
    column_names = [*features.columns.tolist(), ylabel_name, f'{ylabel_name}_nums']
    return pd.DataFrame(data=centroid_rows, columns=column_names)

# Number stored alongside each label's centroid.
label_to_cluster_num = {
    'ES-B': 0,
    'ES': 1,
    'F': 2,
    'ES-BW': 3,
    'ES-D': 4,
    'LS': 5,
    'ES-S': 6,
    'LS/F': 7,
    'ES-DB': 8,
}
label_centroids = find_labels_centroids(
    marions_labeled_data, label_to_cluster_num, ylabel_name
)
# Preview only the first three centroids.
print(f'label_centroids: {label_centroids.head(3)}')

label_centroids:    max_depth  max_resistance  num_peaks  largest_force_drop  curve_shape  \
0   0.106659       -0.524066   0.864748            1.048346     0.267526   
1  -0.939244        1.182363  -0.063796            0.069559     0.306016   
2  -0.697985        0.917715  -0.216149           -0.415368     0.044371   

  marions_ylabels_2  marions_ylabels_2_nums  
0              ES-B                       0  
1                ES                       1  
2                 F                       2  


In [12]:
def euclidean_distance(x: pd.Series, y: pd.Series) -> float:
    """Return the Euclidean distance between two Series.

    The subtraction aligns `x` and `y` on their index labels; entries that end
    up NaN (a label present in only one Series, or a NaN value) are left out
    of the sum by pandas' default NaN-skipping.
    """
    delta = y - x
    sum_of_squares = delta.pow(2).sum()
    return math.sqrt(sum_of_squares)

def find_closest_centroid(unlabeled_data_df:pd.DataFrame, marions_centroids:pd.DataFrame, ylabel_name) -> pd.DataFrame:
    """Pseudo-label every row with the label of its nearest centroid.

    Distances are Euclidean and computed only over the feature columns that
    the numerical features of `unlabeled_data_df` share with
    `marions_centroids`. Ties go to the centroid that appears first.

    Parameters
    ----------
    unlabeled_data_df : pd.DataFrame
        Rows to label. Not modified; its numerical features are extracted
        with `plotting.extract_numerical_features`.
    marions_centroids : pd.DataFrame
        One row per label, holding the feature means plus the label in
        `ylabel_name` and, optionally, its number in `f'{ylabel_name}_nums'`.
    ylabel_name : str
        Name of the label column in `marions_centroids`.

    Returns
    -------
    pd.DataFrame
        The numerical features with two extra columns,
        `f'pseudo_{ylabel_name}'` and `f'pseudo_{ylabel_name}_num'`. A row for
        which no centroid is comparable gets `''` and `0`.
    """
    features_df = plotting.extract_numerical_features(unlabeled_data_df.copy()).copy()

    label_num_col = f'{ylabel_name}_nums'
    has_label_nums = label_num_col in marions_centroids.columns
    # Only columns present on both sides can contribute to a distance; the
    # label bookkeeping columns are never features.
    shared_cols = [
        col for col in features_df.columns
        if col in marions_centroids.columns and col not in (ylabel_name, label_num_col)
    ]

    # Resolve each usable centroid once instead of once per data row.
    candidates = []
    for row_idx, centroid in marions_centroids.iterrows():
        centroid_features = centroid[shared_cols].astype(float)
        # A label without any examples has an all-NaN centroid. Because the
        # distance sum skips NaN it would come out as 0 and "win" every row,
        # so such centroids are excluded.
        if centroid_features.isna().all():
            continue
        # Use the label's declared number; the row position only coincides
        # with it when the numbers happen to be 0..k-1 in row order.
        label_num = centroid[label_num_col] if has_label_nums else row_idx
        candidates.append((centroid_features, centroid[ylabel_name], label_num))

    pseudo_label_list = []
    pseudo_label_num_list = []
    for _, depth_res_curve in features_df.iterrows():
        curve_features = depth_res_curve[shared_cols]
        min_distance = float('inf')
        min_distance_label = ''
        min_distance_label_num = 0
        for centroid_features, label, label_num in candidates:
            distance = euclidean_distance(curve_features, centroid_features)
            if distance < min_distance:
                min_distance = distance
                min_distance_label = label
                min_distance_label_num = label_num
        pseudo_label_list.append(min_distance_label)
        pseudo_label_num_list.append(min_distance_label_num)

    features_df[f'pseudo_{ylabel_name}'] = pseudo_label_list
    features_df[f'pseudo_{ylabel_name}_num'] = pseudo_label_num_list
    return features_df

# Pseudo-label every row of the feature table, labeled or not.
pseudo_labeled_data = find_closest_centroid(
    data_features,
    label_centroids,
    ylabel_name,
)
# Preview only the first three rows.
print(f'pseudo_labeled_data: \n{pseudo_labeled_data.head(3)}')

pseudo_labeled_data: 
   max_depth  max_resistance  num_peaks  largest_force_drop  curve_shape  \
0  -0.406696        0.739042  -0.696539           -0.667883     0.729832   
1  -0.422131        0.660764  -0.696539            0.422346    -0.659030   
2   0.562019       -0.214852  -0.696539            0.019005    -0.863595   

  pseudo_marions_ylabels_2  pseudo_marions_ylabels_2_num  
0                        F                             2  
1                        F                             2  
2                       LS                             5  


In [13]:
def find_changed_label_curves(labeled_data:pd.DataFrame, pseudo_labeled_data:pd.DataFrame, ylabel_name:str):
    """Return the index labels of rows whose pseudo-label differs from their label.

    Parameters
    ----------
    labeled_data : pd.DataFrame
        Rows with a label in the `ylabel_name` column.
    pseudo_labeled_data : pd.DataFrame
        Frame with a `f'pseudo_{ylabel_name}'` column; it must contain every
        index label of `labeled_data`.
    ylabel_name : str
        Name of the label column.

    Returns
    -------
    list
        Index labels of `labeled_data`, in order, where the two labels differ.
    """
    original_labels = labeled_data[ylabel_name]
    # Look up the pseudo-labels of exactly the rows that were labeled.
    assigned_labels = pseudo_labeled_data.loc[labeled_data.index, f'pseudo_{ylabel_name}']
    is_changed = (original_labels != assigned_labels).to_numpy()
    return labeled_data.index[is_changed].tolist()

# Index labels of labeled rows whose pseudo-label differs from the given label.
changed_ylabel_indicies = find_changed_label_curves(
    labeled_data=marions_labeled_data,
    pseudo_labeled_data=pseudo_labeled_data,
    ylabel_name=ylabel_name,
)
print(f'changed_ylabel_indicies: {changed_ylabel_indicies}')

changed_ylabel_indicies: [0, 8, 12, 13, 15, 16, 19, 22, 26, 27, 33, 37, 41, 43, 44, 47, 48, 49, 54, 55, 62, 66, 71, 72, 74, 77]
