In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import random
import matplotlib.pyplot as plt
import seaborn as sns
from src.models.utils import cnc_add_y_label_binary

%load_ext autoreload
%autoreload 2

In [2]:
# Data locations are resolved relative to the repository root, taken here to be
# two levels above the current working directory.
path_data_dir = Path.cwd().parent.parent / "data"
path_processed_dir = path_data_dir / "processed" / "cnc" / "cnc_features_comp"
print(path_processed_dir)

/home/tim/Documents/feat-store/data/processed/cnc/cnc_features_comp


In [13]:


def load_cnc_features(
    path_data_dir, path_processed_dir, feat_file_name, label_file_name
):
    """
    This function returns a dataframe with the appropriate meta-label columns and the label column (y).

    The feature CSV must contain an ``id`` column of underscore-separated
    strings (e.g. ``"1540298934_54_1"``). The first field is parsed as the
    unix date, the second-to-last field as the tool number, and the last field
    as the index number.

    Meta-label columns are:
    - case_tool_54: the case number for tool 54
    - unix_date: the unix date at the time of the cut
    - tool_no: the tool number
    - index_no: the index number of the cut

    Parameters
    ----------
    path_data_dir : pathlib.Path
        Root data directory. The label file is read from
        ``path_data_dir / "processed/cnc" / label_file_name``.
    path_processed_dir : pathlib.Path
        Directory that contains the feature CSV.
    feat_file_name : str
        File name of the feature CSV inside ``path_processed_dir``.
    label_file_name : str
        File name of the label CSV.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``cnc_add_y_label_binary`` (presumably the
        features plus ``case_tool_54`` and ``y`` -- confirm against that
        helper), with all-NaN columns removed and then every row that still
        contains a NaN removed.

    Raises
    ------
    ValueError, TypeError
        If an ``id`` value is missing or its fields cannot be converted to
        integers.
    """
    df = pd.read_csv(path_processed_dir / feat_file_name)

    # Split every id exactly once and reuse the pieces, instead of running three
    # separate Python-level ``apply`` passes that each re-split the same string.
    id_fields = df["id"].str.split("_")
    df["unix_date"] = id_fields.str[0].astype("int64")
    df["tool_no"] = id_fields.str[-2].astype("int64")
    df["index_no"] = id_fields.str[-1].astype("int64")

    df_labels = pd.read_csv(path_data_dir / "processed/cnc" / label_file_name)

    df = cnc_add_y_label_binary(df, df_labels, col_list_case=["case_tool_54"])
    df = df.dropna(axis=1, how="all")  # drop any columns that are completely empty
    df = df.dropna(axis=0)  # drop any rows that have NaN values in them
    return df

# Names of the feature CSV and the label CSV handed to load_cnc_features.
feat_file_name = "cnc_features_54_comp.csv"
label_file_name = "high_level_labels_MASTER_update2022-08-18_with_case.csv"

# Load the labelled feature frame, then preview its first rows.
df = load_cnc_features(
    path_data_dir=path_data_dir,
    path_processed_dir=path_processed_dir,
    feat_file_name=feat_file_name,
    label_file_name=label_file_name,
)
df.head()

Unnamed: 0,id,current__time_reversal_asymmetry_statistic__lag_1,current__time_reversal_asymmetry_statistic__lag_2,current__time_reversal_asymmetry_statistic__lag_3,current__c3__lag_1,current__c3__lag_2,current__c3__lag_3,current__cid_ce__normalize_True,current__cid_ce__normalize_False,current__symmetry_looking__r_0.0,...,"current__matrix_profile__feature_""max""__threshold_0.98","current__matrix_profile__feature_""mean""__threshold_0.98","current__matrix_profile__feature_""median""__threshold_0.98","current__matrix_profile__feature_""25""__threshold_0.98","current__matrix_profile__feature_""75""__threshold_0.98",unix_date,tool_no,index_no,case_tool_54,y
0,1540298934_54_1,-84437.69,-246271.1,-459912.2,1328925000.0,1327239000.0,1325897000.0,4.621212,2723.857742,0.0,...,21.892176,6.575366,5.699876,5.326978,6.176852,1540298934,54,1,1,0
1,1540298934_54_2,-6912358.0,-13106910.0,-19315070.0,6473963000.0,6450330000.0,6453858000.0,9.929669,8008.548807,0.0,...,7.838717,3.066461,1.857761,1.624711,4.336304,1540298934,54,2,1,0
2,1540298934_54_5,-270712.8,-452583.9,-518830.6,3376005000.0,3375504000.0,3373084000.0,4.38928,2097.344273,0.0,...,3.672813,2.370622,2.496121,1.804606,2.813998,1540298934,54,5,1,0
3,1540298934_54_3,-38746.79,18835.53,98456.39,2574682000.0,2572420000.0,2568122000.0,7.268473,2443.065083,0.0,...,3.613585,1.835852,1.862852,1.627676,2.047316,1540298934,54,3,1,0
4,1540298934_54_9,13164.37,143714.9,337513.7,415367100.0,411065400.0,403886200.0,26.105369,2815.061101,0.0,...,8.261546,5.619089,5.728254,4.895724,5.969812,1540298934,54,9,1,0


In [14]:
# Number of distinct values in the id column.
n_sub_cuts = len(df["id"].unique())
print("n_sub_cuts:", n_sub_cuts)

n_sub_cuts: 43679


In [15]:
# Number of rows carrying each y label; reused for both tables below.
y_counts = df.groupby('y').size()

# Share of all rows (in percent) per y label.
df_p = (y_counts / df.shape[0] * 100).rename('percentage').reset_index()

# Absolute row count per y label.
df_c = y_counts.rename('count').reset_index()

# Combine both views into one table, showing the percentage with only two
# decimal places.
df_pc = (
    df_p.merge(df_c, on='y')[['y', 'count', 'percentage']]
    .round({'percentage': 2})
)
df_pc

Unnamed: 0,y,count,percentage
0,0,42504,97.31
1,1,1175,2.69
