In [35]:
import numpy as np
import pandas as pd
import os

In [2]:
class MTS:   
    def __init__(self, ts):
             self.ts = ts
            
    def cov_mat(self, centering = True):
        X = self.ts
        if centering:
            X = (self.ts - (self.ts).mean(axis = 0))
        return X.transpose() @ X

In [3]:
class CPCA:
    def __init__(self, epsilon = 1e-5):
        self.cov = None
        self.epsilon = epsilon
        self.U = None
        self.V = None
        self.S = None
    
    def fit(self,listMTS):
        if (len(listMTS) > 0):
            P = listMTS[0].cov_mat().shape[1]
            cov_mat = [mat.cov_mat() for mat in listMTS]
            self.cov = sum(cov_mat)/len(cov_mat)
            #Add epsilon Id in order to ensure invertibility
            cov = self.cov + self.epsilon*np.eye(P)
            #Compute SVD
            U,S,V = np.linalg.svd(self.cov)
            #Save SVD
            self.U = U
            self.S = S
            self.V = V
        

    def pred(self, listMTS, ncp):
        predicted = []
        if (self.U is not None):
            predicted = [elem.ts @ self.U[:,:ncp] for elem in listMTS]
        return predicted
    
    def reconstitution_error(self, listMTS, ncp):
        mse = np.full(len(listMTS),np.inf)
        if (self.U is not None):
            prediction = self.pred(listMTS, ncp)
            reconstit = [elem @ ((self.U)[:,:ncp].transpose()) for elem in prediction]
            mse = [((listMTS[i].ts - reconstit[i])**2).sum() for i in range(len(prediction))]
        return mse

## MTS

In [7]:
#Import lp1 for test
import pandas as pd
res = [pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/robotfailure-mld/lp1.data", sep = "\t",skiprows=1+(18*i), nrows=15, header = None) for i in range(1,80)]
res = [MTS(elem.drop(columns = [0]).to_numpy()) for elem in res]

In [5]:
name = [pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/robotfailure-mld/lp1.data", sep = "\t",skiprows=(18*i), nrows=1, header = None) for i in range(1,80)]

In [6]:
name = [elem[0][0] for elem in name]

name_unique = list(np.unique(name))
dict_name = dict(zip(name_unique, list(range(len(name_unique)))))

gt_nb_cluster = np.array([dict_name.get(nom) for nom in name])

$$Pre $=\sum_{j=1}^{K} \underbrace{\frac{\left|C_{j}\right|}{N}}_{\text{prop_part}} \times \underbrace{\max _{i=1,2, \cdots, g} \frac{\left|G_{i} \cap C_{j}\right|}{\left|C_{j}\right|}}_{\text{max_part}}$$

In [33]:
class Mc2PCA:
    def __init__(self,K, ncp, itermax = 1000, conv_crit = 1e-5):
        self.K = K
        self.N = None
        self.ncp = ncp
        self.iter_max = itermax
        self.converged = False
        self.CPCA_final = None
        self.conv_crit = conv_crit
        self.pred = None
        
    def fit(self, X):
        N = len(X)
        #initialisation
        index_cluster = np.tile(np.arange(self.K), int(N/self.K) + 1)[:N]
        to_continue = True
        i = 0
        old_error = -1
        
        while to_continue:

            #Split all MTS according to the cluster 
            #we store it in a list of lists of MTS (each list inside the list corresponding to a cluster)
            MTS_by_cluster = [[X[i] for i in list(np.where(index_cluster == j)[0])] for j in range(self.K)]

            CPCA_by_cluster = [CPCA() for i in range(self.K)]

            #fit by cluster
            [CPCA_by_cluster[i].fit(MTS_by_cluster[i]) for i in range(self.K)]

            res = np.array([cpca.reconstitution_error(X, self.ncp) for cpca in CPCA_by_cluster])
            #Update index cluster
            index_cluster = res.argmin(axis = 0)

            #new total error 
            new_error = res.min(axis = 0).sum()
            to_continue = (abs(old_error - new_error) > self.conv_crit) & (self.iter_max > i)
            self.converged = np.abs(old_error - new_error) < self.conv_crit

            #Updata
            old_error = new_error 
            i += 1
        self.CPCA_final = CPCA_by_cluster
        self.pred = index_cluster
        return index_cluster
    
    def precision(self,gt_cluster):
        index_cluster = self.pred
        N = gt_cluster.shape[0]
        g = np.unique(gt_cluster)
        nb_g = g.shape[0]

        G = [np.where(gt_cluster == i)[0] for i in range(nb_g)]
        C = [np.where(index_cluster == i)[0] for i in range(self.K)]
        
        #to handle case where a cluster is empty
        max_part = list()
        for j in range(self.K):
            l = list()
            for i in range(nb_g):
                if len(C[j])!=0:
                    l.append([np.intersect1d(G[i],C[j]).shape[0]/C[j].shape[0]])
                else:
                    l.append(0)
            max_part.append(np.max(l))
        max_part = np.array(max_part)
        
        #max_part = np.array([max([np.intersect1d(G[i],C[j]).shape[0]/C[j].shape[0] for i in range(nb_g)]) for j in range(self.K)])
        prop_part = np.array([C[j].shape[0]/N for j in range(self.K)])
        return max_part.dot(prop_part)

### Search for best parameter ncp

In [57]:
def search_ncp(X,K,ncp_list,y_true):
    pres = np.zeros(ncp_list.shape[0])
    for i in range(len(ncp_list)):
        m = Mc2PCA(K,ncp_list[i])
        m.fit(X)
        pres[i] = m.precision(y_true)
    pre = np.max(pres)
    best_ncp = ncp_list[np.argmax(pres)]
    return best_ncp, pre

### Test on diabete data

In [64]:
path = 'datasets/diabete/diabetes-data/'
data_diabete = list()

for i in range(70):
    nb = str(i+1)
    data_pd = pd.read_csv(path+'data-'+nb.zfill(2), sep='\t', usecols = [1,2,3] , 
                       engine='python', header=None)
    data_diabete.append(data_pd.to_numpy())

In [70]:
res = [MTS(elem) for elem in data_diabete]
m = Mc2PCA(3,2)
m.fit(res)

TypeError: unsupported operand type(s) for /: 'str' and 'int'

### Test on activities data

In [75]:
path = 'datasets/activity_recognition/realistic_sensor_displacement/'
files = os.listdir(path)

files_subset = files[:3]

labels = list()
data_activity = list()
for f in files_subset:
    if '.log' in f:
        data_pd = pd.read_csv(path+f, sep='\t', engine='python', header=None)
        classes = data_pd[119]
        i=0
        while i < (len(classes)-1):
            start = i
            end = i
            d = list()
            labels.append(classes[i])
            while classes[i]==classes[i+1]:
                i += 1
                end = i
                if i==(len(classes)-1):
                    d.append(data_pd.iloc[start:end].drop(columns=[0,119]).to_numpy())
                    break
            d.append(data_pd.iloc[start:end].drop(columns=[0,119]).to_numpy())
            i +=1
            data_activity.append(d)

In [80]:
labels = np.array(labels)

data_activities = list()
for i in range(len(data_activity)):
    data_activities.append(data_activity[i][0])

In [78]:
res = [MTS(elem) for elem in data_activities]
m = Mc2PCA(33,6)
m.fit(res)

array([13, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 13, 13, 13, 15, 13,
       13, 11, 15, 13, 26, 11, 15, 13, 15, 26, 26, 13, 13, 13, 13, 13, 13,
       13, 15, 13, 26, 11, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
       13, 13, 13, 11, 22, 11,  9, 13, 26, 13, 11, 11, 26, 26, 26,  3,  3,
       11,  3, 11,  8,  8,  8,  8,  8,  8,  8, 12,  8, 14,  8,  8,  8,  8,
        8,  8,  8, 12,  8,  8,  8, 12,  8, 12,  8, 12,  8,  8,  8,  8,  8,
        8,  8,  8,  8, 12,  8, 12,  8,  8, 12, 12,  8, 12, 12, 12,  8, 12,
        8, 12,  8, 12, 12, 12,  8, 27,  8,  8,  8,  8,  8,  8,  8,  8, 12,
       12, 12])

In [81]:
m.precision(labels)

0.5434782608695652

In [82]:
ncp_list = np.arange(1,int(data_activities[0].shape[1]*3/4))
search_ncp(res,3,ncp_list,labels)

(47, 0.5217391304347826)

### Test on gas sensor data

In [58]:
path = 'datasets/wine_banana/HT_Sensor_UCIsubmission/'

metadata_pd = pd.read_csv(path+'HT_Sensor_metadata.dat', sep='\t', usecols=[0,2],
                          names=['id','class'], engine='python', skiprows=[0])
metadata_pd.head()

Unnamed: 0,id,class
0,0,banana
1,1,wine
2,2,wine
3,3,banana
4,4,wine


In [59]:
name = metadata_pd['class']

name_unique = list(np.unique(name))
dict_name = dict(zip(name_unique, list(range(len(name_unique)))))

labels = np.array([dict_name.get(nom) for nom in name])

In [60]:
data_pd = pd.read_csv(path+'HT_Sensor_dataset.dat', sep='  ', engine='python', skiprows=[0], header=None)

data_pd.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0,-0.99975,12.8621,10.3683,10.4383,11.6699,13.4931,13.3423,8.04169,8.73901,26.2257,59.0528
1,0,-0.999472,12.8617,10.3682,10.4375,11.6697,13.4927,13.3412,8.04133,8.73908,26.2308,59.0299
2,0,-0.999194,12.8607,10.3686,10.437,11.6696,13.4924,13.3405,8.04101,8.73915,26.2365,59.0093
3,0,-0.998916,12.8602,10.3686,10.437,11.6697,13.4921,13.3398,8.04086,8.73936,26.2416,58.9905
4,0,-0.998627,12.8595,10.3688,10.4374,11.6699,13.4919,13.339,8.04087,8.73986,26.2462,58.9736


In [61]:
ids = metadata_pd['id']
data_gas = list()

for i in range(ids.shape[0]):
    data_id = data_pd[data_pd[0]==ids[i]]
    data_id = data_id.drop(columns=[0])
    data_id = data_id[data_id[1] >= 0]
    data_gas.append(data_id.to_numpy())

In [31]:
res = [MTS(elem) for elem in data_gas]
m = Mc2PCA(3,1)
m.fit(res)

  X = (self.ts - (self.ts).mean(axis = 0))


array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2])

In [32]:
m.precision(labels)

0.37

In [63]:
ncp_list = np.arange(1,int(data_gas[0].shape[1]*3/4))
search_ncp(res,3,ncp_list,labels)

(7, 0.47)

### Test on sign language data

In [36]:
path = 'datasets/sign_language/tctodd/'

#list of lists, of all files per directory
directories = os.listdir(path)
directories = np.sort(directories)
files = list()
for direc in directories:
    f=os.listdir(path+direc)
    f = np.sort(f)
    f = [path+direc+'/'+name for name in f]
    files.append(f)

In [37]:
nb_signs = 95
sample_per_sign_per_file = 3
signs = list()

#create list of lists, of samples per signs (95 signs, and 27 samples per sign)
for i in range(nb_signs):
    sample_per_sign = list()
    for l in range(len(files)):
        for j in range(sample_per_sign_per_file):
            sample_per_sign.append(files[l][i*3+j])
    signs.append(sample_per_sign)

In [38]:
data_signs = list()

for i in range(len(signs)):
    for j in range(27):
        sign = pd.read_csv(signs[i][j], sep='\t', engine='python', header=None)
        data_signs.append(sign.to_numpy())

In [40]:
signs_names = list()

for i in range(nb_signs):
    for k in range(27):
        signs_names.append(signs[i][0].split('/')[-1][:-6])
        
unique_names = np.unique(signs_names)

dict_name = dict(zip(unique_names, list(range(len(unique_names)))))

labels = np.array([dict_name.get(nom) for nom in signs_names])

In [43]:
res = [MTS(elem) for elem in data_signs]
m = Mc2PCA(95,6)
m.fit(res)

array([28, 28, 28, ..., 62, 47, 47])

In [44]:
m.precision(labels)

0.08927875243664718

In [None]:
#trop long

ncp_list = np.arange(1,int(data_signs[0].shape[1]*3/4))
#search_ncp(res,95,ncp_list,labels)

### Test on pioneer data

In [45]:
path = 'datasets/pioneers/'

data_pd = pd.read_csv(path+'MOVE.DATA', sep=',', engine='python', header=None)
exp_names = np.unique(data_pd[0])

data_pioneers0 = list()
for i in range(exp_names.shape[0]):
    exp_n = data_pd[data_pd[0]==exp_names[i]]
    exp_n = exp_n.drop(columns=[0,1,2])
    data_pioneers0.append(exp_n.to_numpy())

#label 0: MOVE experience
labels_MOVE = np.zeros(len(data_pioneers0))

In [47]:
data_pd = pd.read_csv(path+'GRIPPER.DATA', sep=',', engine='python', header=None)
exp_names = np.unique(data_pd[0])

data_pioneers1 = list()
for i in range(exp_names.shape[0]):
    exp_n = data_pd[data_pd[0]==exp_names[i]]
    exp_n = exp_n.drop(columns=[0,1,2])
    data_pioneers1.append(exp_n.to_numpy())

#label 1: GRIPPER experience
labels_GRIP = np.ones(len(data_pioneers1))
labels = np.concatenate((labels_MOVE,labels_GRIP))
data_pioneers = data_pioneers0 + data_pioneers1

In [48]:
data_pd = pd.read_csv(path+'TURN.DATA', sep=',', engine='python', header=None)
exp_names = np.unique(data_pd[0])

data_pioneers2 = list()
for i in range(exp_names.shape[0]):
    exp_n = data_pd[data_pd[0]==exp_names[i]]
    exp_n = exp_n.drop(columns=[0,1,2])
    data_pioneers2.append(exp_n.to_numpy())

#label 2: TURN experience
labels_TURN = np.ones(len(data_pioneers2))*2
#final list of labels
labels = np.concatenate((labels,labels_TURN))
#final list of data
data_pioneers = data_pioneers + data_pioneers2

In [50]:
res = [MTS(elem) for elem in data_pioneers]
m = Mc2PCA(3,6)
m.fit(res)

array([1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 2, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 0, 1, 1, 2, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 2, 1, 2, 1, 2, 0, 2, 2, 1, 0, 1,
       0, 1, 2, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1,
       1, 1, 1, 1, 2, 1])

In [53]:
m.precision(labels)

0.6375

In [54]:
ncp_list = np.arange(1,int(data_pioneers[0].shape[1]*3/4))
search_ncp(res,3,ncp_list,labels)

(24, 0.675)