In [1]:
# -*- coding: utf-8 -*-
"""
Created on Sun Feb 28 16:58:28 2021

@author: hamdi
"""

import numpy as np
from scipy.spatial import distance
from sklearn.datasets import make_circles
from scipy.optimize import least_squares
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd



In [2]:
# Load the data: the ".domain" file is parsed as ":"-separated (column, type)
# pairs, and its "column" values are used as the header of the data file.
domain = pd.read_csv("Servo/servo.domain", sep=":", names=["column", "type"])
servo = pd.read_csv("Servo/servo.data", names=domain["column"].tolist())

# Function that prepares the data
def preprocessing(dataset, random_state=None):
    """Turn a raw DataFrame into standardized (X, Y) arrays.

    Steps:
      1. keep only the ``int64`` / ``float64`` columns (categorical columns
         are discarded, not encoded);
      2. drop every row that contains a missing value;
      3. shuffle the rows once;
      4. standardize every remaining column with ``StandardScaler``;
      5. use the last numeric column as the target and the others as inputs.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw data; the target is expected to be the last numeric column.
    random_state : int, numpy RandomState/Generator or None, optional
        Seed forwarded to ``DataFrame.sample`` for the shuffle. The default
        (``None``) keeps the previous behaviour: a different order on every
        call. Pass an integer to make the notebook reproducible.

    Returns
    -------
    X : numpy.ndarray, shape (n_rows, n_numeric_columns - 1)
        Standardized inputs.
    Y : numpy.ndarray, shape (n_rows,)
        Standardized target.

    Notes
    -----
    The scaler is fitted on every row passed in (inputs and target alike).
    If the caller splits the result into train/test afterwards, the test rows
    have already influenced the scaling statistics.
    """
    # Numeric columns only; anything else (e.g. object/string columns) is dropped.
    num_col = [col for col in dataset.columns
               if dataset[col].dtype in ('int64', 'float64')]
    numeric = dataset[num_col].dropna(axis=0)

    # A single random permutation of the rows; seeded when random_state is given.
    df_shuffled = numeric.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Center and scale every column (target included).
    data = StandardScaler().fit_transform(df_shuffled)

    # Split into inputs (all but the last column) and output (last column).
    X = data[:, :-1]
    Y = data[:, -1]
    return X, Y


##################### the MLM class  ###################################
class MLM_kernel:
    """Minimal Learning Machine (MLM) regressor with a pluggable distance.

    Training picks K reference points among the N training samples, builds
    the input-space distance matrix ``D_x`` and the output-space distance
    matrix ``D_y`` (both N x K) and fits a linear map ``B`` (K x K) such that
    ``D_x @ B ~= D_y``. To predict, the distances from a new input to the
    reference inputs are mapped through ``B`` to estimated output-space
    distances, and an output whose distances to the reference outputs match
    that estimate is searched with ``scipy.optimize.least_squares``.

    Parameters
    ----------
    compute_dist : callable
        ``compute_dist(A, R, dist)`` must return the pairwise distances
        between the rows of ``A`` and the rows of ``R`` as an array of shape
        ``(len(A), len(R))``.
    rp_number : int or float, optional
        Number of reference points. Values ``<= 1`` are read as a fraction of
        the training set, values ``> 1`` as an absolute count. ``None`` means
        ``0.1`` (10 % of the samples).
    dist : str, optional
        Distance name forwarded as third argument to ``compute_dist``.
    random_state : int, optional
        Seed of the ``numpy.random.RandomState`` used to draw reference points.

    Attributes set by ``fit``
    -------------------------
    n_rp_ : int
        Number of reference points actually used (K). ``rp_number`` itself is
        left untouched so the same estimator can be refitted on data of a
        different size.
    rp_X, rp_y : numpy.ndarray
        Reference inputs / outputs (K rows).
    D_x, D_y : numpy.ndarray
        Training-to-reference distance matrices.
    B : numpy.ndarray
        Fitted distance-regression coefficients.
    X_red, y_red : float
        ``1 - K / N``.
    """

    def __init__(self,compute_dist,rp_number=None,dist="euclidean",random_state=42):
        self.random_state = random_state
        self.compute_dist = compute_dist
        self.dist = dist
        # None -> 10 % of the samples; a value <= 1 is a fraction, > 1 a count.
        self.rp_number = 0.1 if rp_number is None else rp_number

    def pinv_(self, X):
        """Return the Moore-Penrose pseudo-inverse of ``X``.

        ``np.linalg.pinv`` is used directly instead of forming
        ``inv(X.T @ X) @ X.T``: both agree when ``X`` has full column rank,
        but the explicit normal-equation inverse only fails (and falls back)
        for exactly singular matrices and silently loses accuracy for
        ill-conditioned ones.
        """
        return np.linalg.pinv(X)

    def _resolve_rp_count(self, n_samples):
        """Translate ``rp_number`` into a reference-point count for ``n_samples`` rows."""
        if self.rp_number <= 0:
            raise ValueError("rp_number must be positive, got %r" % (self.rp_number,))
        if self.rp_number <= 1:
            count = int(self.rp_number * n_samples)
        else:
            count = int(self.rp_number)
        if count > n_samples:
            raise ValueError(
                "rp_number=%r asks for %d reference points but only %d samples are available"
                % (self.rp_number, count, n_samples)
            )
        # A small fraction of a small dataset can truncate to 0; keep at least one point.
        return max(1, count)

    def select_RPs(self):
        """Choose the reference points and build the two distance matrices.

        Reads ``self.X`` / ``self.y`` (set by ``fit``) and sets ``n_rp_``,
        ``rp_X``, ``rp_y``, ``D_x`` and ``D_y``.
        """
        N = self.X.shape[0]
        self.n_rp_ = self._resolve_rp_count(N)

        if self.n_rp_ == N:
            # Every training point is a reference point.
            rp_id = np.arange(N)
        else:
            # Otherwise draw K distinct rows at random (seeded).
            r = np.random.RandomState(self.random_state)
            rp_id = r.choice(N, self.n_rp_, replace=False)

        # Reference points in input and output space.
        self.rp_X = self.X[rp_id, :]
        self.rp_y = self.y[rp_id, :]
        # Distances from every training point to every reference point.
        self.D_x = self.compute_dist(self.X, self.rp_X, self.dist)
        self.D_y = self.compute_dist(self.y, self.rp_y, self.dist)

    def fit(self, X, y):
        """Fit the distance-regression matrix ``B``.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
        y : array-like, shape (n_samples,) or (n_samples, n_outputs)
            1-D targets are treated as a single output column.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` and ``y`` have a different number of rows, or if
            ``rp_number`` is not compatible with the number of samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if y.ndim == 1:
            y = y.reshape(-1, 1)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                "X and y have inconsistent lengths: %d != %d" % (X.shape[0], y.shape[0])
            )
        self.X = X
        self.y = y
        self.select_RPs()  # builds D_x and D_y

        # B solves D_x @ B ~= D_y in the least-squares sense. Going through
        # the pseudo-inverse covers K < N (over-determined system) as well as
        # K = N (square D_x); K > N cannot occur because the K reference
        # points are drawn without replacement from the N training rows.
        self.B = self.pinv_(self.D_x) @ self.D_y

        # 1 - K/N (B is K x K, so both attributes hold the same value).
        # Nothing else in this class reads them.
        self.X_red = 1 - self.B.shape[0] / self.X.shape[0]
        self.y_red = 1 - self.B.shape[1] / self.y.shape[0]
        return self

    def predict(self, X, y=None):
        """Predict one value per row of ``X``.

        Only the first output component of each solution is returned, so the
        result always has shape ``(len(X),)``. ``y`` is ignored.
        """
        X = np.asarray(X)
        return np.array([self.get_output(x)[0] for x in X])

    def get_output(self, x):
        """Estimate the output vector for a single input sample ``x``.

        The search starts from the mean of the reference outputs and uses the
        Levenberg-Marquardt solver of ``scipy.optimize.least_squares``
        (``method='lm'``), which needs at least as many residuals (one per
        reference point) as output dimensions.
        """
        x = np.asarray(x)
        # The estimated output-space distances depend on x only, so compute
        # them once instead of at every evaluation made by the solver.
        d_x = self.compute_dist(x[np.newaxis], self.rp_X, self.dist)
        d_hat = d_x @ self.B
        J = lambda y: self.in_cost(y, x, d_hat)
        out = least_squares(J, x0=self.rp_y.mean(axis=0), method='lm')
        return out.x

    def in_cost(self, y, x, d_hat=None):
        """Cost vector of a candidate output ``y`` for the input ``x``.

        Parameters
        ----------
        y : numpy.ndarray, shape (n_outputs,)
            Candidate output.
        x : numpy.ndarray, shape (n_features,)
            Input sample; only used when ``d_hat`` is not supplied.
        d_hat : numpy.ndarray, shape (1, K), optional
            Pre-computed estimated output distances (``d_x @ B``).

        Returns
        -------
        numpy.ndarray, shape (K,)
            ``(d_y**2 - d_hat**2)**2`` for each reference point, where
            ``d_y`` is the distance from ``y`` to each reference output.
        """
        # Make y a (1, n_outputs) matrix for compute_dist.
        y = np.array([y])

        if d_hat is None:
            d_x = self.compute_dist(x[np.newaxis], self.rp_X, self.dist)
            d_hat = d_x @ self.B
        d_y = self.compute_dist(y, self.rp_y, self.dist)

        # NOTE(review): least_squares minimizes the sum of squares of the
        # vector returned here, so squaring the differences first makes the
        # effective objective a sum of fourth powers. Kept as-is to preserve
        # the published results; confirm whether the un-squared differences
        # were intended.
        return ((d_y**2 - d_hat**2)**2)[0]

    
    
########################### function that computes the distances between points #####################################
def compute_dist(X,u,methode="euclidean"):
    """Pairwise distances between the observations of ``X`` and those of ``u``.

    Parameters
    ----------
    X : array-like, shape (n, d) or (d,)
        Observations (one per row), or a single observation as a vector.
    u : array-like, shape (k, d)
        Reference observations, one per row.
    methode : str, optional
        One of ``"euclidean"``, ``"cosine"``, ``"manhatan"`` (historical
        spelling, kept for existing callers), ``"manhattan"`` or
        ``"braycurtis"``.

    Returns
    -------
    numpy.ndarray
        Shape ``(n, k)`` when ``X`` is a matrix, shape ``(k,)`` when ``X`` is
        a single vector.

    Raises
    ------
    AssertionError
        If ``methode`` is not one of the supported names.
    """
    # Public name -> scipy metric name understood by cdist.
    metrics = {
        'euclidean': 'euclidean',
        'cosine': 'cosine',
        'manhatan': 'cityblock',
        'manhattan': 'cityblock',
        'braycurtis': 'braycurtis',
    }
    assert methode in metrics, (
        "unknown distance %r; expected one of %s" % (methode, sorted(metrics))
    )

    X = np.asarray(X, dtype=float)
    u = np.asarray(u, dtype=float)

    # cdist computes the whole matrix in compiled code instead of one Python
    # call per pair of observations.
    if X.ndim == 1:
        # Single observation: compute one row and return it as a vector.
        return distance.cdist(X[np.newaxis, :], u, metric=metrics[methode])[0]
    return distance.cdist(X, u, metric=metrics[methode])




In [1]:

############################## test ##############################################
# Evaluate the MLM regressor on the Servo data with each supported distance.
x, y = preprocessing(servo)
# MLM_kernel.select_RPs slices the targets with two indices, so give y a
# trailing output axis: (n_samples,) -> (n_samples, 1).
y = y.reshape((len(y), 1))
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1/3, random_state=42)

# One model per distance, each using 30 % of the training rows as reference
# points. `clf` ends up bound to the last model fitted (braycurtis).
predictions = {}
for dist_name in ("euclidean", "manhatan", "cosine", "braycurtis"):
    clf = MLM_kernel(compute_dist, 0.3, dist=dist_name)
    clf.fit(x_train, y_train)
    predictions[dist_name] = clf.predict(x_test)

ypred1 = predictions["euclidean"]
ypred2 = predictions["manhatan"]
ypred3 = predictions["cosine"]
ypred4 = predictions["braycurtis"]

# mean_squared_error is documented as (y_true, y_pred); the value is the same
# either way for MSE, but keep the documented order.
print("With euclidean distance :", mean_squared_error(y_test, ypred1))
print("With manhatan distance :", mean_squared_error(y_test, ypred2))
print("With cosinus distance :", mean_squared_error(y_test, ypred3))
print("With braycurtis distance :", mean_squared_error(y_test, ypred4))

With euclidean distance : 0.45110125048589794
With manhatan distance : 0.40639191985965467
With cosinus distance : 1.0588464636534964
With braycurtis distance : 0.5207760479598439


In [None]:
# Load the column names, then the Abalone dataset itself using those names.
domain = pd.read_csv("Abalone/abalone.domain",delimiter=":", names=["column","type" ])
abalone = pd.read_csv("Abalone/abalone.data",names=domain.column.to_list())

x, y = preprocessing(abalone)
# MLM_kernel.select_RPs slices the targets with two indices, so give y a
# trailing output axis: (n_samples,) -> (n_samples, 1).
y = y.reshape((len(y), 1))
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1/3, random_state=42)

# One model per distance, each using 30 % of the training rows as reference
# points. `clf` ends up bound to the last model fitted (manhatan).
predictions = {}
for dist_name in ("euclidean", "manhatan"):
    clf = MLM_kernel(compute_dist, 0.3, dist=dist_name)
    clf.fit(x_train, y_train)
    predictions[dist_name] = clf.predict(x_test)

ypred1 = predictions["euclidean"]
ypred2 = predictions["manhatan"]

# mean_squared_error is documented as (y_true, y_pred); the value is the same
# either way for MSE, but keep the documented order.
print("With euclidean distance :", mean_squared_error(y_test, ypred1))
print("With manhatan distance :", mean_squared_error(y_test, ypred2))