In [15]:
# -*- coding: utf-8 -*-
"""
Created on Sun Feb 28 16:58:28 2021

@author: hamdi
"""

import numpy as np
from scipy.spatial import distance
from sklearn.datasets import make_circles
from scipy.optimize import least_squares
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.spatial.distance import cdist
import pandas as pd
import random
from skmlm import MLM

In [2]:
# charger les données 
domain = pd.read_csv("Servo/servo.domain",delimiter=":", names=["column","type" ])
servo = pd.read_csv("Servo/servo.data",names=domain.column.to_list())

domain = pd.read_csv("Housing/housing.domain",delimiter=":", names=["column","type" ])
housing = pd.read_csv("Housing/housing.data",names=domain.column.to_list())

def preprocessing(dataset):
    # séparer les colonnes en deux types catégorielles et numériques
    cat_col=[col for col in dataset.columns if dataset[col].dtype=='object']
    num_col=[col for col in dataset.columns if dataset[col].dtype=='int64' or dataset[col].dtype=='float64']
    # garder uniquement les colonnes catégorielles et supprimer les valeurs manquantes 
    dataset = dataset[num_col].dropna(axis=0)
    # 10 pérmutation aléatoire pour mélanger les données 
    df_shuffled=dataset.sample(frac=1).reset_index(drop=True)
    for i in range(10):
        df_shuffled=df_shuffled.sample(frac=1).reset_index(drop=True)
    data = df_shuffled.to_numpy()
    size = data.shape[1]
    X = data[:,:size-1]
    y = data[:,size-1]
    y = y.reshape((len(y),1))
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=1/3,random_state=42)
    # centrer réduire
    mx,my = x_train.mean(axis=0),y_train.mean(axis=0)
    ecart1,ecart2 = np.sqrt(np.var(x_train)),np.sqrt(np.var(y_train))
    return (x_train-mx)/ecart1, (x_test-mx)/ecart1, (y_train-my)/ecart2, (y_test-my)/ecart2

In [6]:
np.empty((3, 0))

array([], shape=(3, 0), dtype=float64)

In [86]:
class KMLM():
    def __init__(self,rp_number=None,random_state=42):
        self.rp_number = rp_number
        if self.rp_number == None: self.rp_number = 0.1
        self.random_state = random_state
        
    def selectRP(self,X,y):
        n = len(X)
        if self.rp_number <= 1:
            self.rp_number = int(self.rp_number * n)
        y = y.reshape((n,1))
        r = np.random.RandomState(self.random_state)
        id_rp = r.choice(range(n),self.rp_number,replace=False)
        self.rp_x = X[id_rp]
        self.rp_y = y[id_rp]
        return self.rp_x,self.rp_y

    def pinv_(self,D_x):
        try:
            return np.linalg.inv(D_x.T @ D_x) @ D_x.T
        except Exception as e:
            return np.linalg.pinv(D_x)
    
    def fit(self,D_x,D_y):
        self.B = self.pinv_(D_x) @ D_y 

    def cost(self,y,x,d_x):
        d_y  = cdist(y.reshape((len(y),1)),self.rp_y)# elle seras modifier selon la méthode de calcul de la distance 
        print(d_y.shape,d_x.shape,self.B.shape)
        return ((d_y**2 - (d_x@self.B)**2)**2)[0]

    def optimse(self,x,d_x):
        J = lambda y: self.cost(y,x,d_x)
        out = least_squares(J,x0 =self.rp_y.mean(axis=0),method='lm')
        return out.x[0]

    def predict(self,X,D_x):
        return np.array([self.optimse(x,d_x) for x,d_x in zip(X,D_x)])
    




In [90]:
x_train, x_test, y_train, y_test = preprocessing(servo)
k = KMLM(0.1)
rp_x,rp_y=k.selectRP(x_train,y_train)
D_x,D_y = cdist(x_train,rp_x,"euclidean"),cdist(y_train.reshape((len(y_train),1)),rp_y,"euclidean")
k.fit(D_x,D_y)
D_x_test =  cdist(x_test,rp_x)
ypred1 = k.predict(x_test,D_x_test)

    
clf = MLM(0.3)
B1 = clf.fit(x_train,y_train)
ypred2 = clf.predict(x_test)
mean_squared_error(y_test,clf.predict(x_test))
D_x.shape

(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11

(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11

(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11

(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11,) (11, 11)
(1, 11) (11

(111, 11)

In [7]:
from sklearn.ensemble import RandomForestRegressor

In [62]:
regr = RandomForestRegressor(max_depth=100, random_state=0)
regr.fit(x_train,y_train.ravel())
mean_squared_error(y_test,regr.predict(x_test))

0.3794480004332866

In [63]:
X_dissim = np.empty((len(x_test), 0))
leaves = regr.apply(x_train)
X_leaves = regr.apply(x_test)
for xi in leaves:
    matches = X_leaves == np.reshape(xi, (1, len(xi)))
    dissim = np.sum(matches, axis=1) / regr.n_estimators
    X_dissim = np.append(X_dissim, np.reshape(dissim, (len(x_test), 1)), axis=1)

In [82]:
forest = RandomForestRegressor(max_depth=100, random_state=0)
forest.fit(x_train,y_train.ravel())
leaves_ = forest.apply(x_train)
a = leaves_[:, 0]
dissim_matrix_ = 1 * np.equal.outer(a, a)
for i in range(1, forest.n_estimators):
    a = leaves_[:, i]
    dissim_matrix_ += 1 * np.equal.outer(a, a)
dissim_matrix_1 = dissim_matrix_ / forest.n_estimators

leaves_ = forest.apply(x_test)
a = leaves_[:, 0]
dissim_matrix_ = 1 * np.equal.outer(a, a)
for i in range(1, forest.n_estimators):
    a = leaves_[:, i]
    dissim_matrix_ += 1 * np.equal.outer(a, a)
dissim_matrix_2 = dissim_matrix_ / forest.n_estimators

In [89]:
dissim_matrix_1.shape,dissim_matrix_2.shape

((111, 111), (56, 56))

In [87]:
k = KMLM(0.1)
rp_x,rp_y=k.selectRP(x_train,y_train)
k.fit(dissim_matrix_1,y_train)
k.predict(dissim_matrix_2,y_test)

(1, 11) (1,) (111, 1)


ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 111 is different from 1)