In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.svm import SVR,SVC
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.model_selection import train_test_split,GridSearchCV,StratifiedShuffleSplit,cross_val_score
from sklearn.datasets import make_classification,make_moons,make_circles,make_regression

sns.set_theme(style="darkgrid")
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
import time
from sklearn.metrics import accuracy_score, jaccard_score, precision_score
import warnings
import RFD_MLM as rfd_mlm
from sklearn.datasets import load_iris,load_wine,load_breast_cancer
warnings.filterwarnings('ignore')


In [None]:
# Colab-only cell: opens the upload dialog for the data files read by the
# loading cell further down (the saved output lists the *.data / *.domain
# files and the diabetes CSVs).
# `uploaded` is not referenced by any other cell visible in this notebook.
from google.colab import files 
import io
uploaded = files.upload()

Saving abalone.data to abalone.data
Saving abalone.domain to abalone.domain
Saving diabetes - Copie.csv to diabetes - Copie.csv
Saving diabetes.csv to diabetes.csv
Saving housing.data to housing.data
Saving housing.domain to housing.domain
Saving price.data to price.data
Saving price.domain to price.domain
Saving servo.data to servo.data
Saving servo.domain to servo.domain


In [None]:
# Colab-only cell: second upload dialog; per the saved output it is used to
# upload RFD_MLM.py.
# NOTE(review): the first cell already runs `import RFD_MLM as rfd_mlm`, so on
# a fresh runtime this upload has to happen before that import can succeed —
# confirm the intended cell order.
from google.colab import files 
import io
uploaded = files.upload()

Saving RFD_MLM.py to RFD_MLM.py


In [None]:
def to_pandas(X,y):
    """Pack a feature matrix and its target vector into one DataFrame.

    Parameters
    ----------
    X : numpy.ndarray
        2-D array with one row per sample.
    y : numpy.ndarray
        Target values; reshaped into a single column of ``len(X)`` rows.

    Returns
    -------
    pandas.DataFrame
        The columns of ``X`` followed by ``y`` as the last column, labelled
        ``c0``, ``c1``, ... in order.
    """
    n_rows = len(X)
    target_column = y.reshape((n_rows, 1))
    stacked = np.concatenate((X, target_column), axis=1)
    # One label per feature column plus one for the appended target.
    labels = [f"c{idx}" for idx in range(X.shape[1] + 1)]
    return pd.DataFrame(data=stacked, columns=labels)

# Four regression datasets. Each "<name>.domain" file is parsed as
# colon-separated fields; its "column" field supplies the header names used
# when reading the matching "<name>.data" file.
domain = pd.read_csv("abalone.domain", delimiter=":", names=["column", "type"])
abalone = pd.read_csv("abalone.data", names=domain["column"].tolist())

domain = pd.read_csv("price.domain", delimiter=":", names=["column", "type", "unknow"])
price = pd.read_csv("price.data", names=domain["column"].tolist())

domain = pd.read_csv("housing.domain", delimiter=":", names=["column", "type"])
housing = pd.read_csv("housing.data", names=domain["column"].tolist())

domain = pd.read_csv("servo.domain", delimiter=":", names=["column", "type"])
servo = pd.read_csv("servo.data", names=domain["column"].tolist())

# Four classification datasets: three scikit-learn loaders (features and
# target packed into one frame by to_pandas) plus the diabetes CSV.
cancer = to_pandas(*load_breast_cancer(return_X_y=True))
iris = to_pandas(*load_iris(return_X_y=True))
wine = to_pandas(*load_wine(return_X_y=True))
diabetes = pd.read_csv("diabetes.csv")




def split_and_norm(X,y,name,test_size=1/3):
    """Split ``X``/``y`` into train and test parts and standardise the features.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, one row per sample.
    y : numpy.ndarray
        Target vector aligned with the rows of ``X``.
    name : str
        ``"c"`` selects the classification path: the classes are balanced by
        random under-sampling, then a single stratified shuffle split is
        drawn. Any other value uses a plain random split.
    test_size : float, default 1/3
        Fraction of the samples held out for testing (honoured by both paths).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where both feature matrices
        are centred and scaled with the mean / standard deviation of the
        training part only.
    """
    if name == "c":
        # Balance the class proportions before splitting.
        rus = RandomUnderSampler(random_state=42)
        X, y = rus.fit_resample(X, y)
        # Use the caller's test_size here too (it used to be fixed at 1/3).
        sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=42)
        train_idx, test_idx = next(sss.split(X, y))
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
    else:
        # The plain split is only needed (and only computed) for this path.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42)

    # Standardise with statistics of the training part only.
    mx = X_train.mean(axis=0)
    ecart1 = X_train.std(axis=0)
    # A feature that is constant in the training part has zero spread; divide
    # by 1 instead so the scaled data does not fill with NaN / inf.
    ecart1[ecart1 == 0] = 1.0
    return (X_train - mx) / ecart1, (X_test - mx) / ecart1, y_train, y_test

def preprocessing(dataset):
    """Turn a DataFrame into a shuffled ``(X, y)`` pair of NumPy arrays.

    Only columns whose dtype is exactly ``int64`` or ``float64`` are kept
    (every other column is discarded), and rows with missing values in the
    kept columns are dropped.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input table; its last kept column is used as the target.

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` holds every kept column except the last one, ``y``
        is the last kept column.
    """
    # Keep the numeric columns only.
    num_col = [col for col in dataset.columns
               if dataset[col].dtype == 'int64' or dataset[col].dtype == 'float64']
    # Drop the rows that have a missing value in any of them.
    dataset = dataset[num_col].dropna(axis=0)

    # Ten successive shuffles, each with the same fixed seed.
    for _ in range(10):
        dataset = shuffle(dataset, random_state=0)

    data = dataset.to_numpy()
    # Last column is the target, everything before it is the feature matrix.
    return data[:, :-1], data[:, -1]


In [None]:
def rf_test(dataset,name):
    """Grid-search a random forest on ``dataset`` and score it on the test split.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Passed to ``preprocessing`` to obtain the feature matrix and target.
    name : str
        ``'r'`` for regression (``RandomForestRegressor``, scored with mean
        squared error) or ``'c'`` for classification
        (``RandomForestClassifier``, scored with accuracy).

    Returns
    -------
    float
        Test-set MSE for ``'r'``, test-set accuracy for ``'c'``.

    Raises
    ------
    ValueError
        If ``name`` is neither ``'r'`` nor ``'c'``.
    """
    # Fail early with a clear message instead of an UnboundLocalError on
    # `grid` after the data has already been processed.
    if name not in ('r', 'c'):
        raise ValueError("name must be 'r' (regression) or 'c' (classification), got %r" % (name,))

    X, y = preprocessing(dataset)  # pre-process the data
    # Split the data and standardise the features.
    x_train, x_test, y_train, y_test = split_and_norm(X, y, name, test_size=1/3)

    # `criterion` is not part of the grid: the single values searched before
    # ('mse' / 'gini') are the estimators' defaults, and the spelling 'mse'
    # was removed from scikit-learn in 1.2 (renamed 'squared_error' in 1.0).
    param_grid = {'max_depth': [10, 20, 40, 60, 80, 100],
                  'min_samples_split': [2, 5, 10, 15],
                  'n_estimators': [100, 150, 200, 250, 300, 350, 400]}

    # Fixed seed so that repeated runs give the same forest and score.
    if name == 'r':
        estimator, score = RandomForestRegressor(random_state=42), mean_squared_error
    else:
        estimator, score = RandomForestClassifier(random_state=42), accuracy_score

    # Train: exhaustive search over the grid, then predict with the best model.
    grid = GridSearchCV(estimator, param_grid)
    grid.fit(x_train, y_train)
    pred = grid.best_estimator_.predict(x_test)
    return score(y_test, pred)

In [None]:
# `ds_regression` is otherwise only defined in a later cell, so on a fresh
# kernel (Restart & Run All) this loop would stop with a NameError. Build the
# same name -> DataFrame mapping here so the cell runs in notebook order.
ds_regression = {"servo":servo,"price":price,"housing":housing,"abalone":abalone}

# Tuned random-forest test MSE for every regression dataset.
for name, ds in ds_regression.items():
    print("Dataset "+name+" ################################")
    print(rf_test(ds,"r"))

Dataset servo ################################
0.6142018901910943
Dataset price ################################
3023859.219288678
Dataset housing ################################
14.379006172255101
Dataset abalone ################################
5.175021811847339


In [None]:
# Datasets grouped by task; the keys are the names printed by the benchmark loops.
ds_regression = dict(servo=servo, price=price, housing=housing, abalone=abalone)
ds_classification = dict(iris=iris, wine=wine, diabetes=diabetes, cancer=cancer)
# Score accumulators, one list per task, filled by the comparison loops.
RFMLM = {task: [] for task in ("regression", "classification")}
RF = {task: [] for task in ("regression", "classification")}


In [None]:
# RFD-MLM vs. a default random forest on every regression dataset: five
# random train/test splits per dataset, mean test MSE appended to the
# RFMLM / RF accumulators. The loop previously covered only the price dataset,
# although the accumulators are meant to hold one entry per regression
# dataset (as the classification loop does for its datasets).
for name, ds in ds_regression.items():
    print("Dataset "+name+" ################################")
    X, y = preprocessing(ds)
    l1, l2 = [], []
    for i in range(5):
        # Seed the split and the forest with the repetition index: the five
        # repetitions still differ from each other but a re-run reproduces them.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=1/3, random_state=i)
        # Standardise with statistics of the training part only.
        mx, ecart = X_train.mean(0), X_train.std(0)
        X_train, X_test = (X_train-mx)/ecart, (X_test-mx)/ecart
        # NOTE(review): RFDMLM_Regressor is built with its defaults; whether it
        # accepts a seed is not visible from this notebook.
        clf = rfd_mlm.RFDMLM_Regressor()
        clf.fit(X_train, y_train)
        rf = RandomForestRegressor(random_state=i)
        rf.fit(X_train, y_train)

        l1.append(mean_squared_error(y_test, clf.predict(X_test)))
        l2.append(mean_squared_error(y_test, rf.predict(X_test)))
    RFMLM["regression"].append(np.mean(l1))
    RF["regression"].append(np.mean(l2))

    print("RFMLM results : ", RFMLM["regression"][-1])
    print("RF results : ", RF["regression"][-1])


Dataset price ################################
RFMLM results :  10402140.547713151
RF results :  3973929.214945833


In [None]:
# RFD-MLM vs. a default random forest on every classification dataset: five
# random train/test splits per dataset, mean test accuracy appended to the
# RFMLM / RF accumulators.
for name, ds in ds_classification.items():
    print("Dataset "+name+" ################################")
    X, y = preprocessing(ds)
    rus = RandomUnderSampler(random_state=42)  # balance the class proportions
    X, y = rus.fit_resample(X, y)
    l1, l2 = [], []
    for i in range(5):
        # Seed the split and the forest with the repetition index: the five
        # repetitions still differ from each other but a re-run reproduces them.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=1/3, random_state=i)
        # Standardise with statistics of the training part only.
        mx, ecart = X_train.mean(0), X_train.std(0)
        X_train, X_test = (X_train-mx)/ecart, (X_test-mx)/ecart
        # NOTE(review): RFDMLM_Classifier is built with its defaults; whether
        # it accepts a seed is not visible from this notebook.
        clf = rfd_mlm.RFDMLM_Classifier()
        clf.fit(X_train, y_train)
        rf = RandomForestClassifier(random_state=i)
        rf.fit(X_train, y_train)

        l1.append(accuracy_score(y_test, clf.predict(X_test)))
        l2.append(accuracy_score(y_test, rf.predict(X_test)))
    RFMLM["classification"].append(np.mean(l1))
    RF["classification"].append(np.mean(l2))

    print("RFMLM results : ", RFMLM["classification"][-1])
    print("RF results : ", RF["classification"][-1])


Dataset iris ################################
RFMLM results :  0.9359999999999999
RF results :  0.9359999999999999
Dataset wine ################################
RFMLM results :  0.9625
RF results :  0.975
Dataset diabetes ################################
RFMLM results :  0.7396648044692737
RF results :  0.7508379888268156
Dataset cancer ################################
RFMLM results :  0.9690140845070422
RF results :  0.9633802816901408


In [None]:
 # NOTE(review): `Counter` is not used by any cell visible in this notebook.
 from collections import Counter
# Display both accumulators as a tuple (RF first, then RFMLM); each maps a
# task name to the list of mean scores appended by the comparison loops.
RF,RFMLM

({'classification': [0.9359999999999999,
   0.975,
   0.7508379888268156,
   0.9633802816901408],
  'regression': [1.1498193166748405,
   5746245.372060936,
   13.509830106508872,
   4.801612950466618]},
 {'classification': [0.9359999999999999,
   0.9625,
   0.7396648044692737,
   0.9690140845070422],
  'regression': [1.151710651545938,
   30016305.74791224,
   12.197427018143403,
   5.15553863953056]})

In [None]:
# Quick check: untuned random forest on the servo dataset, scored by test MSE.
X, y = preprocessing(servo)
X_train, X_test, y_train, y_test = split_and_norm(X, y, 'r')
clf = RandomForestRegressor()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
# MSE is symmetric in its two arguments; (y_true, y_pred) order is used here
# to match the other cells.
mean_squared_error(y_test, pred)

0.5914366840174965

In [None]:
# NOTE(review): `sr` is not assigned in any cell visible in this notebook, so
# this cell raises NameError on a fresh kernel; the saved output must come
# from earlier kernel state. Define `sr` explicitly or remove the cell.
sr

[(0.6341922839553853, 0.6145731263797768),
 (3597756.054615723, 3133466.796971698),
 (5.743125314363903, 5.337364249820532),
 (11.941279854856484, 13.445936473372788)]

In [None]:
# Hyper-parameter grid for the random-forest regressor search.
# `criterion` is deliberately not listed: the only value searched was 'mse',
# which is the regressor's default and was renamed 'squared_error' in
# scikit-learn 1.0 (the 'mse' spelling was removed in 1.2). Leaving it out
# runs the same search on every scikit-learn version.
param_grid = {'max_depth': [10, 20, 40, 60, 80, 100],
              'min_samples_split': [2, 5, 10, 15],
              'n_estimators': [100, 150, 200, 250, 300, 350, 400]}

In [None]:
# Regression baseline: for each dataset, search the current `param_grid` with
# a random-forest regressor and print the test MSE of the best estimator.
for name, ds in ds_regression.items():
    print(f"Dataset {name} #####################################")
    X, y = preprocessing(ds)
    X_train, X_test, y_train, y_test = split_and_norm(X, y, 'r')
    grid = GridSearchCV(RandomForestRegressor(), param_grid)
    grid.fit(X_train, y_train)
    pred = grid.best_estimator_.predict(X_test)
    print(mean_squared_error(y_test, pred))

Dataset servo #####################################
0.614681682875987
Dataset price #####################################
3030979.3377699293
Dataset housing #####################################
14.009710023298803
Dataset abalone #####################################
5.17021641860689


In [None]:
# Classification baseline: rebind `param_grid` to the classifier grid, then for
# each dataset search it with a random-forest classifier and print the test
# accuracy of the best estimator.
param_grid = dict(
    max_depth=[10, 20, 40, 60, 80, 100],
    criterion=['gini'],
    min_samples_split=[2, 5, 10, 15],
    n_estimators=[100, 150, 200, 250, 300, 350, 400],
)
for name, ds in ds_classification.items():
    print(f"Dataset {name} #####################################")
    X, y = preprocessing(ds)
    X_train, X_test, y_train, y_test = split_and_norm(X, y, 'c')
    grid = GridSearchCV(RandomForestClassifier(), param_grid)
    grid.fit(X_train, y_train)
    pred = grid.best_estimator_.predict(X_test)
    print(accuracy_score(y_test, pred))

Dataset iris #####################################
0.96
Dataset wine #####################################
0.9583333333333334
Dataset diabetes #####################################
0.7932960893854749
Dataset cancer #####################################
0.9577464788732394


## Compare RFD_MLM and RF in high dimensionality

In [None]:
# Test accuracy of the RFD-MLM classifier on synthetic problems of growing
# size: step i uses 400*i samples and 40*i features, 2 of them informative.
s = []
for i in range(1, 31):
    X, y = make_classification(n_samples=i*400, n_features=i*40, n_redundant=0,
                               n_informative=2, n_classes=3,
                               random_state=42, n_clusters_per_class=1)
    # Round-trip through a DataFrame so the shared preprocessing is applied.
    df = to_pandas(X, y)
    X, y = preprocessing(df)
    X_train, X_test, y_train, y_test = split_and_norm(X, y, "c")

    rfd_c = rfd_mlm.RFDMLM_Classifier()
    rfd_c.fit(X_train, y_train)
    s.append(accuracy_score(y_test, rfd_c.predict(X_test)))
    # The score belongs to the RFD-MLM model, not to a random forest, so the
    # printed label says so (it used to read "RF : ").
    print("RFD_MLM : ", s[-1])


RF :  0.8257575757575758
RF :  0.8458646616541353
RF :  0.8517587939698492
RF :  0.832391713747646
RF :  0.8954545454545455
RF :  0.9459119496855346
RF :  0.8258064516129032
RF :  0.7768361581920904
RF :  0.9382303839732888
RF :  0.837593984962406
RF :  0.8824333561175667
RF :  0.9755485893416928
RF :  0.8212420197330238
RF :  0.8641975308641975
RF :  0.8471943887775552
RF :  0.8352112676056338
RF :  0.9194333776007083
RF :  0.8604262432093607
RF :  0.8912183544303798
RF :  0.8613935969868173
RF :  0.8444762245262781
RF :  0.9137872049264454
RF :  0.957530218882718
RF :  0.8181818181818182
RF :  0.9368231046931408
RF :  0.9246100519930676
RF :  0.8970219871973282
RF :  0.8943982846421871
RF :  0.916882444329363
RF :  0.8864377036851341


In [None]:
 # Generate a small synthetic classification set and show its first feature row.
 X, y = make_classification(
     n_samples=200,
     n_features=10,
     n_redundant=0,
     n_informative=2,
     n_classes=3,
     random_state=42,
     n_clusters_per_class=1,
 )
 X[0]

array([ 0.01035262,  0.65511414, -0.1869713 ,  1.52522257, -0.6095122 ,
       -0.3052247 ,  0.55781031, -1.06511366, -1.31183623,  1.39020755])

In [None]:
# NOTE(review): leftover inspection cell. In the cells visible here `rfd_mlm`
# is the imported RFD_MLM module, yet the saved output shows an
# RFDMLM_Classifier instance, so the name was rebound in kernel state that is
# no longer part of the notebook — verify or remove this cell.
rfd_mlm

<RFD_MLM.RFDMLM_Classifier at 0x7f79224e3f90>

In [None]:
# Test MSE of the RFD-MLM regressor on synthetic problems of growing size:
# step i uses 100*i samples and 10*i features, 2 of them informative.
s_r = []
for i in range(1, 30):
    X, y = make_regression(n_samples=i*100, n_features=i*10, n_informative=2,
                           random_state=42)
    # Round-trip through a DataFrame so the shared preprocessing is applied.
    df = to_pandas(X, y)
    X, y = preprocessing(df)
    X_train, X_test, y_train, y_test = split_and_norm(X, y, "r")

    rfd_c = rfd_mlm.RFDMLM_Regressor()
    rfd_c.fit(X_train, y_train)
    s_r.append(mean_squared_error(y_test, rfd_c.predict(X_test)))
    # The score belongs to the RFD-MLM model, not to a random forest, so the
    # printed label says so (it used to read "RF : ").
    print("RFD_MLM : ", s_r[-1])

RF :  29.68511240097921
RF :  69.57437242398458
RF :  989.4040119175385
RF :  688.0442581593887
RF :  819.4488601020369
RF :  634.0161792389123
RF :  171.97592864756328
RF :  18.222135354525076
RF :  123.16725341044071
RF :  100.04829454532984
RF :  160.55691619250024
RF :  117.48715071426872
RF :  68.70789279537149
RF :  124.54294086276398
RF :  30.647246430409506
RF :  58.89883266748135
RF :  64.70174407111634
RF :  12.505911136707095
RF :  18.950361871805768
RF :  84.79346440501247
RF :  303.5664263275131
RF :  29.12822026855631
RF :  49.610707318100204
RF :  127.60446748134241
RF :  211.6011854274029
RF :  36.767163706737314
RF :  11.767203934452208
RF :  98.00849368388984
RF :  62.168630713209694
