## Encoders

In [22]:
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

X = [ "le chat adore le poisson", "le chien aime la promenade", "le cheval adore galoper" ]
label_encoder = LabelEncoder()
X_labels = label_encoder.fit_transform(" ".join(X).split(" "))
X_labels

array([7, 2, 0, 7, 8, 7, 4, 1, 6, 9, 7, 3, 0, 5])

In [52]:
X = [ "bleu", "rouge", "vert", "rouge", "jaune", "orange", "bleu", "vert", "rouge", "rouge", "orange" ]
label_encoder = LabelEncoder()
X_labels = label_encoder.fit_transform(" ".join(X).split(" "))
print("Classes :", label_encoder.classes_)
print("Encodage par labels :", X_labels)

Classes : ['bleu' 'jaune' 'orange' 'rouge' 'vert']
Encodage par labels : [0 3 4 3 1 2 0 4 3 3 2]


In [8]:
X = np.asarray([ ["Paris"], ["Marseille"], ["Paris"], ["Nice"], ["Bordeaux"], ["Bordeaux"], ["Marseille"] ])
# Création de l'objet Label Encoder / l'option .ravel() signifie que X est un array en 1 dimension 
label_encoder = LabelEncoder()
X_labels = label_encoder.fit_transform(X.ravel())
print("Classes :", label_encoder.classes_)
print("Encodage par labels :", X_labels)
# Redimensionnement matriciel
X_labels = X_labels.reshape(len(X_labels), 1)
# Objet One Hot Encoder
one_hot_encoder = OneHotEncoder(sparse=False)
print("Encodage one-hot :")
print(one_hot_encoder.fit_transform(X_labels))

Classes : ['Bordeaux' 'Marseille' 'Nice' 'Paris']
Encodage par labels : [3 1 3 2 0 0 1]
Encodage one-hot :
[[0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]]


## RandomForestRegressor

In [83]:
import pandas as pd 
df = pd.read_csv("salary.dat",delim_whitespace=True)
print(len(df))
df.head(15)

52


Unnamed: 0,sx,rk,yr,dg,yd,sl
0,male,full,25,doctorate,35,36350
1,male,full,13,doctorate,22,35350
2,male,full,10,doctorate,23,28200
3,female,full,7,doctorate,27,26775
4,male,full,19,masters,30,33696
5,male,full,16,doctorate,21,28516
6,female,full,0,masters,32,24900
7,male,full,16,doctorate,18,31909
8,male,full,13,masters,30,31850
9,male,full,13,masters,31,32850


In [84]:
sx_label_encoder, rk_label_encoder, dg_label_encoder = LabelEncoder(), LabelEncoder(), LabelEncoder()
df.rk=rk_label_encoder.fit_transform(df.rk)
df.dg=dg_label_encoder.fit_transform(df.dg)
df.sx=sx_label_encoder.fit_transform(df.sx)
df.head(15)

Unnamed: 0,sx,rk,yr,dg,yd,sl
0,1,2,25,0,35,36350
1,1,2,13,0,22,35350
2,1,2,10,0,23,28200
3,0,2,7,0,27,26775
4,1,2,19,1,30,33696
5,1,2,16,0,21,28516
6,0,2,0,1,32,24900
7,1,2,16,0,18,31909
8,1,2,13,1,30,31850
9,1,2,13,1,31,32850


In [85]:
from sklearn.model_selection import train_test_split
#définition des features et de la target
features = ['sx', 'rk', 'yr', 'dg', 'yd']
target = ['sl']
#split 
x_train, x_test, y_train, y_test = train_test_split(df[features],df[target],test_size=0.2)

In [86]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape


((41, 5), (11, 5), (41, 1), (11, 1))

In [87]:
52-41

11

In [88]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state = 123)

from pprint import pprint
#affichage des parametres par defaut 
print('Parameters currently in use:\n')
pprint(rf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 123,
 'verbose': 0,
 'warm_start': False}


In [89]:
from sklearn.model_selection import RandomizedSearchCV

#nombre d'arbre dans la random forest
n_estimators = [int(x) for x in np.linspace(start = 2, stop = 100, num = 10)]
#nombre de features dans chaque split
max_features = ['auto', 'sqrt']
#profondeur : nombre de niveau dans l'arbre 
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
#nombre minimum de sample pour faire un split
min_samples_split = [2, 5, 10]
#nombre minimum de sample pour former une feuille 
min_samples_leaf = [1, 2, 4]
#methode de selection de sample pour chaque arbre 
bootstrap = [True, False]

#creation du grid 
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap
              }

pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [2, 12, 23, 34, 45, 56, 67, 78, 89, 100]}


In [90]:
#créons le modele de random forest 
rf = RandomForestRegressor(random_state = 42)
#random search of parameters, en utilisant 3 fold cross validation 
#recherche dans 100 differentes combinaisons et utilisation de tout les core du processeur 
rf_ex = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                              n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs=-1,
                              return_train_score=True)

#fit du modele 
target = np.array(y_train)
rf_ex.fit(x_train.values,target.ravel());

Fitting 3 folds for each of 100 candidates, totalling 300 fits


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(


In [91]:
rf_ex.best_params_

{'n_estimators': 12,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 90,
 'bootstrap': False}

In [92]:
rf_ex.best_estimator_

In [93]:
from sklearn.metrics import r2_score

y_pred = rf_ex.predict(x_test)
r2_score(y_test, y_pred)



0.48239421418656925

[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=12; total time=   0.0s
[CV] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=34; total time=   0.0s
[CV] END bootstrap=False, max_depth=60, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=23; total time=   0.0s
[CV] END bootstrap=False, max_depth=50, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=45; total time=   0.0s
[CV] END bootstrap=False, max_depth=90, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=34; total time=   0.0s
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=78; total time=   0.1s
[CV] END bootstrap=False, max_depth=50, max_feat

[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.2s
[CV] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=34; total time=   0.0s
[CV] END bootstrap=False, max_depth=60, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=23; total time=   0.0s
[CV] END bootstrap=False, max_depth=50, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=45; total time=   0.1s
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.1s
[CV] END bootstrap=False, max_depth=70, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=23; total time=   0.0s
[CV] END bootstrap=False, max_depth=110, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=89; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, max

[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=56; total time=   0.1s
[CV] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=False, max_depth=100, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=45; total time=   0.0s
[CV] END bootstrap=True, max_depth=70, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=12; total time=   0.0s
[CV] END bootstrap=True, max_depth=70, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=12; total time=   0.0s
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.1s
[CV] END bootstrap=False, max_depth=70, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=23; total time=   0.0s
[CV] END bootstrap=True, max_depth=20, max_f

In [42]:
from sklearn.cluster import KMeans
import numpy as np

# Générer des données aléatoires
X = np.random.rand(10, 2)

# Instancier le modèle de clustering
kmeans = KMeans(n_clusters=4)

# Fitter le modèle aux données
kmeans.fit(X)

# Prédire les clusters pour de nouvelles données
labels = kmeans.predict(X)

In [43]:
X

array([[0.82983399, 0.77343971],
       [0.95016455, 0.10728676],
       [0.88356244, 0.19697933],
       [0.42549005, 0.59121105],
       [0.58490909, 0.85332655],
       [0.64768723, 0.95451454],
       [0.33276923, 0.01561548],
       [0.1809302 , 0.408225  ],
       [0.3192006 , 0.41956561],
       [0.55750594, 0.93444432]])

In [44]:
labels

array([0, 2, 2, 1, 0, 0, 3, 1, 1, 0], dtype=int32)

In [48]:
from sklearn.decomposition import PCA
import numpy as np

# Générer des données aléatoires
X = np.random.rand(100, 10)

# Instancier le modèle de PCA
pca = PCA(n_components=3)

# Fitter le modèle aux données
pca.fit(X)

# Transformer les données pour obtenir les nouvelles dimensions
X_pca = pca.transform(X)

In [47]:
X.shape

(100, 10)

In [51]:
X_pca.shape

(100, 3)