In [1]:
'''
Random Forest With PCA

'''
# It is assumed that the dataset is imported and the input features are stored
# in a numpy array ("features") and the target variable (containing classes)
# is a numpy array named "outputs"

# Loading dataset
from sklearn.datasets import load_iris
data = load_iris()
# Features array
features = data['data']
# Outputs array
outputs = data['target']


# Importing all the needed libraries
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# PCA object (for dimensionality reduction)
pca = PCA(n_components=0.99)
# Normalization object (for normalizing the data)
scaler = StandardScaler()
# Cross validation object
kf = KFold(n_splits=10, shuffle=True)
# Creating grid hyperparameters for a Random Forest Classifier
gridPars = [{
    'max_depth': [5, None],
    'n_estimators': [10, 50],  
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2,5],
    'min_samples_leaf': [1,5],
    'max_features': ['auto','log2','sqrt'],
    'max_samples': [0.7,None]
             }]
# Creating a GridSearchCV object for identifying the best hyper-parameters
grid = GridSearchCV(RandomForestClassifier(), gridPars, refit=True, verbose=3,n_jobs=-1) 
# Empty vectors for true and predicted outputs
testPred = np.array([])
testReal = np.array([])
# Splitting the data in a CV loop
for trainInd, testInd in kf.split(range(features.shape[0])):
    # Splitting the data
    X0 = features[testInd].copy() # Test inputs
    y0 = outputs[testInd].copy() # Test outputs
    X = features[trainInd].copy() # Tarin inputs
    y = outputs[trainInd].copy() # Train outputs
    # Normalization
    X = scaler.fit_transform(X)
    X0 = scaler.transform(X0)
    # Dimensionality reduction using PCA
    X = pca.fit_transform(X)
    X0 = pca.transform(X0)
    # Finding and training the best model
    grid.fit(X, y)
    # Predicting the test data
    y_hat = grid.predict(X0)
    # Adding the real and predicted values from the current iteration
    # of k-fold cross validation to the cumaltive vectors
    testPred = np.concatenate((testPred, y_hat))
    testReal = np.concatenate((testReal, y0))
# Classification metrics using the real and predited values
metrics = classification_report(testReal, testPred)
confMat = confusion_matrix(testReal, testPred)
print('\n','Accuracy: \n', str(metrics))
print('\n','Confusion Matrix: \n', str(confMat))

Fitting 5 folds for each of 192 candidates, totalling 960 fits
Fitting 5 folds for each of 192 candidates, totalling 960 fits
Fitting 5 folds for each of 192 candidates, totalling 960 fits


KeyboardInterrupt: 