#   RandomForestClassifier

Imports

In [3]:
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import warnings
import os
import random

Set random seed for reproducibility

In [4]:
# Seed both Python's `random` module and NumPy's global RNG: scikit-learn
# estimators created without an explicit `random_state` draw from NumPy's
# global generator, which `random.seed` alone does not touch.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

Update plot parameters

In [5]:
# Apply the seaborn theme first: `sns.set_theme` rewrites matplotlib's rcParams
# (font sizes included), so a font size set before it would be overwritten.
sns.set_theme(color_codes=True)
plt.rcParams.update({'font.size': 25})

Ignore warnings

In [6]:
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.simplefilter('ignore')

List the files in the input directory (a Kaggle convention; redundant here because the data is loaded from a local path)

In [7]:
# Walk the Kaggle input tree and print the full path of every file found.
# Nothing is printed when the directory does not exist.
for dirname, _, filenames in os.walk('/kaggle/input'):
    paths = (os.path.join(dirname, name) for name in filenames)
    for path in paths:
        print(path)


Load training and test datasets

In [8]:
# Directory holding Train.csv and Test.csv. It can be overridden through the
# CHUTE_DATA_DIR environment variable so the notebook is not tied to a single
# machine; the default is the original location.
data_dir = os.environ.get("CHUTE_DATA_DIR", "C:/Users/dawou/OneDrive/Bureau/ML/ChuteDetc")
train_df = pd.read_csv(os.path.join(data_dir, "Train.csv"))
test_df = pd.read_csv(os.path.join(data_dir, "Test.csv"))

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/Lina/Desktop/Machine Learning/Train.csv'

Drop the unnecessary 'Unnamed: 0' column

In [None]:
# Remove the 'Unnamed: 0' column from both frames, modifying them in place.
for frame in (train_df, test_df):
    frame.drop(columns=['Unnamed: 0'], inplace=True)

Separate features and target variable from training and test data

In [None]:
# 'fall' is the prediction target; 'label' is excluded from the inputs as well.
non_feature_cols = ['fall', 'label']
X_train, y_train = train_df.drop(columns=non_feature_cols), train_df['fall']
X_test, y_test = test_df.drop(columns=non_feature_cols), test_df['fall']

Function to calculate mutual information scores

In [None]:
def make_mi_scores(X, y, random_state=42):
    """Rank the columns of ``X`` by mutual information with the class labels ``y``.

    Args:
        X: Feature DataFrame; its column names index the result. Every column
            is treated as continuous (``discrete_features=False``).
        y: Class labels aligned with the rows of ``X``.
        random_state: Seed forwarded to the estimator so repeated calls give
            the same scores.

    Returns:
        A ``pandas.Series`` named "MI Scores", indexed by column name and
        sorted from the highest score to the lowest.
    """
    # The target is a class label, so the classification estimator is the
    # appropriate one; `mutual_info_regression` is meant for a continuous target.
    from sklearn.feature_selection import mutual_info_classif

    scores = mutual_info_classif(
        X, y, discrete_features=False, random_state=random_state
    )
    ranked = pd.Series(scores, name="MI Scores", index=X.columns)
    return ranked.sort_values(ascending=False)

Calculate mutual information scores for training data

In [None]:
# Rank the training features by their mutual information with the target.
mi_scores = make_mi_scores(X=X_train, y=y_train)

Function to plot mutual information scores

In [None]:
def plot_utility_scores(scores):
    """Draw ``scores`` as a horizontal bar chart on the current matplotlib axes.

    Bars are ordered by ascending score, so the lowest score sits at
    position 0, and each bar is labelled with the corresponding index entry
    of ``scores``. The chart title is set here; nothing is returned.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores(overall feature)")

Plot mutual information scores

In [None]:
# Open a fresh 8x5 inch figure, label its axes, then draw the ranked scores on it.
plt.figure(figsize=(8, 5), dpi=100)
plt.ylabel("Feature")
plt.xlabel("Score")
plot_utility_scores(mi_scores)

Standardize the features

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transform to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Define parameter grid for RandomizedSearchCV

In [None]:
# Candidate values sampled by RandomizedSearchCV for each hyperparameter.
n_estimators = [200, 400, 600, 800, 1000]
# 'auto' is no longer accepted by recent scikit-learn releases, and for a
# classifier it was only an alias of 'sqrt'; 'log2' gives the search a second,
# genuinely different option.
max_features = ['sqrt', 'log2']
max_depth = [None, 10, 30, 50, 70]  # None lets each tree grow without a depth limit
min_samples_split = [2, 5, 9, 12]
min_samples_leaf = [1, 3, 5, 7]
bootstrap = [True, False]
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}
print(random_grid)

Initialize Random Forest classifier and RandomizedSearchCV

In [None]:
# Give the base estimator a fixed seed: the search's own `random_state` only
# controls which parameter combinations are sampled, not how each forest is grown.
rf = RandomForestClassifier(random_state=42)
# Try 100 sampled combinations from `random_grid`, each scored with 5-fold
# cross-validation, using all available cores.
rf_random = RandomizedSearchCV(estimator=rf,
                               param_distributions=random_grid,
                               n_iter=100, cv=5,
                               verbose=2,
                               random_state=42,
                               n_jobs=-1)

Fit RandomizedSearchCV

In [None]:
# Run the randomized search on the standardized training data.
rf_random.fit(X=X_train, y=y_train)

Get the best parameters from RandomizedSearchCV

In [None]:
# Bare expression: in a notebook this displays the best parameter combination
# found by the search; run as a plain script it has no visible effect.
rf_random.best_params_

Function to evaluate model performance

In [None]:
def evaluate(model, test_features, test_labels):
    """Print and return the accuracy of ``model`` on a labelled set, in percent.

    Args:
        model: Object exposing ``predict``; called with ``test_features``.
        test_features: Inputs handed to ``model.predict``.
        test_labels: Expected labels, compared element-wise with the predictions.

    Returns:
        The fraction of predictions equal to their label, multiplied by 100.
    """
    predicted = model.predict(test_features)
    n_correct = (predicted == test_labels).sum()
    accuracy = n_correct / test_labels.shape[0] * 100
    print('Model Performance')
    print(f'Accuracy = {accuracy:0.2f}%.')
    return accuracy

Get the best parameters from the randomized search

In [None]:
# Work on a copy: `params` is edited during the follow-up sweep, and aliasing
# `best_params_` directly would silently overwrite the search result.
params = dict(rf_random.best_params_)

Define new ranges for min_samples_split and min_samples_leaf

In [None]:
# Candidate values for the follow-up sweep over the two split/leaf settings.
# Note that `min_samples_leaf` rebinds the name used when building the search grid.
min_split, min_samples_leaf = [2, 4, 6, 8, 12], list(range(1, 6))

Lists to store results

In [None]:
# Parallel lists filled by the sweep: x holds min_samples_split values,
# y holds min_samples_leaf values, acc holds the matching accuracies.
x, y, acc = [], [], []
highest_accuracy = 0  # best accuracy seen so far, in percent

Grid search with new ranges for min_samples_split and min_samples_leaf

In [None]:
# Sweep every (min_samples_split, min_samples_leaf) pair on top of the
# randomized-search result, recording each pair together with its accuracy.
# NOTE(review): accuracy is measured on the test split, so the highest value is
# an optimistic estimate; selecting on a held-out validation split would avoid that.
from itertools import product

for split, leaf in product(min_split, min_samples_leaf):
    # Build a fresh parameter dict per candidate instead of mutating `params`,
    # and fix the seed (unless `params` already provides one) so reruns grow
    # the same forests.
    candidate = {
        'random_state': 42,
        **params,
        'min_samples_split': split,
        'min_samples_leaf': leaf,
    }
    model = RandomForestClassifier(**candidate)
    model.fit(X_train, y_train)
    accuracy = evaluate(model, X_test, y_test)
    acc.append(accuracy)
    x.append(split)
    y.append(leaf)
    highest_accuracy = max(highest_accuracy, accuracy)

Print the highest accuracy obtained

In [None]:
# Report the best accuracy reached during the sweep (unrounded).
print("The highest accuracy obtained is: {}%.".format(highest_accuracy))