In [1]:
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Load the preprocessed dataset (a dict with 'data' and 'labels' entries).
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
# The context manager guarantees the file handle is closed after loading.
with open('./data.pickle', 'rb') as f:
    data_dict = pickle.load(f)

# Check the type and size of the data
print(type(data_dict['data']))
print(len(data_dict['data']))  # Number of samples
print(data_dict['data'][0])    # Inspect the first sample

# Every sample must have exactly 42 features so that the samples stack into a
# rectangular 2-D array: longer samples are truncated, shorter ones zero-padded.
required_feature_length = 42
padded_data = []

for features in data_dict['data']:
    # list(...) copies the sample, so the loaded data is never aliased or
    # mutated, and non-list sequences are padded correctly as well.
    features = list(features)[:required_feature_length]
    features += [0] * (required_feature_length - len(features))
    padded_data.append(features)

# Convert to NumPy arrays for model training
data = np.asarray(padded_data)
labels = np.asarray(data_dict['labels'])

# Check the shape of the data to ensure it's correct
print(f"Data shape: {data.shape}")
assert data.shape == (len(labels), required_feature_length), \
    "feature matrix and labels are misaligned"

# Split the dataset into stratified training and testing sets.
# random_state is fixed so the split (and the reported score) is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.2, shuffle=True, stratify=labels, random_state=42
)

# Initialize the Random Forest Classifier (seeded for reproducible trees)
model = RandomForestClassifier(random_state=42)

# Train the model on the training data
model.fit(x_train, y_train)

# Predict class labels for the test set
y_predict = model.predict(x_test)

# Evaluate the accuracy of the model; scikit-learn metrics take (y_true, y_pred)
score = accuracy_score(y_test, y_predict)

# Print the accuracy score
# NOTE(review): the recorded run reported 100.0% accuracy -- worth confirming
# that no duplicated samples end up in both the train and test splits.
print(f'{score * 100}% of samples were classified correctly!')

# Save the trained model using pickle for future use
with open('model_rf.p', 'wb') as f:
    pickle.dump({'model': model}, f)


<class 'list'>
960
[0.3537756765930895, 1.0, 0.7227924074579333, 0.8875175446134456, 0.9252179359171128, 0.7023958667830529, 0.9727052097215105, 0.5329665359583543, 1.0, 0.39706305809208076, 0.6880827423264532, 0.4882206106053137, 0.7261915125199591, 0.28815793242666693, 0.7278568870437309, 0.1718507136191428, 0.7120400648038386, 0.07017430790174715, 0.46772982665031576, 0.4711318555906825, 0.5065093037209626, 0.2510784943367269, 0.513367690507336, 0.11227629204488405, 0.5079853846661685, 0.0, 0.2553716434713872, 0.4913190911023369, 0.25325435974076127, 0.28283104103818885, 0.279401995878775, 0.15733805812977059, 0.30407647226878287, 0.05417629641690373, 0.026825061768714797, 0.5456408702429788, 0.00634513243831746, 0.3872437280362546, 0.0007755778566241359, 0.28191272845687304, 0.0, 0.1881681463691884]
Data shape: (960, 42)
100.0% of samples were classified correctly!


In [3]:
# with open('model_rf.h5', 'wb') as f:
#     pickle.dump({'model': model}, f)

In [1]:
# model.save('model_rf.h5')

# Hyperparameter Tuning

In [10]:
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
import numpy as np

# Load the preprocessed dataset (a dict with 'data' and 'labels' entries).
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
# The context manager guarantees the file handle is closed after loading.
with open('./data.pickle', 'rb') as f:
    data_dict = pickle.load(f)

# Check the type and size of the data
print(type(data_dict['data']))
print(len(data_dict['data']))  # Number of samples
print(data_dict['data'][0])    # Inspect the first sample

# Every sample must have exactly 42 features so that the samples stack into a
# rectangular 2-D array: longer samples are truncated, shorter ones zero-padded.
required_feature_length = 42
padded_data = []

for features in data_dict['data']:
    # list(...) copies the sample, so the loaded data is never aliased or
    # mutated, and non-list sequences are padded correctly as well.
    features = list(features)[:required_feature_length]
    features += [0] * (required_feature_length - len(features))
    padded_data.append(features)

# Convert to NumPy arrays for model training
data = np.asarray(padded_data)
labels = np.asarray(data_dict['labels'])

# Check the shape of the data to ensure it's correct
print(f"Data shape: {data.shape}")
assert data.shape == (len(labels), required_feature_length), \
    "feature matrix and labels are misaligned"

# Split the dataset into stratified training and testing sets.
# random_state is fixed so the split (and the reported scores) is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.2, shuffle=True, stratify=labels, random_state=42
)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],             # Number of trees
    'max_depth': [None, 10, 20, 30],             # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],             # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],               # Minimum samples required at each leaf node
    'bootstrap': [True, False]                   # Whether to bootstrap samples
}

# Initialize the Random Forest Classifier
rf = RandomForestClassifier(random_state=42)

# Perform Grid Search
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,  # 3-fold cross-validation
    verbose=2,
    n_jobs=-1  # Use all available cores
)

# Fit the GridSearchCV to the training data
grid_search.fit(x_train, y_train)

# Print the best parameters and best score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_}")

# No extra training step is needed here: with GridSearchCV's default
# refit=True, best_estimator_ has already been refit on the whole training
# split using the best parameter combination.
best_model = grid_search.best_estimator_

# Predict class labels for the held-out test set
y_predict = best_model.predict(x_test)

# Evaluate the accuracy of the final model
final_score = accuracy_score(y_test, y_predict)
print(f'Final Test Accuracy: {final_score * 100:.2f}%')

# Save the best model using pickle
with open('best_model_rf.p', 'wb') as f:
    pickle.dump({'model': best_model}, f)


<class 'list'>
960
[0.3537756765930895, 1.0, 0.7227924074579333, 0.8875175446134456, 0.9252179359171128, 0.7023958667830529, 0.9727052097215105, 0.5329665359583543, 1.0, 0.39706305809208076, 0.6880827423264532, 0.4882206106053137, 0.7261915125199591, 0.28815793242666693, 0.7278568870437309, 0.1718507136191428, 0.7120400648038386, 0.07017430790174715, 0.46772982665031576, 0.4711318555906825, 0.5065093037209626, 0.2510784943367269, 0.513367690507336, 0.11227629204488405, 0.5079853846661685, 0.0, 0.2553716434713872, 0.4913190911023369, 0.25325435974076127, 0.28283104103818885, 0.279401995878775, 0.15733805812977059, 0.30407647226878287, 0.05417629641690373, 0.026825061768714797, 0.5456408702429788, 0.00634513243831746, 0.3872437280362546, 0.0007755778566241359, 0.28191272845687304, 0.0, 0.1881681463691884]
Data shape: (960, 42)
Fitting 3 folds for each of 216 candidates, totalling 648 fits
Best Parameters: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 

In [11]:
import pickle

# Save the model
with open('best_model_rf.pkl', 'wb') as file: