<a href="https://colab.research.google.com/github/kkokay07/genomicclass/blob/master/ML_Random_forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Random Forest Classification - Practical Implementation

## Brief Introduction
Random Forest is an ensemble learning method that combines multiple decision trees to improve prediction accuracy and control overfitting. Key features:
- Uses bootstrap sampling (random sampling with replacement)
- Creates multiple decision trees
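- Combines the trees' predictions through voting (scikit-learn averages each tree's predicted class probabilities and picks the most probable class)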
- Provides feature importance ranking

In [None]:
# Step 1: Import Required Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

# Fix NumPy's global seed so random draws repeat from run to run
SEED = 42
np.random.seed(SEED)

In [None]:
# Step 2: Load and Preview Data
# Read the CSV into a DataFrame (replace the path with your own data file)
data = pd.read_csv('your_data.csv')

# Report the table size, then show its leading rows
print("Dataset Shape:", data.shape)
print("\nFirst few rows:")
preview = data.head()
display(preview)

# Count the missing entries in every column
missing_per_column = data.isna().sum()
print("\nMissing Values:")
print(missing_per_column)

In [None]:
# Step 3: Data Preprocessing

# Name of the label column. It is defined once so that adapting the notebook
# to another dataset means editing a single line.
TARGET_COLUMN = 'target'  # Replace 'target' with your target column name

# Fail early, with a message that says what to change, when the column is absent.
if TARGET_COLUMN not in data.columns:
    raise KeyError(
        "Target column {!r} not found in data; set TARGET_COLUMN to the name "
        "of your label column.".format(TARGET_COLUMN)
    )

# Separate features (every other column) and target
X = data.drop(columns=TARGET_COLUMN)
y = data[TARGET_COLUMN]

# Encode the target as integer class ids; label_encoder.classes_ keeps the
# original values so the ids can be mapped back later.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split data into training (80%) and testing (20%) sets.
# stratify=y asks for the class proportions to be preserved in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

In [None]:
# Step 4: Create and Train Random Forest Model

# Hyperparameters collected in one mapping so they are easy to scan and tweak
rf_params = dict(
    n_estimators=100,       # number of trees in the forest
    max_features='sqrt',    # features considered at each split
    max_depth=None,         # maximum depth of each tree
    min_samples_split=2,    # minimum samples needed to split an internal node
    min_samples_leaf=1,     # minimum samples required at a leaf node
    bootstrap=True,         # fit each tree on a bootstrap sample
    random_state=42,        # fixed seed for reproducibility
    n_jobs=-1,              # use all available cores
)
rf_model = RandomForestClassifier(**rf_params)

# Fit the forest on the training split
rf_model.fit(X_train, y_train)

In [None]:
# Step 5: Make Predictions and Evaluate

# Make predictions
y_pred = rf_model.predict(X_test)

# Encoded class ids (0 .. n_classes-1) in the encoder's order. Passing them
# explicitly keeps the matrix and the report aligned with
# label_encoder.classes_ even if a rare class is missing from the test split.
class_ids = np.arange(len(label_encoder.classes_))

# classes_ holds the original target values, which may be numeric;
# classification_report needs display names that are strings, otherwise it
# raises a TypeError.
class_names = [str(name) for name in label_encoder.classes_]

# Create confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

# Plot confusion matrix, labelling the ticks with the original class names
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred,
                            labels=class_ids,
                            target_names=class_names))

In [None]:
# Step 6: Feature Importance Analysis

# Pair every feature name with its importance score, highest score first
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

# The ten highest-ranked rows feed both the chart and the printout
top_features = feature_importance.head(10)

# Horizontal bar chart of the leading features
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='importance', y='feature', data=top_features, ax=ax)
ax.set_title('Top 10 Most Important Features')
ax.set_xlabel('Feature Importance')
fig.tight_layout()
plt.show()

# Print importance values
print("\nTop 10 Feature Importances:")
print(top_features)

In [None]:
# Step 7: Save Model (Optional)
import joblib

# Persist the fitted model and the label encoder, one file each
for artifact, filename in (
    (rf_model, 'random_forest_model.joblib'),
    (label_encoder, 'label_encoder.joblib'),
):
    joblib.dump(artifact, filename)

print("Model and label encoder saved successfully!")