# Ensemble Classification

### 1. Setup and Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
sns.set_style(style='darkgrid')

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier, StackingClassifier
import lightgbm as lgb

from custom_stacking import CustomStackingClassifier
import time
import os

In [None]:
import warnings
# Suppress every warning for the remainder of the session so cell output stays compact.
# NOTE(review): this is a blanket filter — it also hides any warnings the modelling
# libraries emit through the `warnings` module; narrow it when debugging.
warnings.filterwarnings('ignore')

In [None]:
# Single source of truth for reproducibility: reused as `random_state` by the
# split and by every estimator below, and applied to NumPy's global generator here.
SEED = 42
np.random.seed(seed=SEED)

### 2. Data Preparation and Splitting

In [None]:
DATA_PATH = os.path.join(os.path.dirname(os.getcwd()), 'data', 'RHMD_Engineered.csv')
df = pd.read_csv(DATA_PATH)

In [None]:
mentalhealth_df = df[df['subreddit'] == 'mentalhealth'].copy()
df = df[df['subreddit'] != 'mentalhealth'].copy()

In [None]:
print(f"Number of posts from specific mental health subreddits: {len(df)}")
print(f"Number of general mentalhealth posts to be classified: {len(mentalhealth_df)}")

In [None]:
# Class balance of the labelled (non-'mentalhealth') posts.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(y='subreddit', data=df, ax=ax)
ax.set_title('Distribution of Posts Across Specific Mental Health Subreddits')
ax.set_xlabel('Count')
ax.set_ylabel('Subreddit')
fig.tight_layout()
plt.show()

In [None]:
feature_columns = [
    'title_sentiment', 'text_sentiment', 'overall_sentiment', 'textblob_sentiment', 'textblob_subjectivity',
    'negative_emotion', 'positive_emotion', 'sadness', 'fear', 'death', 'suffering', 'pain', 'shame', 'violence', 'love', 'optimism',
    'title_word_count', 'text_word_count',
    'dominant_topic', 'topic_contribution',
    'distress_score'
]

In [None]:
existing_features = [col for col in feature_columns if col in df.columns]
print(f"Using {len(existing_features)} features for prediction:")
print(existing_features)

In [None]:
X = df[existing_features]
y = df['subreddit']

In [None]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [None]:
print("\nLabel encoding mapping:")
for i, subreddit in enumerate(label_encoder.classes_):
    print(f"{subreddit} -> {i}")

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=SEED, stratify=y_encoded)

In [None]:
print(f"\nTraining set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

In [None]:
X_mentalhealth = mentalhealth_df[existing_features]
print(f"Mentalhealth posts feature set shape: {X_mentalhealth.shape}")

### 3. Feature Engineering for Different Model Architectures

In [None]:
# Standardise features. The scaler is fitted on the training split only and then
# applied to the test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# PCA on the standardised data, capped at 15 components (or fewer if fewer
# features are available). Fitted on the training split only.
pca = PCA(n_components=min(15, len(existing_features)))
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [None]:
# Univariate ANOVA F-test selection on the unscaled features, keeping at most 20.
# NOTE(review): X_train_selected / X_test_selected are not referenced by any later
# cell visible in this file; only the selected feature names are printed below.
selector = SelectKBest(f_classif, k=min(20, len(existing_features)))
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

In [None]:
# Translate the selector's column indices back into feature names for display.
selected_indices = selector.get_support(indices=True)
selected_features = [existing_features[i] for i in selected_indices]
print("\nTop selected features based on ANOVA F-test:")
print(selected_features)

In [None]:
# Report the total variance retained by the fitted PCA, then chart it per component.
explained_variance = pca.explained_variance_ratio_
print("\nPCA explained variance ratio:")
print(sum(explained_variance))

component_numbers = range(1, len(explained_variance) + 1)
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(component_numbers, explained_variance)
ax.set_xlabel('Principal Component')
ax.set_ylabel('Explained Variance Ratio')
ax.set_title('Explained Variance by Principal Components')
ax.set_xticks(component_numbers)
fig.tight_layout()
plt.show()

In [None]:
X_train_nn = X_train_scaled.copy()
X_test_nn = X_test_scaled.copy()

In [None]:
X_train_svm = X_train_scaled.copy()
X_test_svm = X_test_scaled.copy()

In [None]:
X_train_rf = X_train.copy()
X_test_rf = X_test.copy()

In [None]:
X_train_lgb = X_train.copy()
X_test_lgb = X_test.copy()

In [None]:
X_train_lr = X_train_pca.copy()
X_test_lr = X_test_pca.copy()

### 4. Base Model Training and Evaluation

In [None]:
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit `model`, score it on the held-out data and print a one-line summary.

    Args:
        model: Estimator exposing `fit(X, y)` and `predict(X)`.
        X_train: Training features passed to `fit`.
        X_test: Features passed to `predict`.
        y_train: Training targets.
        y_test: Ground-truth targets used for scoring.
        model_name: Label used in the printed summary.

    Returns:
        Tuple `(model, accuracy, f1, train_time, y_pred)` where `model` is the
        same (now fitted) object, `f1` is the weighted F1 score, and
        `train_time` is the wall-clock duration of `fit` in seconds.
    """
    fit_started_at = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - fit_started_at  # covers fitting only, not prediction

    y_pred = model.predict(X_test)
    accuracy, f1 = (
        accuracy_score(y_test, y_pred),
        f1_score(y_test, y_pred, average='weighted'),
    )

    summary = f"{model_name} - Accuracy: {accuracy:.4f}, F1: {f1:.4f}, Time: {train_time:.2f}s"
    print(summary)

    return model, accuracy, f1, train_time, y_pred

In [None]:
# Base learners, keyed by the display name used in the results tables and plots.
# Every estimator shares SEED so repeated runs are comparable.
models = {
    "Neural Network": MLPClassifier(
        hidden_layer_sizes=(100, 50),
        activation='relu',
        alpha=0.0001,
        max_iter=300,
        random_state=SEED
    ),
    "SVM": SVC(
        C=1.0,
        kernel='rbf',
        probability=True,  # enables predict_proba
        random_state=SEED
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        min_samples_split=2,
        random_state=SEED
    ),
    # Same LightGBM configuration as the voting and stacking ensembles below.
    # The original keyword `minnum_leaves` was a misspelling: LightGBM does not
    # recognise it, so `num_leaves=68` was never applied to this model.
    "LightGBM": lgb.LGBMClassifier(
        n_estimators=500,
        learning_rate=0.01,
        max_depth=8,
        colsample_bytree=0.65,
        min_child_samples=13,
        num_leaves=68,
        reg_alpha=10,
        reg_lambda=5.5,
        subsample=1,
        random_state=SEED
    ),
    "Logistic Regression": LogisticRegression(
        C=1.0,
        max_iter=1000,
        random_state=SEED
    )
}

In [None]:
results = {}
predictions = {}

In [None]:
for name, model in models.items():
    if name == "Neural Network":
        trained_model, acc, f1, time_taken, preds = evaluate_model(
            model, X_train_nn, X_test_nn, y_train, y_test, name
        )
    elif name == "SVM":
        trained_model, acc, f1, time_taken, preds = evaluate_model(
            model, X_train_svm, X_test_svm, y_train, y_test, name
        )
    elif name == "Random Forest":
        trained_model, acc, f1, time_taken, preds = evaluate_model(
            model, X_train_rf, X_test_rf, y_train, y_test, name
        )
    elif name == "LightGBM":
        trained_model, acc, f1, time_taken, preds = evaluate_model(
            model, X_train_lgb, X_test_lgb, y_train, y_test, name
        )
    elif name == "Logistic Regression":
        trained_model, acc, f1, time_taken, preds = evaluate_model(
            model, X_train_lr, X_test_lr, y_train, y_test, name
        )

    results[name] = {
        'model': trained_model,
        'accuracy': acc,
        'f1': f1,
        'time': time_taken
    }
    predictions[name] = preds

In [None]:
base_results = pd.DataFrame({
    'Model': list(results.keys()),
    'Accuracy': [results[m]['accuracy'] for m in results],
    'F1 Score': [results[m]['f1'] for m in results]
})

In [None]:
base_results = base_results.sort_values('Accuracy', ascending=False)

In [None]:
plt.figure(figsize=(12, 6))
sns.barplot(x='Accuracy', y='Model', data=base_results)
plt.title('Base Model Accuracy Comparison')
plt.xlabel('Accuracy')
plt.tight_layout()
plt.show()

In [None]:
# Companion chart to the accuracy plot in the previous cell. This cell used to be
# an exact copy of it, which left the 'F1 Score' column of `base_results`
# uncharted; it now shows the weighted F1 score per base model.
plt.figure(figsize=(12, 6))
sns.barplot(x='F1 Score', y='Model', data=base_results)
plt.title('Base Model F1 Score Comparison')
plt.xlabel('F1 Score')
plt.tight_layout()
plt.show()

### 5. Bagging With SVM

In [None]:
bagging_svm = BaggingClassifier(
    estimator=SVC(probability=True, random_state=SEED),
    n_estimators=10,
    max_samples=0.8,
    max_features=0.8,
    bootstrap=True,
    bootstrap_features=False,
    random_state=SEED
)

In [None]:
bagging_svm, acc_svm, f1_svm, time_svm, preds_svm = evaluate_model(
    bagging_svm, X_train_svm, X_test_svm, y_train, y_test, "Bagging with SVM"
)

In [None]:
print(f"Bagging with SVM - Accuracy: {acc_svm:.4f}, F1: {f1_svm:.4f}")

### 6. Voting

In [None]:
# Soft-voting ensemble definition over the four probabilistic base learners.
lgb_voting_params = dict(
    n_estimators=500,
    learning_rate=0.01,
    max_depth=8,
    colsample_bytree=0.65,
    min_child_samples=13,
    num_leaves=68,
    reg_alpha=10,
    reg_lambda=5.5,
    subsample=1,
    random_state=SEED,
)
voting_members = [
    ('nn', MLPClassifier(hidden_layer_sizes=(100, 50), activation='relu', random_state=SEED)),
    ('svm', SVC(probability=True, random_state=SEED)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=SEED)),
    ('lgb', lgb.LGBMClassifier(**lgb_voting_params)),
]
voting_clf = VotingClassifier(estimators=voting_members, voting='soft')

In [None]:
# Fit each voting member separately, because every model is trained on its own
# feature representation (scaled for NN/SVM, raw for RF/LightGBM).
nn_model = MLPClassifier(hidden_layer_sizes=(100, 50), activation='relu', random_state=SEED)
nn_model.fit(X_train_nn, y_train)

svm_model = SVC(probability=True, random_state=SEED)
svm_model.fit(X_train_svm, y_train)

rf_model = RandomForestClassifier(n_estimators=100, random_state=SEED)
rf_model.fit(X_train_rf, y_train)

# Hyper-parameters now match the 'lgb' entry declared in `voting_clf`. The original
# keyword `minum_leaves` was a misspelling that LightGBM does not recognise, so
# `num_leaves=68` was never applied, and `min_child_samples=13` was missing.
lgb_model = lgb.LGBMClassifier(
    n_estimators=500,
    learning_rate=0.01,
    max_depth=8,
    colsample_bytree=0.65,
    min_child_samples=13,
    num_leaves=68,
    reg_alpha=10,
    reg_lambda=5.5,
    subsample=1,
    random_state=SEED,
)
lgb_model.fit(X_train_lgb, y_train)

In [None]:
# Record the separately fitted members on the ensemble definition. `estimators`
# must remain a list of (name, estimator) tuples; writing bare models into it, as
# this cell used to, breaks that structure for anything that inspects the object.
# The former `voting_clf._fitted = True` flag is dropped because scikit-learn does
# not consult such an attribute.
# NOTE: `voting_clf` itself is intentionally not used for prediction. Its members
# expect different feature representations, so soft voting is done by averaging
# the members' probabilities in the following cells.
voting_clf.estimators = [
    ('nn', nn_model),
    ('svm', svm_model),
    ('rf', rf_model),
    ('lgb', lgb_model),
]

In [None]:
nn_pred_proba = nn_model.predict_proba(X_test_nn)
svm_pred_proba = svm_model.predict_proba(X_test_svm)
rf_pred_proba = rf_model.predict_proba(X_test_rf)
lgb_pred_proba = lgb_model.predict_proba(X_test_lgb)

In [None]:
avg_pred_proba = (nn_pred_proba + svm_pred_proba + rf_pred_proba + lgb_pred_proba) / 4
voting_pred = np.argmax(avg_pred_proba, axis=1)

In [None]:
voting_accuracy = accuracy_score(y_test, voting_pred)
voting_f1 = f1_score(y_test, voting_pred, average='weighted')

In [None]:
print(f"Voting Classifier - Accuracy: {voting_accuracy:.4f}, F1: {voting_f1:.4f}")

In [None]:
bagging_and_voting_results = pd.DataFrame({
    'Model': ['Voting Classifier', 'Bagging with SVM'],
    'Accuracy': [voting_accuracy, acc_svm],
    'F1 Score': [voting_f1, f1_svm]
})

In [None]:
comparison_df = pd.concat([base_results, bagging_and_voting_results])
comparison_df = comparison_df.sort_values('Accuracy', ascending=False)

In [None]:
plt.figure(figsize=(12, 8))
sns.barplot(x='Accuracy', y='Model', data=comparison_df)
plt.title('Model Accuracy Comparison')
plt.xlabel('Accuracy')
plt.tight_layout()
plt.show()

### 7. Stacking

In [None]:
# Level-0 learners for stacking: the same four configurations used for voting.
base_models = [
    ('nn', MLPClassifier(hidden_layer_sizes=(100, 50), activation='relu', random_state=SEED)),
    ('svm', SVC(probability=True, random_state=SEED)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=SEED)),
    ('lgb', lgb.LGBMClassifier(n_estimators=500, learning_rate=0.01, max_depth=8, colsample_bytree=0.65, min_child_samples=13, num_leaves=68, reg_alpha=10, reg_lambda=5.5, subsample=1, random_state=SEED))
]

In [None]:
# Level-1 (meta) learner.
meta_model = LogisticRegression(C=1.0, max_iter=1000, random_state=SEED)

In [None]:
# Per-model [train, test] feature matrices, keyed by the names used in `base_models`.
feature_sets = {
    'nn': [X_train_nn, X_test_nn],
    'svm': [X_train_svm, X_test_svm],
    'rf': [X_train_rf, X_test_rf],
    'lgb': [X_train_lgb, X_test_lgb]
}

In [None]:
# CustomStackingClassifier comes from the project-local `custom_stacking` module.
# NOTE(review): how it consumes `feature_sets` (and whether the X passed to
# fit/predict is used at all) is not visible in this file — confirm in that module.
stacking_clf = CustomStackingClassifier(
    base_models=base_models,
    meta_model=meta_model,
    feature_sets=feature_sets,
)

In [None]:
stacking_clf.fit(X_train, y_train)

In [None]:
stacking_pred = stacking_clf.predict(X_test)

In [None]:
# Same metrics as the other models: accuracy and weighted F1 on the held-out split.
stacking_accuracy = accuracy_score(y_test, stacking_pred)
stacking_f1 = f1_score(y_test, stacking_pred, average='weighted')

In [None]:
print(f"Stacking Classifier - Accuracy: {stacking_accuracy:.4f}, F1: {stacking_f1:.4f}")

In [None]:
stacking_results = pd.DataFrame({
    'Model': ['Stacking Classifier'],
    'Accuracy': [stacking_accuracy],
    'F1 Score': [stacking_f1]
})

In [None]:
comparison_df = pd.concat([comparison_df, stacking_results])
comparison_df = comparison_df.sort_values('Accuracy', ascending=False)

In [None]:
plt.figure(figsize=(12, 8))
sns.barplot(x='Accuracy', y='Model', data=comparison_df)
plt.title('Model Accuracy Comparison')
plt.xlabel('Accuracy')
plt.tight_layout()
plt.show()

In [None]:
best_model_name = comparison_df.iloc[comparison_df['Accuracy'].argmax()]['Model']
print(f"\nBest model based on accuracy: {best_model_name}")

In [None]:
if best_model_name == 'Stacking Classifier':
    best_preds = stacking_pred
elif best_model_name == 'Voting Classifier':
    best_preds = voting_pred
elif best_model_name == 'Bagging-SVM':
        best_preds = preds_svm
else:
    model_key = best_model_name
    best_preds = predictions[model_key]

In [None]:
cm = confusion_matrix(y_test, best_preds)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='rocket',
            xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_)
plt.title(f'Confusion Matrix - {best_model_name}')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

In [None]:
print("\nClassification Report for Best Model:")
print(classification_report(y_test, best_preds, target_names=label_encoder.classes_))

### 8. Classifying Mentalhealth Posts

In [None]:
# Build the per-model views of the unlabelled posts with the transformers fitted
# on the training split. The standardised matrix is computed once and shared.
X_mentalhealth_scaled = scaler.transform(X_mentalhealth)
X_mentalhealth_nn = X_mentalhealth_scaled.copy()
X_mentalhealth_svm = X_mentalhealth_scaled.copy()
X_mentalhealth_rf, X_mentalhealth_lgb = X_mentalhealth.copy(), X_mentalhealth.copy()
X_mentalhealth_lr = pca.transform(X_mentalhealth_scaled)

In [None]:
# Same layout as the training-time `feature_sets`, but with the mentalhealth
# matrices in the second slot (tuples here, lists there).
# NOTE(review): presumably only the second element is read at prediction time —
# confirm against CustomStackingClassifier in `custom_stacking`.
feature_sets_pred = {
    'nn': (X_train_nn, X_mentalhealth_nn),
    'svm': (X_train_svm, X_mentalhealth_svm),
    'rf': (X_train_rf, X_mentalhealth_rf),
    'lgb': (X_train_lgb, X_mentalhealth_lgb)
}

In [None]:
# Swap the feature sets on the already-fitted stacker, then predict labels and
# class probabilities for the mentalhealth posts.
stacking_clf.feature_sets = feature_sets_pred
mentalhealth_preds = stacking_clf.predict(X_mentalhealth)
mentalhealth_probs = stacking_clf.predict_proba(X_mentalhealth)

In [None]:
mentalhealth_df['predicted_label'] = mentalhealth_preds
mentalhealth_df['predicted_subreddit'] = label_encoder.inverse_transform(mentalhealth_preds)

In [None]:
for i, class_name in enumerate(label_encoder.classes_):
    mentalhealth_df[f'prob_{class_name}'] = mentalhealth_probs[:, i]

In [None]:
mentalhealth_df['confidence'] = mentalhealth_probs.max(axis=1)

In [None]:
plt.figure(figsize=(10, 6))
sns.countplot(y='predicted_subreddit', data=mentalhealth_df)
plt.title('Distribution of Predicted Subreddits for Mentalhealth Posts')
plt.xlabel('Count')
plt.ylabel('Predicted Subreddit')
plt.tight_layout()
plt.show()

In [None]:
predicted_counts = mentalhealth_df['predicted_subreddit'].value_counts()
predicted_percentages = predicted_counts / predicted_counts.sum() * 100

In [None]:
print("\nDistribution of mentalhealth posts into specific subreddits:")
for subreddit, percentage in predicted_percentages.items():
    print(f"{subreddit}: {percentage:.2f}%")

In [None]:
plt.figure(figsize=(10, 6))
sns.histplot(mentalhealth_df['confidence'], bins=20, kde=True)
plt.title('Confidence Distribution for Mentalhealth Post Classifications')
plt.xlabel('Confidence Level (Max Probability)')
plt.ylabel('Count')
plt.tight_layout()
plt.show()

In [None]:
high_confidence = mentalhealth_df[mentalhealth_df['confidence'] >= 0.8]
low_confidence = mentalhealth_df[mentalhealth_df['confidence'] < 0.5]

In [None]:
print(f"\nHigh confidence predictions (>=80%): {len(high_confidence)} posts ({len(high_confidence)/len(mentalhealth_df)*100:.2f}%)")
print(f"Low confidence predictions (<50%): {len(low_confidence)} posts ({len(low_confidence)/len(mentalhealth_df)*100:.2f}%)")

In [None]:
if len(high_confidence) > 0:
    plt.figure(figsize=(10, 6))
    sns.countplot(y='predicted_subreddit', data=high_confidence)
    plt.title('Distribution of High Confidence (>=80%) Predictions')
    plt.xlabel('Count')
    plt.ylabel('Predicted Subreddit')
    plt.tight_layout()
    plt.show()

In [None]:
# Write the classified posts (features, predictions, per-class probabilities and
# confidence) to `<parent of working directory>/classifications`.
SAVE_PATH = os.path.join(os.path.dirname(os.getcwd()), 'classifications', 'ensemble_classification.csv')
# Create the output folder on first run; `to_csv` does not create missing
# directories and would otherwise fail after all models have been trained.
os.makedirs(os.path.dirname(SAVE_PATH), exist_ok=True)
mentalhealth_df.to_csv(SAVE_PATH, index=False)