In [None]:
import os
import re
import pandas as pd
import numpy as np

## Read Data

In [None]:
# Load the training and validation feature tables (first CSV column is the index).
# Missing values are replaced with 0 and all column labels are coerced to strings.
df = pd.read_csv('features.csv', index_col=0).fillna(0)
df.columns = df.columns.astype(str)

df_validation = pd.read_csv('validation_features.csv', index_col=0).fillna(0)
df_validation.columns = df_validation.columns.astype(str)

In [None]:
# Report the dimensions of the training and validation frames, then the column names.
for caption, frame in (('Training set shape:', df), ('Validation set shape:', df_validation)):
    print(caption, frame.shape)
print()
print('Features:', df.columns)


In [None]:
# Text + label subsets used by the BERT baseline.
# Explicit copies are taken because a 'BERT_Embedding' column is assigned to
# df_validation_bert further down; assigning into a plain column subset makes
# pandas emit a SettingWithCopyWarning.
df_bert = df[['Cleaned Content', 'Gender']].copy()
df_validation_bert = df_validation[['Cleaned Content', 'Gender']].copy()

In [None]:
# Keep the identifying columns (plus label and text) aside for later look-ups.
df_id = df.loc[:, ['ID', 'Gender', 'Mapped Name', 'Cleaned Content']]
df_validation_id = df_validation.loc[:, ['ID', 'Gender', 'Mapped Name', 'Cleaned Content']]

In [None]:
# Remove the identifier and raw-text columns, leaving the label and feature columns.
# Re-binding the name (instead of inplace=True) together with errors='ignore'
# keeps the cell idempotent: running it twice no longer raises a KeyError.
df = df.drop(columns=['ID', 'Mapped Name', 'Cleaned Content'], errors='ignore')
df_validation = df_validation.drop(columns=['ID', 'Mapped Name', 'Cleaned Content'], errors='ignore')

In [None]:
# Add a numeric version of the label (F -> 0, M -> 1) to both frames.
for frame in (df, df_validation):
    frame['Gender_mapped'] = frame['Gender'].map({'F': 0, 'M': 1})

In [None]:
# Every column except the two label columns is treated as a feature.
all_features = df.drop(columns=['Gender', 'Gender_mapped']).columns.tolist()

## Descriptive Statistics

In [None]:
import pandas as pd

# Summary statistics of every column, computed separately per gender.
grouped_stats = df.groupby('Gender').describe()

# describe() yields (column, statistic) column pairs; collapse them to "column_statistic".
grouped_stats.columns = ['_'.join(pair).strip() for pair in grouped_stats.columns.values]

# Optional export, left disabled:
# grouped_stats.to_csv('grouped_descriptive_statistics.csv')
# print("Descriptive statistics have been exported to 'grouped_descriptive_statistics.csv'.")

# Show one table per feature, with the statistics in describe() order.
stat_names = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
for feature in all_features:
    print(f"\nDescriptive statistics for {feature} by Gender:")
    display_stats = grouped_stats[[f"{feature}_{stat}" for stat in stat_names]]
    display(display_stats)

### Correlation Test

In [None]:
# Spearman rank correlation between all remaining columns once the
# string label column 'Gender' has been removed.
df_corr = df.drop(columns='Gender')
correlation_matrix = df_corr.corr(method='spearman')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Full annotated heatmap of the correlation matrix (very large canvas so the annotations stay legible).
fig, ax = plt.subplots(figsize=(50, 50))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix Heatmap')
plt.show()

In [None]:
# Show only strongly correlated cells

# Define the correlation threshold
threshold = 0.7

# Keep |rho| >= threshold, excluding only the diagonal (a column with itself).
# The diagonal is masked by position rather than by comparing against 1.0, so
# that two *different* features with a perfect correlation remain visible.
abs_corr = correlation_matrix.abs()
off_diagonal = ~np.eye(len(correlation_matrix), dtype=bool)
high_corr = correlation_matrix[(abs_corr >= threshold) & off_diagonal]

# Drop rows and columns where all values are NaN (no correlation above the threshold)
high_corr = high_corr.dropna(axis=0, how='all').dropna(axis=1, how='all')

# Plot the filtered correlation matrix
plt.figure(figsize=(12, 10))  # Adjust the figsize as needed
sns.heatmap(high_corr, annot=True, cmap='coolwarm', center=0, linewidths=0.5)
plt.title('High Correlation Matrix Heatmap')
plt.show()

### Mann-Whitney U Test

In [None]:
# Check if features are normally distributed
import pandas as pd
from scipy.stats import shapiro

# Shapiro-Wilk p-value of every feature, in feature order
shapiro_results = [(feature, shapiro(df[feature])[1]) for feature in all_features]

# A feature counts as normally distributed when p >= 0.05; everything else
# goes to the non-normal list.
normal_features = [feature for feature, p_value in shapiro_results if p_value >= 0.05]
non_normal_features = [feature for feature, p_value in shapiro_results if not p_value >= 0.05]

# Print the results
print('Number of normally distributed features:', len(normal_features))        

# The individual feature names can be listed from `normal_features` and
# `non_normal_features` when needed.

# ALL features are non-normally distributed, therefore we use the Mann-Whitney U test

In [None]:
# Test the shapes to see if they're similar (Optional)
import pandas as pd
from scipy.stats import ks_2samp

# Compare the male and female distribution of every feature with a
# two-sample Kolmogorov-Smirnov test.
male_df = df[df['Gender'] == 'M']
female_df = df[df['Gender'] == 'F']

# Lists to store results
similar_shape_features = []
non_similar_shape_features = []

# Iterate over the feature columns only. Looping over df.columns would also
# test 'Gender_mapped', which is constant within each gender group and is a
# label, not a feature.
for feature in all_features:
    stat, p_value = ks_2samp(male_df[feature], female_df[feature])

    # p >= 0.05: no evidence that the two distributions differ
    if p_value >= 0.05:
        similar_shape_features.append(feature)
    else:
        non_similar_shape_features.append(feature)

# Print the results
print("Features with Similar Shape:")
for feature in similar_shape_features:
    print(f" - {feature}")

print("\nFeatures with Non-Similar Shape:")
for feature in non_similar_shape_features:
    print(f" - {feature}")

In [None]:
# One sub-frame per gender group
male_df = df.loc[df['Gender'].eq('M')]
female_df = df.loc[df['Gender'].eq('F')]

# Optional exports for analysis in R, left disabled:
# male_df.to_csv('male_df_for_r.csv')
# female_df.to_csv('female_df_for_r.csv')

In [None]:
from scipy.stats import mannwhitneyu, norm

# Result dictionaries keyed by feature name
significant_features = {}
insignificant_features = {}

# z-value for a 95% confidence level
z = norm.ppf(1 - 0.05 / 2)

for feature in all_features:
    female_values = female_df[feature]
    male_values = male_df[feature]

    # Mann-Whitney U test, female sample first
    u_stat, p_value = mannwhitneyu(female_values, male_values)

    # Effect size (rank-biserial correlation) derived from U and the group sizes
    n1 = len(female_values)
    n2 = len(male_values)
    rank_biserial = ((2 * u_stat) / (n1 * n2)) - 1

    # Interval around the effect size; the SE uses the absolute effect size
    se = np.sqrt((abs(rank_biserial) * (1 - abs(rank_biserial))) / (n1 * n2))
    ci_lower = rank_biserial - z * se
    ci_upper = rank_biserial + z * se

    result = {
        'u_stat': u_stat,
        'p_value': p_value,
        'effect_size': rank_biserial,
        'ci_lower': ci_lower,
        'ci_upper': ci_upper
    }

    # p < 0.05 marks the feature as significant
    bucket = significant_features if p_value < 0.05 else insignificant_features
    bucket[feature] = result

print('Number of significant features:', len(significant_features))
print('Number of insignificant features:', len(insignificant_features))

sig_feature_list = list(significant_features.keys())

In [None]:
def format_number(value):
    """Format a number as a short string, choosing the precision by magnitude.

    * |value| < 0.01       -> scientific notation with two decimals
    * |value| >= 1         -> one decimal place
    * otherwise            -> two decimal places

    The magnitude (not the signed value) selects the branch, so negative
    numbers are formatted the same way as their positive counterparts.

    Args:
        value: A real number (int or float).

    Returns:
        str: The formatted number.
    """
    magnitude = abs(value)
    if magnitude < 0.01:  # Very small numbers (including 0) use scientific notation
        return f"{value:.2e}"
    if magnitude >= 1:  # One decimal place for large magnitudes
        return f"{value:.1f}"
    return f"{value:.2f}"  # Two decimal places for medium-sized numbers

In [None]:
# One row per significant feature, one column per statistic, formatted as strings
significant_features_df = pd.DataFrame(significant_features).transpose()
significant_features_df = significant_features_df.applymap(format_number)

# A bare `.head()` in the middle of a cell is silently discarded, so the
# preview is rendered explicitly.
display(significant_features_df.head())

significant_features_df.to_csv('significant_features.csv')

In [None]:
# Get statistics of central tendency

def get_stat(df, feature_list):
    """Compute central-tendency and spread statistics for selected columns.

    Args:
        df: DataFrame holding the columns named in ``feature_list``.
        feature_list: Iterable of column names to summarise.

    Returns:
        pandas.DataFrame: One row per feature with the columns ``mean``,
        ``SD`` (sample standard deviation, ddof=1), ``median`` and ``IQR``
        (75th minus 25th percentile). An empty ``feature_list`` yields an
        empty DataFrame.
    """
    statistics_dict = {}

    for feature in feature_list:
        data = df[feature]
        statistics_dict[feature] = {
            'mean': np.mean(data),
            'SD': np.std(data, ddof=1),
            'median': np.median(data),
            'IQR': np.percentile(data, 75) - np.percentile(data, 25),
        }

    # Build the frame once, after the loop: this avoids rebuilding it on every
    # iteration and keeps the result defined when feature_list is empty.
    return pd.DataFrame(statistics_dict).transpose()


In [None]:
sig_feature_list = list(significant_features)

# Female group: statistics of the significant features, formatted for display
female_stat_df = get_stat(female_df, sig_feature_list).applymap(format_number)
display(female_stat_df.head())

# Male group: same statistics
male_stat_df = get_stat(male_df, sig_feature_list).applymap(format_number)
display(male_stat_df.head())

# Write both tables to CSV
female_stat_df.to_csv('female_stat.csv')
male_stat_df.to_csv('male_stat.csv')

In [None]:
# Same summary as above, but over all features rather than only the significant ones.

# Female group
female_stat_df = get_stat(female_df, all_features).applymap(format_number)
display(female_stat_df.head())

# Male group
male_stat_df = get_stat(male_df, all_features).applymap(format_number)
display(male_stat_df.head())

# Write both tables to CSV
female_stat_df.to_csv('female_stat_all.csv')
male_stat_df.to_csv('male_stat_all.csv')

In [None]:
from scipy.stats import mannwhitneyu, norm

# Mann-Whitney U statistics for ALL features (significant or not), keyed by feature name
features_stat = {}

# z-value for a 95% confidence level; constant, so computed once outside the loop
z = norm.ppf(1 - 0.05 / 2)

for feature in all_features:

    # Perform the Mann-Whitney U test (female sample first)
    u_stat, p_value = mannwhitneyu(female_df[feature], male_df[feature])

    # Calculate the effect size (Rank-Biserial Correlation)
    n1 = len(female_df[feature])
    n2 = len(male_df[feature])
    rank_biserial = ((2 * u_stat) / (n1 * n2)) - 1

    # Use the absolute value of the effect size for the SE calculation
    se = np.sqrt((abs(rank_biserial) * (1 - abs(rank_biserial))) / (n1 * n2))
    ci_lower = rank_biserial - z * se
    ci_upper = rank_biserial + z * se

    # Store results
    features_stat[feature] = {
        'u_stat': u_stat,
        'p_value': p_value,
        'effect_size': rank_biserial,
        'ci_lower': ci_lower,
        'ci_upper': ci_upper
    }

features_df = pd.DataFrame(features_stat).transpose()
features_df = features_df.applymap(format_number)

# A bare `.head()` in the middle of a cell is silently discarded, so the
# preview is rendered explicitly.
display(features_df.head())

features_df.to_csv('features_stat.csv')

In [None]:
# Names of the features whose Mann-Whitney test was not significant;
# they are candidates for exclusion during feature selection.
insig_features_list = [*insignificant_features]

print('Number of Insignificant Features:', len(insig_features_list))
insig_features_list

## Feature Selection

In [None]:
# Features with an IQR of 0
extra_features = ['F9 EX', 'F10 FW', 'F13 JJR', 'F14 JJS', 'F26 RBR', 'F29 SYM', 'F38 WDT', 'F48 ;']

# Exclusion list, de-duplicated. Sources, in order:
#   1. correlation test  -> 'F2 Character Count'
#   2. Mann-Whitney test -> insignificant features
#   3. zero IQR          -> extra_features
#
# Not applied:
#   - removing 'F51 Formality Score'
#   - removing the least important features (gave no better result)
features_exclude = list(set(['F2 Character Count'] + insig_features_list + extra_features))
print('Number of total features to exclude:', len(features_exclude))

In [None]:
# Keep every feature that is not on the exclusion list
selected_features = list(set(all_features) - set(features_exclude))

print('Number of Selected Features:', len(selected_features))

# Order the names by the number that follows the leading 'F' (e.g. 'F12 ...' -> 12)
selected_features.sort(key=lambda name: int(name.split()[0][1:]))
selected_features

## Baseline BERT Model

In [None]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
from imblearn.under_sampling import RandomUnderSampler

# Text inputs and numeric labels (F -> 0, M -> 1) for the BERT baseline
X = df_bert['Cleaned Content']
y = df_bert['Gender'].map({'F': 0, 'M': 1})

# Under sample the data before converting to BERT embeddings, so embeddings are
# only computed for the rows that are kept. The sampler is given the texts as a
# single-column 2D array, hence the reshape.
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X.values.reshape(-1, 1), y)

# Flatten the resampled data back to a 1D array
X_resampled = X_resampled.flatten()

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(text):
    """Return the BERT embedding of the first ([CLS]) token for one text.

    The text is tokenized (truncated to at most 512 tokens) and passed through
    the module-level ``bert_model`` with gradient tracking disabled.

    Args:
        text: The input text.

    Returns:
        numpy.ndarray: The first-token vector of the last hidden state,
        with singleton dimensions squeezed out.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        last_hidden = bert_model(**encoded).last_hidden_state
    # Position 0 of the sequence is the [CLS] token
    return last_hidden[:, 0, :].squeeze().numpy()

# Embed every resampled training text
bert_embeddings = list(map(get_bert_embeddings, X_resampled))

# One row per text, indexed like the resampled labels so the two stay aligned
bert_embeddings_df = pd.DataFrame(bert_embeddings, index=y_resampled.index)

# Preview
print(bert_embeddings_df.head())

In [None]:
# Embed every validation text and store the vector next to it
df_validation_bert['BERT_Embedding'] = df_validation_bert['Cleaned Content'].apply(get_bert_embeddings)

# Expand the stored vectors into one column per embedding dimension,
# indexed like the validation frame
bert_embeddings_df_val = pd.DataFrame(
    df_validation_bert['BERT_Embedding'].tolist(),
    index=df_validation.index,
)

# Preview
display(bert_embeddings_df_val.head())

#### Validation

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler

# Training data: BERT embeddings of the under-sampled texts and their labels
X_train = bert_embeddings_df
y_train = y_resampled

# Class balance of the training labels
print("Distribution in y_train:")
print(y_train.value_counts())

# Logistic regression on top of the embeddings
model = LogisticRegression(max_iter=10000, random_state=42)
model.fit(X_train, y_train)

# Validation data: embeddings of the validation texts and their numeric labels
X_val = bert_embeddings_df_val
y_val = df_validation['Gender'].map({'F': 0, 'M': 1})

y_pred = model.predict(X_val)

# Evaluation on the validation set
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Classification Report:", classification_report(y_val, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_val, y_pred), sep="\n")

## Random Forest

#### Base Model

In [None]:
# Under-sampling before feature selection
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# Training data: selected features and the numeric gender label
X_train = df[selected_features]
y_train = df['Gender_mapped']

# Class balance before resampling
print("Distribution in y_train:")
print(y_train.value_counts())

# Balance the classes by random under-sampling
rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

# Class balance after resampling
print("Distribution in y_train_resampled:")
print(y_train_resampled.value_counts())

# Random forest fitted on the balanced training data
rf_classifier = RandomForestClassifier(
    class_weight='balanced',
    max_depth=20,
    min_samples_leaf=4,
    random_state=42,
)
rf_classifier.fit(X_train_resampled, y_train_resampled)

# Validation data, restricted to the same features
X_val = df_validation[selected_features]
y_val = df_validation['Gender_mapped']

y_pred = rf_classifier.predict(X_val)

# Evaluation on the validation set
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Classification Report:", classification_report(y_val, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_val, y_pred), sep="\n")

##### Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search for the random forest trained on the under-sampled data.
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the classifier
rf_classifier = RandomForestClassifier(class_weight='balanced', random_state=42)

# Exhaustive search, 3-fold cross-validation, scored by macro-averaged F1
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=3, n_jobs=-1, scoring='f1_macro')

# Fit the model
grid_search.fit(X_train_resampled, y_train_resampled)

# Get the best estimator. GridSearchCV (refit=True by default) has already
# refitted it on the full training data, so no additional fit() call is needed.
best_rf_classifier = grid_search.best_estimator_

# Previously found best estimator:
# RandomForestClassifier(class_weight='balanced', max_depth=20,
#                        min_samples_leaf=4, random_state=42)

##### Cross Validation

In [None]:
# cross_val_score is not imported anywhere else in the notebook; without this
# import the cell fails with a NameError on a fresh kernel.
from sklearn.model_selection import cross_val_score

# Perform 5-fold cross-validation on the best model (accuracy)
cv_scores = cross_val_score(best_rf_classifier, X_train_resampled, y_train_resampled, cv=5, scoring='accuracy')

# Output the cross-validation scores and the mean score
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean():.4f}")

# Perform 5-fold cross-validation and calculate F1 score
f1_scores = cross_val_score(best_rf_classifier, X_train_resampled, y_train_resampled, cv=5, scoring='f1')

# Output the cross-validation F1 scores and the mean F1 score
print(f"Cross-validation F1 scores: {f1_scores}")
print(f"Mean cross-validation F1 score: {f1_scores.mean():.4f}")

##### Validation

In [None]:
# Validation data, restricted to the selected features
X_val = df_validation[selected_features]
y_val = df_validation['Gender_mapped']

# Predictions of the grid-searched model
y_pred = best_rf_classifier.predict(X_val)

# Evaluation on the validation set
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Classification Report:", classification_report(y_val, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_val, y_pred), sep="\n")

### Sampling method

#### Under Sampling

In [None]:
# Under-sampling
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# Training data: selected features and the string gender label ('F' / 'M')
X_train = df[selected_features]
y_train = df['Gender']

num_features = X_train.shape[1]

print('Number of Features:', num_features)

# Class balance before resampling
print("Distribution in y_train:")
print(y_train.value_counts())

# Balance the classes by random under-sampling
rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

# Class balance after resampling
print("Distribution in y_train_resampled:")
print(y_train_resampled.value_counts())

# Random forest fitted on the balanced training data
under_rf_classifier = RandomForestClassifier(
    class_weight='balanced',
    max_depth=30,
    min_samples_split=10,
    n_estimators=300,
    random_state=42,
)
under_rf_classifier.fit(X_train_resampled, y_train_resampled)

# Validation data with the same features and the string label
X_val = df_validation[selected_features]
y_val = df_validation['Gender']

y_pred = under_rf_classifier.predict(X_val)

# Evaluation on the validation set
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Classification Report:", classification_report(y_val, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_val, y_pred), sep="\n")

## Threshold Selection

### ROC Curve

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

In [None]:
# Class probabilities of the under-sampled random forest on the validation set.
# Column 1 is used as the positive-class probability.
val_class_probabilities = under_rf_classifier.predict_proba(X_val)
y_prob = val_class_probabilities[:, 1]
y_prob

In [None]:
# Numeric version of the validation labels (F -> 0, M -> 1)
y_val_mapped = y_val.map({'F': 0, 'M': 1})

# Probability of the 'M' class for every validation row
y_prob = under_rf_classifier.predict_proba(X_val)[:, 1]

# ROC curve and the area under it
fpr, tpr, thresholds = roc_curve(y_val_mapped, y_prob)
roc_auc = auc(fpr, tpr)

# ROC curve plus the chance diagonal for reference
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='blue', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], color='gray', linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()

In [None]:
# ROC operating points for every candidate threshold
fpr, tpr, thresholds = roc_curve(y_val_mapped, y_prob)

# Youden's J statistic (TPR - FPR) per threshold; the best threshold maximizes it
j_scores = tpr - fpr
best_threshold_index = np.argmax(j_scores)
best_threshold = thresholds[best_threshold_index]

print(f'Best Threshold: {best_threshold}')

### KDE Plot

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from scipy.optimize import brentq

In [None]:
y_true = df_validation['Gender_mapped']

# Predicted probabilities split by the actual label (female=0, male=1)
female_probs = y_prob[y_true == 0]
male_probs = y_prob[y_true == 1]

fig, ax = plt.subplots(figsize=(12, 8))

# One filled density curve per gender: females first, then males
for probs, colour, group in ((female_probs, 'skyblue', 'Female'), (male_probs, 'orange', 'Male')):
    sns.kdeplot(probs, fill=True, color=colour, label=group, bw_adjust=1.0, ax=ax)

# Labeling and formatting
ax.set_xlabel('Predicted Probability')
ax.set_ylabel('Density')
ax.set_title('KDE Plot of Predicted Probabilities by Gender')
ax.legend(loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# Search for the intersection point

# Gaussian kernel density estimate per class (0 = female, 1 = male)
kde_0, kde_1 = (stats.gaussian_kde(sample) for sample in (female_probs, male_probs))

# Evaluation grid over the probability range [0, 1]
x = np.linspace(0, 1, 1000)

# Density of each class on the grid
kde_0_vals, kde_1_vals = kde_0(x), kde_1(x)

# Find the intersection point
def find_intersection(kde1, kde2, x_range):
    """Locate a point where two density functions are equal.

    Brent's method is applied to ``kde1(x) - kde2(x)`` on the interval from
    ``x_range[0]`` to ``x_range[-1]``.

    Args:
        kde1: Callable evaluating the first density at a point.
        kde2: Callable evaluating the second density at a point.
        x_range: Sequence whose first and last elements bracket the root.

    Returns:
        float: A root of ``kde1(x) - kde2(x)`` inside the bracket.

    Raises:
        ValueError: Raised by ``brentq`` when the difference has the same sign
            at both ends of the bracket.
    """
    def density_gap(x):
        # Density callables may return a length-1 array for a scalar input;
        # brentq needs a plain scalar, so extract it explicitly instead of
        # relying on implicit array-to-float conversion.
        return float(np.ravel(kde1(x) - kde2(x))[0])

    return brentq(density_gap, x_range[0], x_range[-1])

# Crossing point of the two class densities on [0, 1], used as a candidate decision threshold
intersection = find_intersection(kde1=kde_0, kde2=kde_1, x_range=[0, 1])

print(f'Intersection Point (Threshold): {intersection:.4f}')

In [None]:
# Same KDE plot as before, with the chosen decision threshold marked
y_true = df_validation['Gender_mapped']

# Predicted probabilities split by the actual label (female=0, male=1)
female_probs = y_prob[y_true == 0]
male_probs = y_prob[y_true == 1]

fig, ax = plt.subplots(figsize=(12, 8))

# One filled density curve per gender: females first, then males
for probs, colour, group in ((female_probs, 'skyblue', 'Female'), (male_probs, 'orange', 'Male')):
    sns.kdeplot(probs, fill=True, color=colour, label=group, bw_adjust=1.0, ax=ax)

# Vertical line at the decision threshold
ax.axvline(x=0.47, color='black', linestyle='--', label='Decision Threshold (0.47)')

# Labeling and formatting
ax.set_xlabel('Predicted Probability')
ax.set_ylabel('Density')
ax.set_title('KDE Plot of Predicted Probabilities by Gender')
ax.legend(loc='upper left')
plt.tight_layout()
plt.show()

## Final RF Model

After feature selection and threshold selection, predict again on the validation set using the final model.

In [None]:
# Validation features and numeric labels
X_val = df_validation[selected_features]
y_val = df_validation['Gender_mapped']

# Column 1 of predict_proba is used as the positive-class (male) probability
y_pred_proba = under_rf_classifier.predict_proba(X_val)[:, 1]

# Apply the custom decision threshold: probabilities at or above it become class 1
threshold = 0.47
meets_threshold = y_pred_proba >= threshold
y_pred_final = meets_threshold.astype(int)

# Evaluation with the custom threshold
print("Accuracy with threshold 0.47:", accuracy_score(y_val, y_pred_final))
print("Classification Report with threshold 0.47:", classification_report(y_val, y_pred_final), sep="\n")
print("Confusion Matrix with threshold 0.47:", confusion_matrix(y_val, y_pred_final), sep="\n")

## Model Evaluation

### SHAP

In [None]:
import shap

# Tree-based SHAP explainer for the trained under-sampled random forest
explainer = shap.TreeExplainer(under_rf_classifier)

# Calculate SHAP values for the validation set
shap_values = explainer.shap_values(X_val)

# Slice out index 1 of the last axis (used below as the male class).
# NOTE(review): this assumes shap_values is a 3-D array laid out as
# (samples, features, classes); the return layout depends on the installed
# shap version - confirm.
shap_values_class_1 = shap_values[:, :, 1]

In [None]:
# Beeswarm ("dot") summary of the class-1 (male) SHAP values, limited to the top 16 features
shap.summary_plot(
    shap_values_class_1,
    X_val,
    feature_names=selected_features,
    plot_type="dot",
    max_display=16,
)

In [None]:
import numpy as np
import pandas as pd

# Mean absolute SHAP value per feature, separately for class 0 and class 1
mean_abs_shap_values_class_0, mean_abs_shap_values_class_1 = (
    np.abs(shap_values[:, :, class_index]).mean(axis=0) for class_index in (0, 1)
)

# Overall importance: average of the two per-class values
mean_abs_shap_values = (mean_abs_shap_values_class_0 + mean_abs_shap_values_class_1) / 2

# Feature-importance table, most important feature first
feature_importance_df = pd.DataFrame({
    'Feature': X_val.columns,
    'Mean Absolute SHAP Value': mean_abs_shap_values
}).sort_values(by='Mean Absolute SHAP Value', ascending=False)

# Show the 16 most important features
print("Top Significant Features Based on SHAP Values:")
print(feature_importance_df.head(16))

In [None]:
# Names of the first 16 rows of the importance table
shap_features = feature_importance_df['Feature'].iloc[:16].tolist()

In [None]:
# Persist the SHAP feature-importance table to a CSV file in the working directory
feature_importance_df.to_csv('shap_feature_importance.csv')

In [None]:
# Cumulative share of the total mean |SHAP| value, following the current row order
# (the table was sorted by descending importance when it was built)
feature_importance_df['Cumulative Importance'] = (
    feature_importance_df['Mean Absolute SHAP Value'].cumsum()
    / feature_importance_df['Mean Absolute SHAP Value'].sum()
)

# Determine number of features to reach 80% cumulative importance
threshold = 0.80

# The features strictly below the threshold have not reached it yet; the next
# feature is the one that crosses it, so it has to be counted as well
# (capped at the total number of features).
cumulative_importance = feature_importance_df['Cumulative Importance']
n_top_features = min(int((cumulative_importance < threshold).sum()) + 1, len(cumulative_importance))

print(f"Number of features required to reach {threshold*100}% cumulative importance: {n_top_features}")

## Error Analysis

### Confusion Matrix

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import matplotlib.colors as mcolors

# Confusion matrix of the thresholded predictions; uses y_val and y_pred_final
# from the final-model cell (rows = true class, columns = predicted class)
cm = confusion_matrix(y_val, y_pred_final)
labels = ['Female', 'Male']

# Normalize each row so every cell shows the share of its true class
conf_matrix_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

# Create the figure and axis
plt.figure(figsize=(8, 8))

custom_cmap = mcolors.LinearSegmentedColormap.from_list("custom_blues", ['#deebf7', '#2171b5', '#08306b'], N=256)

# Create the heatmap with square cells (the cell texts are drawn manually below)
ax = sns.heatmap(conf_matrix_normalized, annot=False, fmt='.2%', cmap=custom_cmap, xticklabels=labels, yticklabels=labels, square=True)

# Annotate the heatmap with percentages, counts, and classification labels (TP, TN, FP, FN)
label_annotations = [['TN', 'FP'], ['FN', 'TP']]  # Define the labels

for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        color = 'white' if i == j else 'black'  # Use white text for TN and TP
        # Add a separate text for the label (TN, TP, FN, FP) with bold and larger font
        plt.text(
            j + 0.5,
            i + 0.4,
            f'{label_annotations[i][j]}',
            ha='center',
            va='center',
            color=color,
            fontsize=14,  # Increase font size for the label
            fontweight='bold'  # Make the label bold
        )
        # Percentage and raw count, placed just below the label
        plt.text(
            j + 0.5,
            i + 0.5,
            f'\n{conf_matrix_normalized[i, j]:.1%}\n(n={cm[i, j]:,})',
            ha='center',
            va='center',
            color=color,
            fontsize=12  # General font size for text
        )


# The heatmap mesh and the color bar associated with it
mesh = ax.collections[0]
colorbar = mesh.colorbar

# Update the color bar to show percentages
colorbar.set_ticks([0.3, 0.4, 0.5, 0.6, 0.7])  # Set tick positions in terms of the normalized values
colorbar.set_ticklabels(['30%', '40%', '50%', '60%', '70%'])  # Set the labels to percentage format

# Add labels and title
plt.xlabel('Predicted Class')
plt.ylabel('True Class')
plt.title('Confusion Matrix Heatmap')

# Retrieve the cell patches from the heatmap
patches = mesh.get_paths()

# The per-cell face colors are normally only computed when the figure is drawn.
# Force the data -> color mapping now; otherwise get_facecolor() still holds a
# single default color and indexing it per cell fails.
mesh.update_scalarmappable()

# Extract the color for each cell (row-major order)
cell_colors = mesh.get_facecolor()
colors = [cell_colors[i] for i in range(len(patches))]

# Map the colors to the corresponding labels
color_map = {
    'TN': colors[0],  # Top-left
    'FP': colors[1],  # Top-right
    'FN': colors[2],  # Bottom-left
    'TP': colors[3]   # Bottom-right
}

# Custom legend-like explanation with square markers using the extracted colors
legend_elements = [
    plt.Line2D([0], [0], color=color_map['TN'], marker='s', markersize=10, linestyle='None', label='TN: True Negative\n(Females correctly classified as Females)'),
    plt.Line2D([0], [0], color=color_map['FP'], marker='s', markersize=10, linestyle='None', label='FP: False Positive\n(Females incorrectly classified as Males)'),
    plt.Line2D([0], [0], color=color_map['FN'], marker='s', markersize=10, linestyle='None', label='FN: False Negative\n(Males incorrectly classified as Females)'),
    plt.Line2D([0], [0], color=color_map['TP'], marker='s', markersize=10, linestyle='None', label='TP: True Positive\n(Males correctly classified as Males)')
]


# Position the custom legend to the right of the plot
plt.figlegend(handles=legend_elements, loc='center left', bbox_to_anchor=(0.9, 0.5), borderaxespad=0., frameon=False, labelspacing=2)

# Fit the axes into the left 90% of the figure; the legend is anchored at
# x = 0.9. (A rect of [0, 0, 0, 0] describes a zero-area rectangle, which
# cannot hold the axes.)
plt.tight_layout(rect=[0, 0, 0.9, 1])

plt.savefig('cm_heatmap.png', dpi=300, bbox_inches='tight')

# Display the plot
plt.show()

### Plot of important features

In [None]:
# Predicted classes from the final thresholded model. The variable defined by
# the final-model cell is `y_pred_final`; `y_final_pred` does not exist and
# raised a NameError.
predicted_classes = y_pred_final

df_validation_class = df_validation.copy()

# Attach the predictions to a copy of the validation frame
df_validation_class['Predicted Gender'] = predicted_classes


# Classify a single (actual, predicted) pair into one of the four confusion-matrix cells
def classify_confusion_matrix(actual, predicted):
    """Return the confusion-matrix cell for one prediction.

    Class 1 is the positive class and class 0 the negative class.

    Args:
        actual: True label (0 or 1).
        predicted: Predicted label (0 or 1).

    Returns:
        str: 'TP', 'TN', 'FP' or 'FN'; 'Unknown' for any other combination
        of values.
    """
    outcomes = {
        (1, 1): 'TP',
        (0, 0): 'TN',
        (0, 1): 'FP',
        (1, 0): 'FN',
    }
    return outcomes.get((actual, predicted), 'Unknown')

# Label every validation row with its confusion-matrix cell
row_classes = df_validation_class.apply(
    lambda row: classify_confusion_matrix(row['Gender_mapped'], row['Predicted Gender']),
    axis=1,
)
df_validation_class['Confusion Matrix Class'] = row_classes

# Split the validation frame into one sub-frame per cell
has_class = df_validation_class['Confusion Matrix Class'].eq

df_validation_tp = df_validation_class[has_class('TP')]  # actual Male (1), predicted Male (1)
df_validation_tn = df_validation_class[has_class('TN')]  # actual Female (0), predicted Female (0)
df_validation_fp = df_validation_class[has_class('FP')]  # actual Female (0), predicted Male (1)
df_validation_fn = df_validation_class[has_class('FN')]  # actual Male (1), predicted Female (0)


In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Work on a copy so the unscaled values in `df_validation_class` stay available
df_validation_class_standardised = df_validation_class.copy()

# Alternative kept for reference: min-max normalisation to [0, 1]
# scaler = MinMaxScaler(feature_range=(0, 1))
# df_validation_class_normalised[most_important_features] = scaler.fit_transform(df_validation_class[most_important_features])

# Standardise the columns listed in `shap_features` (mean 0, standard deviation 1)
scaler = StandardScaler()
df_validation_class_standardised[shap_features] = scaler.fit_transform(df_validation_class[shap_features])

# Mean standardised value of every feature, per confusion-matrix class
class_of_row = df_validation_class_standardised['Confusion Matrix Class']
mean_values = {}

for label in ['TP', 'TN', 'FP', 'FN']:
    mean_values[label] = df_validation_class_standardised.loc[class_of_row == label, shap_features].mean()

# Features as rows, confusion-matrix classes as columns
mean_df = pd.DataFrame(mean_values)

# Mean values grouped by the ACTUAL label rather than by the prediction
actual_label = df_validation_class_standardised['Gender_mapped']

# Actual positives (Gender_mapped == 1)
mean_positives = df_validation_class_standardised.loc[actual_label == 1, shap_features].mean()

# Actual negatives (Gender_mapped == 0)
mean_negatives = df_validation_class_standardised.loc[actual_label == 0, shap_features].mean()

In [None]:
import matplotlib.pyplot as plt

# Transpose so every confusion-matrix class is a row and every feature a column
mean_df_transposed = mean_df.T
feature_names = mean_df_transposed.columns

plt.figure(figsize=(22, 10))

# Classes to draw, in plotting order
categories_to_plot = ['TP', 'TN', 'FP', 'FN']

# Solid lines for correct predictions, dashed lines for misclassifications
line_styles = dict(TN='-', TP='-', FN='--', FP='--')

# One colour per class
colors = dict(TN='red', TP='green', FN='orange', FP='royalblue')

# Every class is fully opaque in this figure
alphas = dict(TN=1, TP=1, FN=1, FP=1)

# Plot the mean standardised feature values of each class
for category in categories_to_plot:
    plt.plot(
        feature_names,
        mean_df_transposed.loc[category],
        marker='o',
        linestyle=line_styles[category],
        color=colors[category],
        alpha=alphas[category],
        label=category,
    )

# Larger fonts for axis labels, ticks and legend
plt.xlabel('Features', fontsize=20)
plt.ylabel('Mean Standardised Feature Value', fontsize=20)
# plt.title('Mean Standardised Feature Values of All Classes', fontsize=22)
plt.xticks(rotation=45, ha='right', fontsize=16)  # slanted so long feature names stay readable
plt.yticks(fontsize=16)
plt.legend(fontsize=16)
plt.tight_layout()  # prevents clipping of the rotated tick labels
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Transpose so every confusion-matrix class is a row and every feature a column
mean_df_transposed = mean_df.T

plt.figure(figsize=(22, 10))

# All four classes are drawn; TP and FP are faded so that TN and FN stand out
categories_to_plot = ['TP', 'TN', 'FP', 'FN']

# Solid lines for correct predictions, dashed lines for misclassifications
line_styles = {
    'TN': '-',  # Solid line for True Negative
    'TP': '-',  # Solid line for True Positive
    'FN': '--', # Dashed line for False Negative
    'FP': '--'  # Dashed line for False Positive
}

# One colour per class
colors = {
    'TN': 'red',
    'TP': 'green',
    'FN': 'orange',
    'FP': 'royalblue'
}

# Opacity per class: the highlighted pair (TN, FN) is opaque, the other pair is faded
alphas = {
    'TN': 1,
    'TP': 0.15,
    'FN': 1,
    'FP': 0.15
}

# Plot the mean standardised feature values of each class
for category in categories_to_plot:
    plt.plot(mean_df_transposed.columns, mean_df_transposed.loc[category], 
             marker='o', linestyle=line_styles[category], color=colors[category], label=category, alpha=alphas[category])

plt.xlabel('Features', fontsize=20)
plt.ylabel('Mean Standardised Feature Value', fontsize=20)
plt.xticks(rotation=45, ha='right', fontsize=16)  # Rotate feature names so they stay readable
plt.yticks(fontsize=16)  # Set fontsize for y-axis ticks

# Legend restricted to the highlighted classes (TN and FN). This must be the only
# legend call in the cell: a later bare plt.legend(fontsize=16) replaced the
# filtered legend with one listing all four lines.
handles, labels = plt.gca().get_legend_handles_labels()
selected_handles_labels = [(h, l) for h, l in zip(handles, labels) if l in ['TN', 'FN']]
if selected_handles_labels:
    handles, labels = zip(*selected_handles_labels)
    plt.legend(handles, labels, fontsize=16)

plt.tight_layout()  # Adjusts layout to prevent clipping of tick-labels
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Transpose so every confusion-matrix class is a row and every feature a column
mean_df_transposed = mean_df.T

plt.figure(figsize=(22, 10))

# All four classes are drawn; TN and FN are faded so that TP and FP stand out
categories_to_plot = ['TP', 'TN', 'FP', 'FN']

# Solid lines for correct predictions, dashed lines for misclassifications
line_styles = {
    'TN': '-',  # Solid line for True Negative
    'TP': '-',  # Solid line for True Positive
    'FN': '--', # Dashed line for False Negative
    'FP': '--'  # Dashed line for False Positive
}

# One colour per class
colors = {
    'TN': 'red',
    'TP': 'green',
    'FN': 'orange',
    'FP': 'royalblue'
}

# Opacity per class: the highlighted pair (TP, FP) is opaque, the other pair is faded
alphas = {
    'TN': 0.15,
    'TP': 1,
    'FN': 0.15,
    'FP': 1
}

# Plot the mean standardised feature values of each class
for category in categories_to_plot:
    plt.plot(mean_df_transposed.columns, mean_df_transposed.loc[category], 
             marker='o', linestyle=line_styles[category], color=colors[category], label=category, alpha=alphas[category])

plt.xlabel('Features', fontsize=20)
plt.ylabel('Mean Standardised Feature Value', fontsize=20)
plt.xticks(rotation=45, ha='right', fontsize=16)  # Rotate feature names so they stay readable
plt.yticks(fontsize=16)  # Set fontsize for y-axis ticks

# Legend restricted to the highlighted classes (TP and FP). This must be the only
# legend call in the cell: a later bare plt.legend(fontsize=16) replaced the
# filtered legend with one listing all four lines.
handles, labels = plt.gca().get_legend_handles_labels()
selected_handles_labels = [(h, l) for h, l in zip(handles, labels) if l in ['TP', 'FP']]
if selected_handles_labels:
    handles, labels = zip(*selected_handles_labels)
    plt.legend(handles, labels, fontsize=16)

plt.tight_layout()  # Adjusts layout to prevent clipping of tick-labels
plt.show()

In [None]:
# Run `get_stat` on each confusion-matrix sub-frame for the columns in `shap_features`
tp_feature, tn_feature, fp_feature, fn_feature = (
    get_stat(class_frame, shap_features)
    for class_frame in (df_validation_tp, df_validation_tn, df_validation_fp, df_validation_fn)
)

In [None]:
# A notebook cell renders only its last bare expression, so the TP and TN tables
# were silently discarded (and FN was never shown). Display each table explicitly.
for class_name, class_stats in [('TP', tp_feature), ('TN', tn_feature),
                                ('FP', fp_feature), ('FN', fn_feature)]:
    print(f"{class_name} feature statistics:")
    display(class_stats.applymap(format_number))

## Other Models

### Lasso Regression

In [None]:
# NOTE(review): within this part of the notebook `coefficients` is only assigned in
# the "Coefficients" cell further down, so on a fresh kernel (Restart & Run All) this
# cell raises NameError unless an earlier section defines it — confirm, then move
# this cell below that assignment or remove it.
coefficients

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Standardise the features; the scaler is fitted on the training data only
scaler = StandardScaler()
X_train_resampled_normalized = scaler.fit_transform(X_train_resampled)

# Convert target variable 'Gender' to numeric (F -> 0, M -> 1)
y_train_resampled_encoded = y_train_resampled.map({'F': 0, 'M': 1})

# L1-penalised ("lasso-style") logistic regression.
# C is the INVERSE regularisation strength: larger C means a weaker penalty.
lasso = LogisticRegression(C=10, penalty='l1', random_state=42, solver='liblinear')

# Train the model
lasso.fit(X_train_resampled_normalized, y_train_resampled_encoded)

# Build the validation features and labels
X_val = df_validation.drop(features_exclude, axis=1)
# Apply the scaling learned on the training data. Re-fitting the scaler here
# (fit_transform) would scale validation data with its own mean/std, which is
# inconsistent with what the model was trained on.
X_val_normalized = scaler.transform(X_val)
y_val = df_validation['Gender']
y_val_encoded = y_val.map({'F': 0, 'M': 1})

# Predict on the validation set
y_val_pred = lasso.predict(X_val_normalized)

# LogisticRegression.predict already returns class labels (0/1), so the 0.5
# threshold does not change any value; it is kept so `y_val_pred_binary` stays an int array.
y_val_pred_binary = (y_val_pred > 0.5).astype(int)

# Evaluate the model
print("Accuracy:", accuracy_score(y_val_encoded, y_val_pred_binary))
print("Classification Report:\n", classification_report(y_val_encoded, y_val_pred_binary))
print("Confusion Matrix:\n", confusion_matrix(y_val_encoded, y_val_pred_binary))

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit an L1-penalised logistic regression with the default regularisation strength
lasso = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)
lasso.fit(X_train_resampled_normalized, y_train_resampled_encoded)

# Probability of the positive class (second column of predict_proba)
y_pred_proba = lasso.predict_proba(X_val_normalized)[:, 1]

# Validation features alongside the true label and the predicted probability
test_results = X_val.assign(Actual=y_val_encoded, Predicted_Prob=y_pred_proba)

print(test_results.head())

In [None]:
# Fit an L1-penalised logistic regression with the default regularisation strength
lasso = LogisticRegression(penalty='l1', solver='liblinear', random_state=42).fit(
    X_train_resampled_normalized, y_train_resampled_encoded
)

# Class predictions for the validation set
y_val_pred = lasso.predict(X_val_normalized)

# Threshold the predictions at 0.5 and cast them to int
y_val_pred_binary = (y_val_pred > 0.5).astype(int)

# Evaluate against the encoded validation labels
evaluation_pair = (y_val_encoded, y_val_pred_binary)
print("Accuracy:", accuracy_score(*evaluation_pair))
print("Classification Report:\n", classification_report(*evaluation_pair))
print("Confusion Matrix:\n", confusion_matrix(*evaluation_pair))

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Define the parameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Inverse of regularization strength
    'penalty': ['l1'],  # Lasso (L1) penalty
    'solver': ['liblinear']  # Solver that supports L1 regularization
}

# Initialize the logistic regression model
logistic = LogisticRegression(random_state=42)

# 5-fold grid search over C, scored on accuracy
grid_search = GridSearchCV(logistic, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit GridSearchCV
grid_search.fit(X_train_resampled_normalized, y_train_resampled_encoded)

# Best parameters
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# GridSearchCV refits the best configuration on the full training data by default
# (refit=True), so `best_estimator_` is already trained; the extra .fit() call that
# used to follow repeated the same work.
best_logistic = grid_search.best_estimator_

#### Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.model_selection import cross_validate

# Evaluate accuracy and F1 in a single 5-fold run. Two separate cross_val_score
# calls fitted the model on the same folds twice; cross_validate scores every
# metric from one set of fits (the same approach as the Gradient Boosting section).
logistic_cv_results = cross_validate(best_logistic, X_train_resampled_normalized, y_train_resampled_encoded,
                                     cv=5, scoring=['accuracy', 'f1'])
cv_scores = logistic_cv_results['test_accuracy']
f1_scores = logistic_cv_results['test_f1']

# Output the cross-validation scores and the mean score
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean():.4f}")

# Output the cross-validation F1 scores and the mean F1 score
print(f"Cross-validation F1 scores: {f1_scores}")
print(f"Mean cross-validation F1 score: {f1_scores.mean():.4f}")

#### Validation

In [None]:
# Class predictions of the tuned model for the validation set
y_val_pred = best_logistic.predict(X_val_normalized)

# Threshold the predictions at 0.5 and cast them to int
y_val_pred_binary = (y_val_pred > 0.5).astype(int)

# Print each metric under its heading
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(heading, metric(y_val_encoded, y_val_pred_binary))

#### Coefficients

In [None]:
# Coefficient vector of the fitted model (first row of coef_)
coefficients = best_logistic.coef_[0]

# Pair every coefficient with a column name from X_train, largest coefficient first
feature_importances = (
    pd.DataFrame({'Feature': X_train.columns, 'Coefficient': coefficients})
    .sort_values(by='Coefficient', ascending=False)
)

# Display the coefficients
print("Lasso Regression Coefficients:")
print(feature_importances)

In [None]:
import pandas as pd

# Uses `X` and `y`, both defined outside this cell (presumably the full feature
# frame and its labels — verify).
# NOTE(review): the assignment below mutates `X` in place, so from here on `X`
# carries the target column; any later use of `X` as a feature matrix would include
# the label. The next cell reads X['Gender'], so it depends on this mutation.
X['Gender'] = y  # Attach the labels to X as a 'Gender' column for the group-by below

# Calculate summary statistics for Formality Score, per gender
formality_stats = X.groupby('Gender')['F10 Formality Score'].describe()

# Display summary statistics
print("Summary Statistics for Formality Score:")
print(formality_stats)

In [None]:
from sklearn.preprocessing import StandardScaler

# Extract the formality score and gender columns. The explicit copy makes
# `formality_scores` an independent frame, so adding a column below neither
# triggers SettingWithCopyWarning nor risks writing back into `X`.
formality_scores = X[['F10 Formality Score', 'Gender']].copy()

# Standardize the formality scores (mean 0, standard deviation 1 over all rows)
scaler = StandardScaler()
formality_scores['F10 Formality Score Standardized'] = scaler.fit_transform(formality_scores[['F10 Formality Score']])

# Calculate summary statistics for the standardized formality scores, per gender
standardized_stats = formality_scores.groupby('Gender')['F10 Formality Score Standardized'].describe()

# Display summary statistics
print("Summary Statistics for Standardized Formality Score:")
print(standardized_stats)

### Gradient Boosting Machines

In [None]:
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error


# Baseline gradient-boosting classifier with fixed hyperparameters
gbm = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
).fit(X_train_resampled, y_train_resampled)

# Predictions for the validation features
y_pred = gbm.predict(X_val)

# Evaluate the model
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Classification Report:", classification_report(y_val, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_val, y_pred), sep="\n")


#### Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

# Hyperparameter grid: 3 values for each of 6 parameters
param_grid = dict(
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[100, 200, 500],
    max_depth=[3, 4, 5],
    subsample=[0.7, 0.8, 1.0],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 3, 5],
)

# 5-fold grid search scored on accuracy, using all available cores
gbm = GradientBoostingClassifier(random_state=42)
grid_search = GridSearchCV(
    gbm,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_resampled, y_train_resampled)

print(f"Best parameters found: {grid_search.best_params_}")

# Result recorded from a previous run:
# Best parameters found: {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 100, 'subsample': 0.7}

#### Cross validation

In [None]:
from sklearn.model_selection import cross_validate

# Build the tuned model here instead of relying on `gbm_best`, which is only created
# in the "Validation" cell further down (NameError on a fresh top-to-bottom run).
# cross_validate clones the estimator for every fold, so an unfitted model is enough.
# Hyperparameters are the best ones recorded from the grid search.
gbm_cv_model = GradientBoostingClassifier(
    learning_rate=0.1,
    max_depth=5,
    min_samples_leaf=5,
    min_samples_split=2,
    n_estimators=100,
    subsample=0.7,
    random_state=42
)

# The built-in 'f1' scorer treats the label 1 as the positive class, so score on the
# numeric encoding (F -> 0, M -> 1) — the same mapping the Lasso section applies to
# `y_train_resampled`.
y_train_resampled_cv = y_train_resampled.map({'F': 0, 'M': 1})

# Perform cross-validation and get multiple scores
cv_results = cross_validate(gbm_cv_model, X_train_resampled, y_train_resampled_cv, cv=5, 
                            scoring=['accuracy', 'f1'], return_train_score=False)

# Output the cross-validation scores
print(f"Cross-validation accuracy scores: {cv_results['test_accuracy']}")
print(f"Mean cross-validation accuracy score: {cv_results['test_accuracy'].mean():.4f}")

print(f"Cross-validation F1 scores: {cv_results['test_f1']}")
print(f"Mean cross-validation F1 score: {cv_results['test_f1'].mean():.4f}")

#### Validation

In [None]:
# Hyperparameters recorded as the best result of the grid search
best_gbm_params = dict(
    learning_rate=0.1,
    max_depth=5,
    min_samples_leaf=5,
    min_samples_split=2,
    n_estimators=100,
    subsample=0.7,
    random_state=42,
)

# Train the tuned model on the resampled training data
gbm_best = GradientBoostingClassifier(**best_gbm_params)
gbm_best.fit(X_train_resampled, y_train_resampled)

# Predictions for the validation features
y_pred = gbm_best.predict(X_val)

# Evaluate the model
print('Gradient Boosting Classifier')
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Classification Report:", classification_report(y_val, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_val, y_pred), sep="\n")