In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import pickle

# Load the dataset
data_path = 'DocAssist Dataset.xlsx'
data = pd.read_excel(data_path, sheet_name='in')

# Preprocessing: Handle missing values for numeric columns only
numeric_columns = data.select_dtypes(include=['number']).columns
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].mean())

# Selecting features and target
features = [
    'HAEMATOCRIT', 'HAEMOGLOBINS', 'ERYTHROCYTE', 'LEUCOCYTE', 
    'THROMBOCYTE', 'MCH', 'MCHC', 'MCV', 'AGE'
]
target = 'SOURCE'  # Assuming SOURCE is the target column

X = data[features]
y = data[target]

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Save the trained model to a file
model_file_path = 'random_forest_model.pkl'
with open(model_file_path, 'wb') as model_file:
    pickle.dump(rf_model, model_file)
print(f"Model saved to {model_file_path}")

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Add recommendations based on predictions
def recommend(source):
    if source == 1:
        return "Maintain a balanced diet and consult a doctor for regular checkups."
    elif source == 0:
        return "Your health parameters are stable. Continue with your current lifestyle."
    else:
        return "Consider a detailed medical examination for potential issues."

# Apply the model to the entire dataset
data['Predicted_SOURCE'] = rf_model.predict(X_scaled)
data['Recommendation'] = data['Predicted_SOURCE'].apply(recommend)


Model saved to random_forest_model.pkl
Accuracy: 73.41%

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.86      0.79       392
           1       0.73      0.55      0.63       270

    accuracy                           0.73       662
   macro avg       0.73      0.71      0.71       662
weighted avg       0.73      0.73      0.73       662



In [4]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import pickle

# Load the dataset
data_path = 'DocAssist Dataset.xlsx'
data = pd.read_excel(data_path, sheet_name='in')

# Preprocessing: Handle missing values for numeric columns only
numeric_columns = data.select_dtypes(include=['number']).columns
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].mean())

# Selecting features and target
features = [
    'HAEMATOCRIT', 'HAEMOGLOBINS', 'ERYTHROCYTE', 'LEUCOCYTE', 
    'THROMBOCYTE', 'MCH', 'MCHC', 'MCV', 'AGE'
]
target = 'SOURCE'  # Assuming SOURCE is the target column

X = data[features]
y = data[target]

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Save the scaler to a file
scaler_file_path = 'scaler.pkl'
with open(scaler_file_path, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)
print(f"Scaler saved to {scaler_file_path}")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Save the trained model to a file
model_file_path = 'random_forest_model.pkl'
with open(model_file_path, 'wb') as model_file:
    pickle.dump(rf_model, model_file)
print(f"Model saved to {model_file_path}")

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Add recommendations based on predictions
def recommend(source):
    if source == 1:
        return "Maintain a balanced diet and consult a doctor for regular checkups."
    elif source == 0:
        return "Your health parameters are stable. Continue with your current lifestyle."
    else:
        return "Consider a detailed medical examination for potential issues."

# Apply the model to the entire dataset
data['Predicted_SOURCE'] = rf_model.predict(X_scaled)
data['Recommendation'] = data['Predicted_SOURCE'].apply(recommend)

# Save sample testing and prediction to validate loading
sample_input = np.array([[40.5, 13.5, 4.7, 7.2, 250, 28.5, 32.5, 85.0, 35]])  # Example values
sample_input_scaled = scaler.transform(sample_input)
sample_prediction = rf_model.predict(sample_input_scaled)
sample_output = recommend(sample_prediction[0])

print("Sample Input Prediction:", sample_prediction[0])
print("Sample Recommendation:", sample_output)


Scaler saved to scaler.pkl
Model saved to random_forest_model.pkl
Accuracy: 73.41%

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.86      0.79       392
           1       0.73      0.55      0.63       270

    accuracy                           0.73       662
   macro avg       0.73      0.71      0.71       662
weighted avg       0.73      0.73      0.73       662

Sample Input Prediction: 0
Sample Recommendation: Your health parameters are stable. Continue with your current lifestyle.


In [4]:
!pip install scikit-learn==1.2.2
!pip install xgboost==1.7.5


Collecting scikit-learn==1.2.2
  Downloading scikit_learn-1.2.2-cp311-cp311-win_amd64.whl.metadata (11 kB)
Downloading scikit_learn-1.2.2-cp311-cp311-win_amd64.whl (8.3 MB)
   ---------------------------------------- 0.0/8.3 MB ? eta -:--:--
   --- ------------------------------------ 0.8/8.3 MB 5.6 MB/s eta 0:00:02
   ---------- ----------------------------- 2.1/8.3 MB 5.3 MB/s eta 0:00:02
   --------------- ------------------------ 3.1/8.3 MB 5.6 MB/s eta 0:00:01
   -------------------- ------------------- 4.2/8.3 MB 5.2 MB/s eta 0:00:01
   ---------------------- ----------------- 4.7/8.3 MB 5.1 MB/s eta 0:00:01
   -------------------------- ------------- 5.5/8.3 MB 4.6 MB/s eta 0:00:01
   ------------------------------ --------- 6.3/8.3 MB 4.3 MB/s eta 0:00:01
   ------------------------------- -------- 6.6/8.3 MB 4.2 MB/s eta 0:00:01
   ----------------------------------- ---- 7.3/8.3 MB 3.9 MB/s eta 0:00:01
   -------------------------------------- - 7.9/8.3 MB 4.0 MB/s eta 0:00:0

  You can safely remove it manually.
  You can safely remove it manually.


Collecting xgboost==1.7.5
  Downloading xgboost-1.7.5-py3-none-win_amd64.whl.metadata (1.9 kB)
Downloading xgboost-1.7.5-py3-none-win_amd64.whl (70.9 MB)
   ---------------------------------------- 0.0/70.9 MB ? eta -:--:--
   ---------------------------------------- 0.8/70.9 MB 5.6 MB/s eta 0:00:13
    --------------------------------------- 1.0/70.9 MB 4.6 MB/s eta 0:00:16
   - -------------------------------------- 2.1/70.9 MB 3.7 MB/s eta 0:00:19
   - -------------------------------------- 2.9/70.9 MB 3.7 MB/s eta 0:00:19
   -- ------------------------------------- 3.7/70.9 MB 3.9 MB/s eta 0:00:18
   -- ------------------------------------- 4.5/70.9 MB 3.8 MB/s eta 0:00:18
   -- ------------------------------------- 5.2/70.9 MB 4.0 MB/s eta 0:00:17
   --- ------------------------------------ 5.8/70.9 MB 3.8 MB/s eta 0:00:18
   --- ------------------------------------ 6.3/70.9 MB 3.5 MB/s eta 0:00:19
   --- ------------------------------------ 6.6/70.9 MB 3.4 MB/s eta 0:00:20
   ---

In [7]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Load the dataset
data_path = 'DocAssist Dataset.xlsx'
data = pd.read_excel(data_path, sheet_name='in')

# Preprocessing: Handle missing values for numeric columns only
numeric_columns = data.select_dtypes(include=['number']).columns
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].mean())

# Selecting features and target
features = [
    'HAEMATOCRIT', 'HAEMOGLOBINS', 'ERYTHROCYTE', 'LEUCOCYTE', 
    'THROMBOCYTE', 'MCH', 'MCHC', 'MCV', 'AGE'
]
target = 'SOURCE'  # Assuming SOURCE is the target column

X = data[features]
y = data[target]

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Hyperparameter tuning for Random Forest Classifier
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best model from grid search
best_rf_model = grid_search.best_estimator_

# Train the best model
best_rf_model.fit(X_train, y_train)

# Make predictions
y_pred = best_rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Add recommendations based on predictions
def recommend(source):
    if source == 1:
        return "Maintain a balanced diet and consult a doctor for regular checkups."
    elif source == 0:
        return "Your health parameters are stable. Continue with your current lifestyle."
    else:
        return "Consider a detailed medical examination for potential issues."

# Apply the model to the entire dataset
data['Predicted_SOURCE'] = best_rf_model.predict(X_scaled)
data['Recommendation'] = data['Predicted_SOURCE'].apply(recommend)


Accuracy: 73.72%

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.87      0.80       392
           1       0.74      0.54      0.63       270

    accuracy                           0.74       662
   macro avg       0.74      0.71      0.71       662
weighted avg       0.74      0.74      0.73       662



In [9]:
!pip install imbalanced-learn


Collecting imbalanced-learn
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting scikit-learn<2,>=1.3.2 (from imbalanced-learn)
  Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn)
  Downloading sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   -- ------------------------------------- 0.8/11.1 MB 4.2 MB/s eta 0:00:03
   ----- ---------------------------------- 1.6/11.1 MB 4.0 MB/s eta 0:00:03
   -------- ------------------------------- 2.4/11.1 MB 3.9 MB/s eta 0:00:03
   --------- ------------------------------ 2.6/11.1 MB 4.1 MB/s eta 0:00:03
   ------------- -------------------------- 3.7/11.1 MB 3.7 MB/s eta 0:00:03
   ---------------- ----------------------- 4.5/11.1 M

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

# Load the dataset
data_path = 'DocAssist Dataset.xlsx'
data = pd.read_excel(data_path, sheet_name='in')

# Preprocessing: Handle missing values for numeric columns only
numeric_columns = data.select_dtypes(include=['number']).columns
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].mean())

# Selecting features and target
features = [
    'HAEMATOCRIT', 'HAEMOGLOBINS', 'ERYTHROCYTE', 'LEUCOCYTE', 
    'THROMBOCYTE', 'MCH', 'MCHC', 'MCV', 'AGE'
]
target = 'SOURCE'  # Assuming SOURCE is the target column

X = data[features]
y = data[target]

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Apply SMOTE for over-sampling the minority class
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Hyperparameter tuning for XGBoost Classifier
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 6, 10],
    'subsample': [0.7, 0.8, 1.0]
}

xgb_model = XGBClassifier(random_state=42)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train_resampled, y_train_resampled)

# Best model from grid search
best_xgb_model = grid_search.best_estimator_

# Train the best model
best_xgb_model.fit(X_train_resampled, y_train_resampled)

# Make predictions
y_pred = best_xgb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Add recommendations based on predictions
def recommend(source):
    if source == 1:
        return "Maintain a balanced diet and consult a doctor for regular checkups."
    elif source == 0:
        return "Your health parameters are stable. Continue with your current lifestyle."
    else:
        return "Consider a detailed medical examination for potential issues."

# Apply the model to the entire dataset
data['Predicted_SOURCE'] = best_xgb_model.predict(X_scaled)
data['Recommendation'] = data['Predicted_SOURCE'].apply(recommend)


ModuleNotFoundError: No module named 'sklearn.neighbors'

In [9]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import numpy as np

# Load the dataset
data_path = 'DocAssist Dataset.xlsx'
data = pd.read_excel(data_path, sheet_name='in')

# Preprocessing: Handle missing values for numeric columns only
numeric_columns = data.select_dtypes(include=['number']).columns
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].mean())

# Selecting features and target
features = [
    'HAEMATOCRIT', 'HAEMOGLOBINS', 'ERYTHROCYTE', 'LEUCOCYTE', 
    'THROMBOCYTE', 'MCH', 'MCHC', 'MCV', 'AGE'
]
target = 'SOURCE'  # Assuming SOURCE is the target column

X = data[features]
y = data[target]

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Hyperparameter tuning for Gradient Boosting Classifier
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

gb_model = GradientBoostingClassifier(random_state=42)
grid_search = GridSearchCV(estimator=gb_model, param_grid=param_grid, cv=3, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best model from grid search
best_gb_model = grid_search.best_estimator_

# Train the best model
best_gb_model.fit(X_train, y_train)

# Make predictions
y_pred = best_gb_model.predict(X_test)
y_pred_prob = best_gb_model.predict_proba(X_test)[:, 1]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"ROC-AUC Score: {roc_auc:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Add recommendations based on predictions
def recommend(source):
    if source == 1:
        return "Maintain a balanced diet and consult a doctor for regular checkups."
    elif source == 0:
        return "Your health parameters are stable. Continue with your current lifestyle."
    else:
        return "Consider a detailed medical examination for potential issues."

# Apply the model to the entire dataset
data['Predicted_SOURCE'] = best_gb_model.predict(X_scaled)
data['Recommendation'] = data['Predicted_SOURCE'].apply(recommend)


Accuracy: 72.21%
ROC-AUC Score: 0.76

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.84      0.78       392
           1       0.70      0.55      0.62       270

    accuracy                           0.72       662
   macro avg       0.72      0.69      0.70       662
weighted avg       0.72      0.72      0.71       662



In [12]:
import pandas as pd
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, f1_score
import numpy as np

# Load the dataset
data_path = 'DocAssist Dataset.xlsx'
data = pd.read_excel(data_path, sheet_name='in')

# Preprocessing: Handle missing values for numeric columns only
numeric_columns = data.select_dtypes(include=['number']).columns
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].mean())

# Selecting features and target
features = [
    'HAEMATOCRIT', 'HAEMOGLOBINS', 'ERYTHROCYTE', 'LEUCOCYTE', 
    'THROMBOCYTE', 'MCH', 'MCHC', 'MCV', 'AGE'
]
target = 'SOURCE'  # Assuming SOURCE is the target column

X = data[features]
y = data[target]

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Hyperparameter tuning for HistGradientBoostingClassifier
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_iter': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_leaf': [1, 5, 10]
}

hgb_model = HistGradientBoostingClassifier(random_state=42)
grid_search = GridSearchCV(estimator=hgb_model, param_grid=param_grid, cv=3, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best model from grid search
best_hgb_model = grid_search.best_estimator_

# Train the best model
best_hgb_model.fit(X_train, y_train)

# Make predictions
y_pred = best_hgb_model.predict(X_test)
y_pred_prob = best_hgb_model.predict_proba(X_test)[:, 1]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"ROC-AUC Score: {roc_auc:.2f}")
print(f"F1 Score: {f1:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Add recommendations based on predictions
def recommend(source):
    if source == 1:
        return "Maintain a balanced diet and consult a doctor for regular checkups."
    elif source == 0:
        return "Your health parameters are stable. Continue with your current lifestyle."
    else:
        return "Consider a detailed medical examination for potential issues."

# Apply the model to the entire dataset
data['Predicted_SOURCE'] = best_hgb_model.predict(X_scaled)
data['Recommendation'] = data['Predicted_SOURCE'].apply(recommend)


Accuracy: 73.56%
ROC-AUC Score: 0.77
F1 Score: 0.73

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.87      0.80       392
           1       0.74      0.54      0.63       270

    accuracy                           0.74       662
   macro avg       0.74      0.71      0.71       662
weighted avg       0.74      0.74      0.73       662



In [3]:
!pip uninstall scikit-learn

^C


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.utils import resample
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
import joblib  # To save the model and scaler

# Load the dataset
data_path = 'DocAssist Dataset.xlsx'
data = pd.read_excel(data_path, sheet_name='in')

# Preprocessing: Handle missing values for numeric columns only
numeric_columns = data.select_dtypes(include=['number']).columns
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].mean())

# Selecting features and target
features = [
    'HAEMATOCRIT', 'HAEMOGLOBINS', 'ERYTHROCYTE', 'LEUCOCYTE', 
    'THROMBOCYTE', 'MCH', 'MCHC', 'MCV', 'AGE'
]
target = 'SOURCE'  # Assuming SOURCE is the target column

X = data[features]
y = data[target]

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Apply SMOTE for over-sampling the minority class
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Data duplication (oversampling the minority class by duplicating examples)
minority_class = data[data[target] == 1]
majority_class = data[data[target] == 0]

# Duplicate minority class data to further augment
minority_class_duplicated = resample(minority_class,
                                     replace=True,    # Sample with replacement
                                     n_samples=len(majority_class), # Match majority class size
                                     random_state=42)

# Combine the duplicated minority class with the majority class
data_duplicated = pd.concat([majority_class, minority_class_duplicated])

# Split again after duplication
X_duplicated = data_duplicated[features]
y_duplicated = data_duplicated[target]

# Standardize the features again
X_duplicated_scaled = scaler.fit_transform(X_duplicated)

# Split the data into training and testing sets again
X_train, X_test, y_train, y_test = train_test_split(X_duplicated_scaled, y_duplicated, test_size=0.2, random_state=42)

# Define base models for stacking
xgb_model = XGBClassifier(random_state=42, n_estimators=100)
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
svc_model = SVC(random_state=42, probability=True)
mlp_model = MLPClassifier(random_state=42, max_iter=500)

# Create a stacking model
stacking_model = StackingClassifier(estimators=[
    ('xgb', xgb_model),
    ('rf', rf_model),
    ('svc', svc_model),
    ('mlp', mlp_model)
], final_estimator=LogisticRegression())

# Randomized Search for hyperparameters
param_dist = {
    'xgb__learning_rate': [0.01, 0.05, 0.1],
    'xgb__max_depth': [3, 6, 10],
    'xgb__subsample': [0.7, 0.8, 1.0],
    'xgb__colsample_bytree': [0.7, 0.8, 1.0],
    'rf__max_depth': [5, 10, None],
    'rf__min_samples_split': [2, 5],
    'svc__C': [0.1, 1, 10],
    'svc__kernel': ['linear', 'rbf'],
    'mlp__hidden_layer_sizes': [(50,), (100,), (150,)],
    'mlp__activation': ['relu', 'tanh']
}

# Perform randomized search with 3-fold cross-validation
random_search = RandomizedSearchCV(estimator=stacking_model, param_distributions=param_dist, 
                                   n_iter=30, cv=3, n_jobs=-1, scoring='accuracy', random_state=42)

# Fit the model
random_search.fit(X_train, y_train)

# Best model from random search
best_model = random_search.best_estimator_

# Make predictions
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# ROC-AUC Score for binary classification (if applicable)
if len(np.unique(y)) == 2:  # Ensure y has two unique classes for ROC-AUC calculation
    roc_auc = roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])
    print(f"ROC-AUC Score: {roc_auc:.2f}")

# Adjust threshold if needed
threshold = 0.4  # Adjust this as needed
y_pred_prob = best_model.predict_proba(X_test)[:, 1]
y_pred_adjusted = (y_pred_prob > threshold).astype(int)
print(f"Adjusted Accuracy: {accuracy_score(y_test, y_pred_adjusted) * 100:.2f}%")
print("\nAdjusted Classification Report:\n", classification_report(y_test, y_pred_adjusted))

# Apply the best model to the entire dataset for predictions and recommendations
data['Predicted_SOURCE'] = best_model.predict(X_scaled)

def recommend(source):
    if source == 1:
        return "Maintain a balanced diet and consult a doctor for regular checkups."
    elif source == 0:
        return "Your health parameters are stable. Continue with your current lifestyle."
    else:
        return "Consider a detailed medical examination for potential issues."

data['Recommendation'] = data['Predicted_SOURCE'].apply(recommend)

# Optionally, save the dataset with recommendations to a new file
# data.to_excel('/kaggle/working/DocAssist_with_recommendations.xlsx', index=False)

# Save the model and scaler to files
joblib.dump(best_model, 'stacking_model.pkl')  # Save the best model
joblib.dump(scaler, 'scaler.pkl')  # Save the scaler

print("Model and scaler have been saved successfully!")




Accuracy: 87.70%

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.87      0.88       424
           1       0.86      0.89      0.87       373

    accuracy                           0.88       797
   macro avg       0.88      0.88      0.88       797
weighted avg       0.88      0.88      0.88       797

ROC-AUC Score: 0.94
Adjusted Accuracy: 86.20%

Adjusted Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.83      0.86       424
           1       0.82      0.90      0.86       373

    accuracy                           0.86       797
   macro avg       0.86      0.86      0.86       797
weighted avg       0.87      0.86      0.86       797

Model and scaler have been saved successfully!
