In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import pickle

# Load the dataset
data_path = 'DocAssist Dataset.xlsx'
data = pd.read_excel(data_path, sheet_name='in')

# Selecting features and target
features = [
    'HAEMATOCRIT', 'HAEMOGLOBINS', 'ERYTHROCYTE', 'LEUCOCYTE',
    'THROMBOCYTE', 'MCH', 'MCHC', 'MCV', 'AGE'
]
target = 'SOURCE'  # Assuming SOURCE is the target column

# Rows without a label are dropped rather than mean-imputed: the mean of the
# class labels is not itself a valid class.
data = data.dropna(subset=[target]).reset_index(drop=True)

# Choose the train/test rows BEFORE computing any preprocessing statistics so
# that nothing learned from the test rows leaks into training. The shuffle
# depends only on the row count and the seed, so splitting row positions picks
# the same rows as splitting the feature matrix with these settings.
row_positions = np.arange(len(data))
train_rows, test_rows = train_test_split(row_positions, test_size=0.2, random_state=42)

# Preprocessing: fill missing feature values with the training-row means only
feature_means = data[features].iloc[train_rows].mean()
data[features] = data[features].fillna(feature_means)

X = data[features]
y = data[target]

# Standardize the features using statistics fitted on the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X.iloc[train_rows])
X_test = scaler.transform(X.iloc[test_rows])
X_scaled = scaler.transform(X)  # every row, scaled with the training statistics
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

# Fit a 100-tree random forest; the fixed seed keeps the run repeatable
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Persist the fitted forest to disk
model_file_path = 'random_forest_model.pkl'
with open(model_file_path, 'wb') as model_file:
    pickle.dump(rf_model, model_file)
print(f"Model saved to {model_file_path}")

# Score the held-out rows and report accuracy plus the per-class breakdown
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n", report_text)

# Add recommendations based on predictions
def recommend(source):
    """Return the fixed advice string for a predicted SOURCE label.

    A value equal to 1 yields the diet/checkup advice, a value equal to 0
    yields the "stable" message, and any other value yields the generic
    "detailed medical examination" message.
    """
    if source == 1:
        return "Maintain a balanced diet and consult a doctor for regular checkups."
    if source == 0:
        return "Your health parameters are stable. Continue with your current lifestyle."
    # Neither of the two known labels.
    return "Consider a detailed medical examination for potential issues."

# Predict a label for every row of the dataset, then attach the matching advice text
all_row_predictions = rf_model.predict(X_scaled)
data['Predicted_SOURCE'] = all_row_predictions
data['Recommendation'] = data['Predicted_SOURCE'].map(recommend)


Model saved to random_forest_model.pkl
Accuracy: 73.41%

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.86      0.79       392
           1       0.73      0.55      0.63       270

    accuracy                           0.73       662
   macro avg       0.73      0.71      0.71       662
weighted avg       0.73      0.73      0.73       662



In [4]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import pickle

# Load the dataset
data_path = 'DocAssist Dataset.xlsx'
data = pd.read_excel(data_path, sheet_name='in')

# Selecting features and target
features = [
    'HAEMATOCRIT', 'HAEMOGLOBINS', 'ERYTHROCYTE', 'LEUCOCYTE',
    'THROMBOCYTE', 'MCH', 'MCHC', 'MCV', 'AGE'
]
target = 'SOURCE'  # Assuming SOURCE is the target column

# Rows without a label are dropped rather than mean-imputed: the mean of the
# class labels is not itself a valid class.
data = data.dropna(subset=[target]).reset_index(drop=True)

# Choose the train/test rows BEFORE computing any preprocessing statistics so
# that nothing learned from the test rows leaks into training. The shuffle
# depends only on the row count and the seed, so splitting row positions picks
# the same rows as splitting the feature matrix with these settings.
row_positions = np.arange(len(data))
train_rows, test_rows = train_test_split(row_positions, test_size=0.2, random_state=42)

# Preprocessing: fill missing feature values with the training-row means only
feature_means = data[features].iloc[train_rows].mean()
data[features] = data[features].fillna(feature_means)

X = data[features]
y = data[target]

# Standardize the features using statistics fitted on the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X.iloc[train_rows])
X_test = scaler.transform(X.iloc[test_rows])
X_scaled = scaler.transform(X)  # every row, scaled with the training statistics
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

# Save the scaler to a file (only after it has been fitted on the training rows)
scaler_file_path = 'scaler.pkl'
with open(scaler_file_path, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)
print(f"Scaler saved to {scaler_file_path}")

# Fit a 100-tree random forest; the fixed seed keeps the run repeatable
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Persist the fitted forest to disk
model_file_path = 'random_forest_model.pkl'
with open(model_file_path, 'wb') as model_file:
    pickle.dump(rf_model, model_file)
print(f"Model saved to {model_file_path}")

# Score the held-out rows and report accuracy plus the per-class breakdown
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n", report_text)

# Add recommendations based on predictions
def recommend(source):
    """Return the fixed advice string for a predicted SOURCE label.

    A value equal to 1 yields the diet/checkup advice, a value equal to 0
    yields the "stable" message, and any other value yields the generic
    "detailed medical examination" message.
    """
    if source == 1:
        return "Maintain a balanced diet and consult a doctor for regular checkups."
    if source == 0:
        return "Your health parameters are stable. Continue with your current lifestyle."
    # Neither of the two known labels.
    return "Consider a detailed medical examination for potential issues."

# Predict a label for every row of the dataset, then attach the matching advice text
all_row_predictions = rf_model.predict(X_scaled)
data['Predicted_SOURCE'] = all_row_predictions
data['Recommendation'] = data['Predicted_SOURCE'].map(recommend)

# Validate the saved artifacts by reloading them and running one sample through
# the same scale -> predict -> recommend path used above.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote itself.
with open(scaler_file_path, 'rb') as scaler_file:
    loaded_scaler = pickle.load(scaler_file)
with open(model_file_path, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

sample_input = np.array([[40.5, 13.5, 4.7, 7.2, 250, 28.5, 32.5, 85.0, 35]])  # Example values
# The scaler was fitted on a DataFrame, so label the sample with the same
# column names; this ties each value to its feature and avoids the
# feature-name mismatch warning raised for bare arrays.
sample_frame = pd.DataFrame(sample_input, columns=features)
sample_input_scaled = loaded_scaler.transform(sample_frame)
sample_prediction = loaded_model.predict(sample_input_scaled)
sample_output = recommend(sample_prediction[0])

print("Sample Input Prediction:", sample_prediction[0])
print("Sample Recommendation:", sample_output)


Scaler saved to scaler.pkl
Model saved to random_forest_model.pkl
Accuracy: 73.41%

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.86      0.79       392
           1       0.73      0.55      0.63       270

    accuracy                           0.73       662
   macro avg       0.73      0.71      0.71       662
weighted avg       0.73      0.73      0.73       662

Sample Input Prediction: 0
Sample Recommendation: Your health parameters are stable. Continue with your current lifestyle.


In [7]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Load the dataset
data_path = 'DocAssist Dataset.xlsx'
data = pd.read_excel(data_path, sheet_name='in')

# Selecting features and target
features = [
    'HAEMATOCRIT', 'HAEMOGLOBINS', 'ERYTHROCYTE', 'LEUCOCYTE',
    'THROMBOCYTE', 'MCH', 'MCHC', 'MCV', 'AGE'
]
target = 'SOURCE'  # Assuming SOURCE is the target column

# Rows without a label are dropped rather than mean-imputed: the mean of the
# class labels is not itself a valid class.
data = data.dropna(subset=[target]).reset_index(drop=True)

# Choose the train/test rows BEFORE computing any preprocessing statistics so
# that nothing learned from the test rows leaks into training. The shuffle
# depends only on the row count and the seed, so splitting row positions picks
# the same rows as splitting the feature matrix with these settings.
row_positions = np.arange(len(data))
train_rows, test_rows = train_test_split(row_positions, test_size=0.2, random_state=42)

# Preprocessing: fill missing feature values with the training-row means only
feature_means = data[features].iloc[train_rows].mean()
data[features] = data[features].fillna(feature_means)

X = data[features]
y = data[target]

# Standardize the features using statistics fitted on the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X.iloc[train_rows])
X_test = scaler.transform(X.iloc[test_rows])
X_scaled = scaler.transform(X)  # every row, scaled with the training statistics
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

# Hyperparameter tuning for Random Forest Classifier
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive search over the grid with 3-fold cross-validation on the training rows
rf_model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best model from grid search. GridSearchCV defaults to refit=True, so
# best_estimator_ has already been refitted on all of X_train with the winning
# parameters; fitting it a second time would only repeat that work.
best_rf_model = grid_search.best_estimator_

# Make predictions
y_pred = best_rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Add recommendations based on predictions
def recommend(source):
    """Return the fixed advice string for a predicted SOURCE label.

    A value equal to 1 yields the diet/checkup advice, a value equal to 0
    yields the "stable" message, and any other value yields the generic
    "detailed medical examination" message.
    """
    if source == 1:
        return "Maintain a balanced diet and consult a doctor for regular checkups."
    if source == 0:
        return "Your health parameters are stable. Continue with your current lifestyle."
    # Neither of the two known labels.
    return "Consider a detailed medical examination for potential issues."

# Predict a label for every row of the dataset, then attach the matching advice text
all_row_predictions = best_rf_model.predict(X_scaled)
data['Predicted_SOURCE'] = all_row_predictions
data['Recommendation'] = data['Predicted_SOURCE'].map(recommend)


Accuracy: 73.72%

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.87      0.80       392
           1       0.74      0.54      0.63       270

    accuracy                           0.74       662
   macro avg       0.74      0.71      0.71       662
weighted avg       0.74      0.74      0.73       662



In [9]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import numpy as np

# Load the dataset
data_path = 'DocAssist Dataset.xlsx'
data = pd.read_excel(data_path, sheet_name='in')

# Selecting features and target
features = [
    'HAEMATOCRIT', 'HAEMOGLOBINS', 'ERYTHROCYTE', 'LEUCOCYTE',
    'THROMBOCYTE', 'MCH', 'MCHC', 'MCV', 'AGE'
]
target = 'SOURCE'  # Assuming SOURCE is the target column

# Rows without a label are dropped rather than mean-imputed: the mean of the
# class labels is not itself a valid class.
data = data.dropna(subset=[target]).reset_index(drop=True)

# Choose the train/test rows BEFORE computing any preprocessing statistics so
# that nothing learned from the test rows leaks into training. The shuffle
# depends only on the row count and the seed, so splitting row positions picks
# the same rows as splitting the feature matrix with these settings.
row_positions = np.arange(len(data))
train_rows, test_rows = train_test_split(row_positions, test_size=0.2, random_state=42)

# Preprocessing: fill missing feature values with the training-row means only
feature_means = data[features].iloc[train_rows].mean()
data[features] = data[features].fillna(feature_means)

X = data[features]
y = data[target]

# Standardize the features using statistics fitted on the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X.iloc[train_rows])
X_test = scaler.transform(X.iloc[test_rows])
X_scaled = scaler.transform(X)  # every row, scaled with the training statistics
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

# Hyperparameter tuning for Gradient Boosting Classifier
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive search over the grid with 3-fold cross-validation on the training rows
gb_model = GradientBoostingClassifier(random_state=42)
grid_search = GridSearchCV(estimator=gb_model, param_grid=param_grid, cv=3, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best model from grid search. GridSearchCV defaults to refit=True, so
# best_estimator_ has already been refitted on all of X_train with the winning
# parameters; fitting it a second time would only repeat that work.
best_gb_model = grid_search.best_estimator_

# Make predictions
y_pred = best_gb_model.predict(X_test)
# Second predict_proba column = probability of the second entry in classes_
y_pred_prob = best_gb_model.predict_proba(X_test)[:, 1]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"ROC-AUC Score: {roc_auc:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Add recommendations based on predictions
def recommend(source):
    """Return the fixed advice string for a predicted SOURCE label.

    A value equal to 1 yields the diet/checkup advice, a value equal to 0
    yields the "stable" message, and any other value yields the generic
    "detailed medical examination" message.
    """
    if source == 1:
        return "Maintain a balanced diet and consult a doctor for regular checkups."
    if source == 0:
        return "Your health parameters are stable. Continue with your current lifestyle."
    # Neither of the two known labels.
    return "Consider a detailed medical examination for potential issues."

# Predict a label for every row of the dataset, then attach the matching advice text
all_row_predictions = best_gb_model.predict(X_scaled)
data['Predicted_SOURCE'] = all_row_predictions
data['Recommendation'] = data['Predicted_SOURCE'].map(recommend)


Accuracy: 72.21%
ROC-AUC Score: 0.76

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.84      0.78       392
           1       0.70      0.55      0.62       270

    accuracy                           0.72       662
   macro avg       0.72      0.69      0.70       662
weighted avg       0.72      0.72      0.71       662



In [12]:
import pandas as pd
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, f1_score
import numpy as np

# Load the dataset
data_path = 'DocAssist Dataset.xlsx'
data = pd.read_excel(data_path, sheet_name='in')

# Selecting features and target
features = [
    'HAEMATOCRIT', 'HAEMOGLOBINS', 'ERYTHROCYTE', 'LEUCOCYTE',
    'THROMBOCYTE', 'MCH', 'MCHC', 'MCV', 'AGE'
]
target = 'SOURCE'  # Assuming SOURCE is the target column

# Rows without a label are dropped rather than mean-imputed: the mean of the
# class labels is not itself a valid class.
data = data.dropna(subset=[target]).reset_index(drop=True)

# Choose the train/test rows BEFORE computing any preprocessing statistics so
# that nothing learned from the test rows leaks into training. The shuffle
# depends only on the row count and the seed, so splitting row positions picks
# the same rows as splitting the feature matrix with these settings.
row_positions = np.arange(len(data))
train_rows, test_rows = train_test_split(row_positions, test_size=0.2, random_state=42)

# Preprocessing: fill missing feature values with the training-row means only
feature_means = data[features].iloc[train_rows].mean()
data[features] = data[features].fillna(feature_means)

X = data[features]
y = data[target]

# Standardize the features using statistics fitted on the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X.iloc[train_rows])
X_test = scaler.transform(X.iloc[test_rows])
X_scaled = scaler.transform(X)  # every row, scaled with the training statistics
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

# Hyperparameter tuning for HistGradientBoostingClassifier
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_iter': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_leaf': [1, 5, 10]
}

# Exhaustive search over the grid with 3-fold cross-validation on the training rows
hgb_model = HistGradientBoostingClassifier(random_state=42)
grid_search = GridSearchCV(estimator=hgb_model, param_grid=param_grid, cv=3, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best model from grid search. GridSearchCV defaults to refit=True, so
# best_estimator_ has already been refitted on all of X_train with the winning
# parameters; fitting it a second time would only repeat that work.
best_hgb_model = grid_search.best_estimator_

# Make predictions
y_pred = best_hgb_model.predict(X_test)
# Second predict_proba column = probability of the second entry in classes_
y_pred_prob = best_hgb_model.predict_proba(X_test)[:, 1]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"ROC-AUC Score: {roc_auc:.2f}")
print(f"F1 Score: {f1:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Add recommendations based on predictions
def recommend(source):
    """Return the fixed advice string for a predicted SOURCE label.

    A value equal to 1 yields the diet/checkup advice, a value equal to 0
    yields the "stable" message, and any other value yields the generic
    "detailed medical examination" message.
    """
    if source == 1:
        return "Maintain a balanced diet and consult a doctor for regular checkups."
    if source == 0:
        return "Your health parameters are stable. Continue with your current lifestyle."
    # Neither of the two known labels.
    return "Consider a detailed medical examination for potential issues."

# Predict a label for every row of the dataset, then attach the matching advice text
all_row_predictions = best_hgb_model.predict(X_scaled)
data['Predicted_SOURCE'] = all_row_predictions
data['Recommendation'] = data['Predicted_SOURCE'].map(recommend)


Accuracy: 73.56%
ROC-AUC Score: 0.77
F1 Score: 0.73

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.87      0.80       392
           1       0.74      0.54      0.63       270

    accuracy                           0.74       662
   macro avg       0.74      0.71      0.71       662
weighted avg       0.74      0.74      0.73       662

