In [112]:
import pandas as pd
import openpyxl as opxl
import os
# from tabpfn import TabPFNClassifier - don't have the CPU.
import numpy as np

In [None]:
# Importing scikit-learn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold, cross_val_score 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.svm import SVC

In [114]:
# Checking if the file exists and the directory
# os.listdir()

In [115]:
# Load data from the Excel file
df_cholesterol = pd.read_excel('train_cholesterol.xlsx', sheet_name='Sheet1')
df_cholesterol.head()
print(df_cholesterol.shape[0])

100


In [116]:
unique_gender = df_cholesterol['Gender'].unique()
unique_habits = df_cholesterol['Dietary_Habits'].unique()
unique_familiyhist = df_cholesterol['Family_History'].unique()
unique_physical_activity = df_cholesterol['Physical_Activity'].unique()
print(unique_gender)
print(unique_habits)
print(unique_familiyhist)
print(unique_physical_activity)

['Female' 'Male']
['Healthy' 'Moderate' 'Unhealthy']
['No' 'Yes']
['Low' 'High' 'Moderate']


In [117]:
# Creating function to change the text to numerical values
def preprocess_categorical_data(df, mappings, one_hot_columns):
# Apply mappings for categorical variables
    for column, mapping in mappings.items():
        if column in df.columns:
            df[column] = df[column].map(mapping)

    # Apply one-hot encoding
    df = pd.get_dummies(df, columns=one_hot_columns)

    return df

mappings = {
    'Physical_Activity': {'Low': 3, 'Moderate': 2, 'High': 1},
    'Dietary_Habits': {'Unhealthy': 3, 'Moderate': 2, 'Healthy': 1},
    'Family_History': {'No': 0, 'Yes': 1}
}

# Define the columns for one-hot encoding
one_hot_columns = ['Gender']

In [118]:
def scale_features(X):
    # Remove zero-variance columns
    std_devs = X.std()
    zero_variance_cols = std_devs[std_devs == 0].index.tolist()
    if zero_variance_cols:
        X = X.drop(columns=zero_variance_cols)
        print(f"Removed zero-variance columns: {zero_variance_cols}")

    # Handle NaN/Inf values (replace with 0 or drop rows/columns)
    X = X.fillna(0)  # Replace NaNs with 0; adjust as needed
    X = X.replace([np.inf, -np.inf], 0)

    # Scale features
    scaler = StandardScaler()
    X_scaled_array = scaler.fit_transform(X)
    
    # Convert to DataFrame
    X_scaled = pd.DataFrame(X_scaled_array, columns=X.columns)
    
    return X_scaled, scaler

In [119]:
df_cholesterol.head()

Unnamed: 0,id,Age,Gender,BMI,Total_Cholesterol,LDL_Cholesterol,HDL_Cholesterol,Triglycerides,Physical_Activity,Dietary_Habits,Family_History,Need_Supplement
0,t1,68,Female,30.0,203.8,111.8,44.0,157.3,Low,Healthy,No,0
1,t2,58,Female,30.3,188.1,182.4,50.6,234.8,High,Moderate,No,1
2,t3,44,Male,20.9,194.3,74.7,60.1,97.2,High,Healthy,Yes,0
3,t4,72,Female,35.0,198.4,120.2,43.5,202.0,High,Healthy,No,1
4,t5,37,Male,22.9,277.3,186.1,36.7,156.3,Low,Moderate,Yes,1


In [120]:
mappings = {
    'Physical_Activity': {'Low': 3, 'Moderate': 2, 'High': 1},
    'Dietary_Habits': {'Unhealthy': 3, 'Moderate': 2, 'Healthy': 1},
    'Family_History': {'No': 0, 'Yes': 1}
}

# Define the columns for one-hot encoding
one_hot_columns = ['Gender']

In [121]:
df_cholesterol = preprocess_categorical_data(df_cholesterol,mappings, one_hot_columns)

In [122]:
X = df_cholesterol.drop(columns=['id','Need_Supplement'])
y = df_cholesterol['Need_Supplement']
X_scaled, scaler = scale_features(X)

In [123]:
# Splitting the data into training and validation sets (80% train, 20% vailidation)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [124]:
# 4. Hyperparameter tuning using GridSearchCV for Rundom Forest Classifier
param_grid = {
    "n_estimators": [50, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5],
}

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)

grid_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=cv,
    scoring="accuracy",
    n_jobs=-1,
    verbose=1,
)

grid_rf.fit(X_train, y_train)

print("Random Forest Classifier - Best Parameters:", grid_rf.best_params_)
print("Random Forest Classifier - CV Accuracy (trening):", grid_rf.best_score_)

grid_rf_best_model = grid_rf.best_estimator_
cv_scores_grid_rf = cross_val_score(
    grid_rf_best_model,
    X_train, 
    y_train,
    cv=cv,  # Same RepeatedStratifiedKFold object
    scoring='accuracy',
    n_jobs=-1
)

print("\nCross-Validation Results (RepeatedStratifiedKFold):")
print(f"Mean Accuracy: {cv_scores_grid_rf.mean():.3f}")
print(f"Standard Deviation: {cv_scores_grid_rf.std():.3f}")


y_pred_rf = grid_rf_best_model.predict(X_test)
print("Random Forest Classifier - Accuracy (validation):", accuracy_score(y_test, y_pred_rf))
print("Random Forest Classifier - F1 (validation):", f1_score(y_test, y_pred_rf))



Fitting 15 folds for each of 12 candidates, totalling 180 fits


Random Forest Classifier - Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Random Forest Classifier - CV Accuracy (trening): 0.9708333333333333

Cross-Validation Results (RepeatedStratifiedKFold):
Mean Accuracy: 0.971
Standard Deviation: 0.039
Random Forest Classifier - Accuracy (validation): 0.95
Random Forest Classifier - F1 (validation): 0.9090909090909091


In [125]:
# Defining model and hyperparameters for Logistic Regression
model_lr = LogisticRegression(max_iter=1000)
param_grid_lr = {
    'C': [0.1, 1, 10],            
    'penalty': ['l1', 'l2'],       
    'solver': ['liblinear', 'saga']  
}

cv = RepeatedStratifiedKFold(
    n_splits=5,       # Number of folds per repetition
    n_repeats=10,     # Number of repetitions
    random_state=42   # Seed for reproducibility
)

# GridSearchCV configuration
grid_lr = GridSearchCV(
    estimator=model_lr,
    param_grid=param_grid_lr,
    cv=cv,                          
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# Fit for training
grid_lr.fit(X_train, y_train)

#Accuracy (trening)
print("Logistic Regression - Best parameters", grid_lr.best_params_)
print("Logistic Regression - CV Accuracy (trening):", grid_lr.best_score_)

grid_lr_best_model = grid_lr.best_estimator_
cv_scores_grid_lr = cross_val_score(
    grid_lr_best_model,
    X_train,
    y_train,
    cv=cv,            # Same cross-validation strategy
    scoring='accuracy',
    n_jobs=-1
)

print("\nCross-Validation Results (RepeatedStratifiedKFold):")
print(f"Mean Accuracy: {cv_scores_grid_lr.mean():.3f}")
print(f"Standard Deviation: {cv_scores_grid_lr.std():.3f}")

# Test accuracy
y_pred_lr = grid_lr_best_model.predict(X_test)
print("Logistic Regression - Accuracy (validation):", accuracy_score(y_test, y_pred_lr))
print("Logistic Regression - F1 (validation):", f1_score(y_test, y_pred_lr))


Fitting 50 folds for each of 12 candidates, totalling 600 fits
Logistic Regression - Best parameters {'C': 0.1, 'penalty': 'l2', 'solver': 'saga'}
Logistic Regression - CV Accuracy (trening): 0.7325

Cross-Validation Results (RepeatedStratifiedKFold):
Mean Accuracy: 0.733
Standard Deviation: 0.095
Logistic Regression - Accuracy (validation): 0.6
Logistic Regression - F1 (validation): 0.6


In [126]:
# Defining model and hyperparameters for SVM with  RBF
model_svm = SVC(kernel='rbf')
param_grid_svm = {
    'C': [0.01, 0.1, 1, 10, 100, 1000],       
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1, 10],
    'class_weight': [None, 'balanced']
}

cv = RepeatedStratifiedKFold(
    n_splits=10,      # Liczba foldów w każdej iteracji
    n_repeats=3,      # Liczba powtórzeń walidacji krzyżowej
    random_state=42   # Ziarno losowości dla reprodukowalności
)

# GridSearchCV config
grid_svm = GridSearchCV(
    estimator=model_svm,
    param_grid=param_grid_svm,
    cv=10,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# Fit for training
grid_svm.fit(X_train, y_train)

# Accuracy (trening)
print("\nSVM - Best parameters:", grid_svm.best_params_)
print("SVM - Accuracy (trening):", grid_svm.best_score_)

grid_svm_best_model = grid_svm.best_estimator_
cv_scores_grid_svm = cross_val_score(
    grid_svm_best_model,
    X_train,
    y_train,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1
)

print("\nCross-Validation Results (RepeatedStratifiedKFold):")
print(f"Mean Accuracy: {cv_scores_grid_svm.mean():.3f}")
print(f"Standard Deviation: {cv_scores_grid_svm.std():.3f}")

# Test accuracy
y_pred_svm = grid_svm_best_model.predict(X_test)
print("SVM - Accuracy (test):", accuracy_score(y_test, y_pred_svm))
print("SVM - F1 (test):", f1_score(y_test, y_pred_svm))


Fitting 10 folds for each of 84 candidates, totalling 840 fits

SVM - Best parameters: {'C': 10, 'class_weight': None, 'gamma': 'scale'}
SVM - Accuracy (trening): 0.8375

Cross-Validation Results (RepeatedStratifiedKFold):
Mean Accuracy: 0.842
Standard Deviation: 0.096
SVM - Accuracy (test): 0.85
SVM - F1 (test): 0.7692307692307693


In [127]:
# Load data from the Excel file
df_predict_cholesterol = pd.read_excel('predict_cholesterol.xlsx', sheet_name='Sheet1')
df_predict_cholesterol = preprocess_categorical_data(df_predict_cholesterol,mappings, one_hot_columns)


In [128]:
X_val = df_predict_cholesterol.drop(columns=['id'])
X_val_scaled, scaler = scale_features(X_val)

In [129]:
# Function to predict using multiple models
def predict_with_models(models, X):
    predictions = {}
    
    for model_name, model in models.items():
        # Generate predictions for each model
        predictions[model_name] = model.predict(X)
    
    return predictions

In [130]:
# Define models
models = {
    'Random Forest Classifier': grid_rf,
    'Logistic Regression' : grid_lr,
    'AVM': grid_svm

}
# Generate predictions for all models
predictions = predict_with_models(models, X_val_scaled)

# Display the predictions
for model_name, preds in predictions.items():
    print(f"{model_name} Predictions:")
    print(preds)

Random Forest Classifier Predictions:
[0 1 1 1 1 1 1 0 0 0 0 1 1 1 0 1 0 0 1 1]
Logistic Regression Predictions:
[0 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 0 1 1]
AVM Predictions:
[0 1 1 1 1 1 1 0 1 0 0 1 1 1 0 1 0 0 0 1]


In [131]:
def write_predictions_to_excel(models, predictions, input_file, output_file):
    # Load the original dataset
    df = pd.read_excel(input_file) # 'predict_cholesterol.xlsx'

    # Add predictions to the DataFrame
    for model_name, preds in predictions.items():
        df[model_name + '_Predictions'] = preds
    # Save the updated DataFrame to a new Excel file
    df.to_excel(output_file, index=False)

In [132]:
# Write predictions to the new Excel with my surname and name
write_predictions_to_excel(models, predictions, 'predict_cholesterol.xlsx', 'predict_cholesterol_borkowska_joanna.xlsx')

print("Predictions written to predict_cholesterol_borkowska_joanna.xlsx")



Predictions written to predict_cholesterol_borkowska_joanna.xlsx
