In [22]:
#!pip install category_encoders
#!pip install selenium
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import TargetEncoder,OneHotEncoder,LabelEncoder,MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score , confusion_matrix , classification_report
from category_encoders import TargetEncoder
import lightgbm as lgb
import ipywidgets as widgets
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup
import re
#le = LabelEncoder()
# Train the heart-attack classifier and return it with its fitted encoder.
def load_model(csv_path='heart_2022_with_nans.csv', num_boost_round=100):
    """Train a LightGBM heart-attack classifier from the survey CSV.

    Despite its name, this function does not load a persisted model: it reads
    the CSV, cleans it, target-encodes the categorical columns, balances the
    training split with SMOTE and trains a LightGBM booster from scratch.

    Args:
        csv_path: Path of the CSV file to train on.
        num_boost_round: Maximum number of boosting rounds given to
            ``lgb.train``. The default of 100 matches LightGBM's own default,
            which is what was used before this parameter existed.

    Returns:
        A ``(booster, fitted_target_encoder, feature_columns)`` tuple, or
        ``(None, None, None)`` when any step raises (the error is printed).
    """
    try:
        df = pd.read_csv(csv_path)

        # drop_duplicates() returns a new frame, so its result must be kept;
        # rows with a missing value in any column are then discarded.
        df_clean = df.drop_duplicates().dropna()

        cols_to_keep = [
            'HighRiskLastYear', 'BMI', 'PhysicalActivities', 'HadKidneyDisease',
            'HadCOPD', 'GeneralHealth', 'Sex', 'AgeCategory', 'SmokerStatus',
            'HadDiabetes', 'HadArthritis', 'HadStroke', 'HadAngina',
            'AlcoholDrinkers', 'HadHeartAttack',
        ]
        # .copy() so the column assignments below write to an independent
        # frame instead of a slice of the cleaned data.
        df_clean = df_clean[cols_to_keep].copy()

        # 'Yes' -> 1, every other value -> 0 (this includes the non-'Yes'
        # HadDiabetes answers); Sex: 'Male' -> 1, anything else -> 0.
        cols_to_binary = [
            'HighRiskLastYear', 'PhysicalActivities', 'HadKidneyDisease',
            'HadCOPD', 'HadDiabetes', 'HadArthritis', 'HadStroke', 'HadAngina',
            'AlcoholDrinkers', 'HadHeartAttack',
        ]
        for col in cols_to_binary:
            df_clean[col] = np.where(df_clean[col] == 'Yes', 1, 0)
        df_clean['Sex'] = np.where(df_clean['Sex'] == 'Male', 1, 0)

        # Separate features and target variable
        X = df_clean.drop('HadHeartAttack', axis=1)
        y = df_clean['HadHeartAttack']

        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Fit the target encoder on the training split only, then reuse it
        # for the validation split.
        label_encoders = TargetEncoder(cols=['GeneralHealth', 'AgeCategory', 'SmokerStatus'])
        X_train_encoded = label_encoders.fit_transform(X_train, y_train)
        X_val_encoded = label_encoders.transform(X_val)

        # Oversample the minority class in the training split only.
        smote = SMOTE(sampling_strategy='minority', random_state=42)
        X_resampled, y_resampled = smote.fit_resample(X_train_encoded, y_train)

        # Create LightGBM datasets
        train_data = lgb.Dataset(X_resampled, label=y_resampled)
        val_data = lgb.Dataset(X_val_encoded, label=y_val)

        params = {
            'objective': 'binary',
            'boosting_type': 'gbdt',
            'metric': 'binary_logloss',
            'max_depth': 20,
            'num_leaves': 63,
            'learning_rate': 0.001,
            'feature_fraction': 0.8,
            'lambda_l1': 1,
            'lambda_l2': 0.1,
            'tree_learner': 'feature',
            'is_unbalance': 'false',
            'sigmoid': 2.0,
        }

        # NOTE(review): with a patience of 500 rounds, early stopping cannot
        # trigger while num_boost_round is below 500; the recorded run ended
        # at iteration 100 with "Did not meet early stopping". Pass a larger
        # num_boost_round to let early stopping decide the model size.
        lgb_model_2 = lgb.train(
            params,
            train_data,
            num_boost_round=num_boost_round,
            valid_sets=[train_data, val_data],
            valid_names=["training", "valid_0"],
            callbacks=[lgb.early_stopping(stopping_rounds=500)],
        )

        # Return the trained model, the fitted encoder and the feature order
        return lgb_model_2, label_encoders, X.columns

    except Exception as e:
        # Best-effort boundary: callers unpack three values, so report the
        # failure and hand back a triple of None instead of raising.
        print(f"Error loading model: {e}")
        return None, None, None


def get_links(word):
    """Print the first CDC search results (title and link) for *word*.

    Opens the CDC search page in headless Chrome, waits a fixed five seconds
    for it to render, then parses the rendered HTML with BeautifulSoup. At
    most the first five result titles are considered.

    Args:
        word: Search term; it is percent-encoded before being placed in the
            query string, so spaces and characters such as ``&`` are safe.

    Returns:
        None. The results are printed.
    """
    # Local import: the block needs it and the file's import list is left as is.
    from urllib.parse import quote

    url = f"https://search.cdc.gov/search/?query={quote(word, safe='')}&dpage=1"

    # Set up Chrome options
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Headless mode
    options.add_argument("--no-sandbox")  # Disable sandboxing (useful in Docker)
    options.add_argument("--disable-dev-shm-usage")  # Avoid shared memory issues
    options.add_argument("--remote-debugging-port=9222")  # Debugging port
    options.add_argument("--disable-gpu")  # Disable GPU hardware acceleration

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)

        # Wait for the content to load (a WebDriverWait could replace this
        # fixed sleep).
        time.sleep(5)

        html = driver.page_source
    finally:
        # Always shut the browser down, even when loading the page fails.
        driver.quit()

    soup = BeautifulSoup(html, 'lxml')

    # Result titles are <div> elements whose class attribute matches the pattern.
    titles = soup.find_all('div', {"class": re.compile('.*result-title d-flex*')})

    results = []
    for title in titles[:5]:
        anchor = title.find('a')
        if anchor is None:
            # A title block without a link has nothing useful to show.
            continue
        results.append((title.text, anchor.get('href')))

    output = "".join(f"Title: {text}\nLink: {href}\n\n" for text, href in results)
    print(f"{output}")



# Thin, exception-safe wrapper around ``model.predict``.
def predict(model, features):
    """Return ``model.predict(features)``, or None if that call raises.

    Args:
        model: Any object exposing a ``predict`` method.
        features: Passed to ``model.predict`` unchanged.

    Returns:
        Whatever ``model.predict`` returns; None after an error, which is
        also printed.
    """
    outcome = None
    try:
        outcome = model.predict(features)
    except Exception as err:
        print(f"Error making prediction 2: {err}")
    return outcome

def _read_form_features():
    """Collect the current widget values into a one-row, binarised DataFrame."""
    form_values = {
        'HighRiskLastYear': HighRiskLastYear.value,
        'BMI': bmi.value,
        'PhysicalActivities': PhysicalActivities.value,
        'HadKidneyDisease': HadKidneyDisease.value,
        'HadCOPD': HadCOPD.value,
        'GeneralHealth': GeneralHealth.value,
        'Sex': Sex.value,
        'AgeCategory': AgeCategory.value,
        'SmokerStatus': SmokerStatus.value,
        'HadDiabetes': HadDiabetes.value,
        'HadArthritis': HadArthritis.value,
        'HadStroke': HadStroke.value,
        'HadAngina': HadAngina.value,
        'AlcoholDrinkers': AlcoholDrinkers.value,
    }
    row = pd.DataFrame([form_values])

    # Same binarisation as at training time: 'Yes' -> 1, anything else -> 0.
    cols_to_binary = [
        'HighRiskLastYear', 'PhysicalActivities', 'HadKidneyDisease', 'HadCOPD',
        'HadDiabetes', 'HadArthritis', 'HadStroke', 'HadAngina', 'AlcoholDrinkers',
    ]
    for col in cols_to_binary:
        row[col] = np.where(row[col] == 'Yes', 1, 0)
    row['Sex'] = np.where(row['Sex'] == 'Male', 1, 0)
    return row


def _show_factor_search(max_searches=5):
    """Display a text box and button that run up to *max_searches* CDC lookups."""
    factor_widget = widgets.Text(
        value='',
        placeholder='Type a factor',
        description='Factor:',
        disabled=False
    )
    search_button = widgets.Button(description="Search for Factor")
    display(factor_widget, search_button)

    searches_done = 0

    def on_search_button_click(b):
        nonlocal searches_done

        factor = factor_widget.value
        if not factor.strip():
            # Do not spend a browser launch (and one attempt) on an empty query.
            print("Please type a factor before searching.")
            return
        print(f"User entered: {factor}")

        get_links(factor)

        # Only completed searches count towards the limit.
        searches_done += 1
        if searches_done >= max_searches:
            factor_widget.disabled = True
            search_button.disabled = True
            print(f"You have reached the maximum number of entries ({max_searches}). No more input will be accepted.")

    search_button.on_click(on_search_button_click)


def on_predict_button_clicked(b):
    """Handle a click on the Predict button.

    Reads the module-level input widgets, applies the fitted target encoder
    (``label_encoders``), orders the columns as in ``model_features`` and
    asks ``model`` for the positive-class score. A score above 0.5 is
    reported as "Heart Attack Risk"; in that case CDC reading material is
    printed and a follow-up search box is shown.

    Args:
        b: The clicked button, as passed by ``Button.on_click`` (unused).

    Returns:
        None. Results and errors are shown via ``print`` / ``display``.
    """
    try:
        features_row = _read_form_features()

        # Encode the categorical columns with the encoder fitted at training time.
        encoded_row = label_encoders.transform(features_row)

        # Column selection guarantees both the training column order and the
        # (1, n_features) shape the model expects.
        features_array = encoded_row[list(model_features)].to_numpy(dtype=float)

        # Booster.predict returns the positive-class score for each row.
        prediction_prob = model.predict(features_array)
        print(prediction_prob)

        # Compare a scalar, not the array: the truth value of an array is
        # only defined when it holds exactly one element.
        at_risk = float(np.ravel(prediction_prob)[0]) > 0.5

        # Display the result
        result = f"Prediction: {'Heart Attack Risk' if at_risk else 'No Heart Attack Risk'}"
        display(widgets.Label(value=result))

        if at_risk:
            print("Find below list of recommended topics related to Heart General Health from Centers for Disease Control and Prevention:\n\nWait 5 Seconds\n\n")
            get_links('General Health related to Heart')
            print('\n\nBased on our analysis the most important factors related to Heart Attacks are:\n\nBMI\nPhysical Activities\nKidney Disease\nChronic obstructive pulmonary disease\nAge\nSmoking\nDiabetes\nArthritis\nStroke\nAngina\nAlcohol\nIf you would like to know more about them or other health information, write below:\n\n')
            _show_factor_search(max_searches=5)

    except Exception as e:
        # UI callback boundary: surface the problem in the notebook instead
        # of letting the exception disappear inside the widget machinery.
        print(f"Error in prediction process: {e}")
        display(widgets.Label(value=f"Error: {str(e)}"))


# Train the model once; all three values are None if load_model() failed.
model, label_encoders, model_features = load_model()

# Input form. One widget per model feature; for the dropdowns the first
# option listed is the one initially selected.
HighRiskLastYear = widgets.Dropdown(
    description='HighRiskLastYear',
    options=['No', 'Yes'],
)
bmi = widgets.FloatText(
    description='BMI',
    value=29.0,
)
PhysicalActivities = widgets.Dropdown(
    description='PhysicalActivities',
    options=['Yes', 'No'],
)
HadKidneyDisease = widgets.Dropdown(
    description='HadKidneyDisease',
    options=['No', 'Yes'],
)
HadCOPD = widgets.Dropdown(
    description='HadCOPD',
    options=['No', 'Yes'],
)
GeneralHealth = widgets.Dropdown(
    description='GeneralHealth',
    options=['Very good', 'Fair', 'Good', 'Excellent', 'Poor'],
)
Sex = widgets.Dropdown(
    description='Sex',
    options=['Female', 'Male'],
)
AgeCategory = widgets.Dropdown(
    description='AgeCategory',
    options=[
        'Age 65 to 69',
        'Age 70 to 74',
        'Age 75 to 79',
        'Age 80 or older',
        'Age 50 to 54',
        'Age 40 to 44',
        'Age 60 to 64',
        'Age 55 to 59',
        'Age 45 to 49',
        'Age 35 to 39',
        'Age 25 to 29',
        'Age 30 to 34',
        'Age 18 to 24',
    ],
)
SmokerStatus = widgets.Dropdown(
    description='SmokerStatus',
    options=[
        'Former smoker',
        'Never smoked',
        'Current smoker - now smokes every day',
        'Current smoker - now smokes some days',
    ],
)
HadDiabetes = widgets.Dropdown(
    description='HadDiabetes',
    options=[
        'No',
        'Yes',
        'Yes, but only during pregnancy (female)',
        'No, pre-diabetes or borderline diabetes',
    ],
)
HadArthritis = widgets.Dropdown(
    description='HadArthritis',
    options=['Yes', 'No'],
)
HadStroke = widgets.Dropdown(
    description='HadStroke',
    options=['No', 'Yes'],
)
HadAngina = widgets.Dropdown(
    description='HadAngina',
    options=['No', 'Yes'],
)
AlcoholDrinkers = widgets.Dropdown(
    description='AlcoholDrinkers',
    options=['No', 'Yes'],
)

# Predict button, wired to the click handler defined earlier in this cell.
predict_button = widgets.Button(
    description="Predict",
    button_style="success",
)
predict_button.on_click(on_predict_button_clicked)



[LightGBM] [Info] Number of positive: 186014, number of negative: 186014
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036941 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1040
[LightGBM] [Info] Number of data points in the train set: 372028, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	training's binary_logloss: 0.621401	valid_0's binary_logloss: 0.625872


In [28]:
# Render the input form: every feature widget in order, then the Predict button.
display(
    HighRiskLastYear,
    bmi,
    PhysicalActivities,
    HadKidneyDisease,
    HadCOPD,
    GeneralHealth,
    Sex,
    AgeCategory,
    SmokerStatus,
    HadDiabetes,
    HadArthritis,
    HadStroke,
    HadAngina,
    AlcoholDrinkers,
    predict_button,
)

Dropdown(description='HighRiskLastYear', options=('No', 'Yes'), value='No')

FloatText(value=29.0, description='BMI')

Dropdown(description='PhysicalActivities', options=('Yes', 'No'), value='Yes')

Dropdown(description='HadKidneyDisease', options=('No', 'Yes'), value='No')

Dropdown(description='HadCOPD', options=('No', 'Yes'), value='No')

Dropdown(description='GeneralHealth', options=('Very good', 'Fair', 'Good', 'Excellent', 'Poor'), value='Very …

Dropdown(description='Sex', options=('Female', 'Male'), value='Female')

Dropdown(description='AgeCategory', index=8, options=('Age 65 to 69', 'Age 70 to 74', 'Age 75 to 79', 'Age 80 …

Dropdown(description='SmokerStatus', options=('Former smoker', 'Never smoked', 'Current smoker - now smokes ev…

Dropdown(description='HadDiabetes', options=('No', 'Yes', 'Yes, but only during pregnancy (female)', 'No, pre-…

Dropdown(description='HadArthritis', options=('Yes', 'No'), value='Yes')

Dropdown(description='HadStroke', options=('No', 'Yes'), value='No')

Dropdown(description='HadAngina', options=('No', 'Yes'), value='No')

Dropdown(description='AlcoholDrinkers', options=('No', 'Yes'), value='No')

Button(button_style='success', description='Predict', style=ButtonStyle())

[0.50616217]


Label(value='Prediction: Heart Attack Risk')

Find below list of recommended topics related to Heart General Health from Centers for Disease Control and Prevention:

Wait 5 Seconds


Title: Disability and Health Related ConditionsLast Updated: May 2024
Link: https://www.cdc.gov/ncbddd/disabilityandhealth/relatedconditions.html

Title: About Other Conditions Related to Heart Disease | Heart DiseaseLast Updated: May 2024
Link: https://www.cdc.gov/heart-disease/about/other-conditions-related-to-heart-disease.html

Title: Disability and Health Data System (DHDS) Data Guide General Health ConditionsLast Updated: Jul 2024
Link: https://www.cdc.gov/ncbddd/disabilityandhealth/dhds/data-guide/health-topics-general.html

Title: About Heart Disease and Mental Health | Heart DiseaseLast Updated: May 2024
Link: https://www.cdc.gov/heart-disease/about/about-heart-disease-and-mental-health.html

Title: Other Possible Health Issues for People with Heart Defects | Congenital Heart Defects (CHDs)Last Updated: Oct 2024
Link: https://www.cdc.gov/hear

Text(value='', description='Factor:', placeholder='Type a factor')

Button(description='Search for Factor', style=ButtonStyle())

User entered: BMI
Title: Adult BMI CalculatorLast Updated: Sep 2024
Link: https://www.cdc.gov/bmi/adult-calculator/

Title: Child and Teen BMI CalculatorLast Updated: Sep 2024
Link: https://www.cdc.gov/bmi/child-teen-calculator/

Title: Child and Teen BMI CategoriesLast Updated: Nov 2024
Link: https://www.cdc.gov/bmi/child-teen-calculator/bmi-categories.html

Title: Child and Teen BMI Calculator WidgetLast Updated: Sep 2024
Link: https://www.cdc.gov/bmi/child-teen-calculator/widget.html

Title: Adult BMI CategoriesLast Updated: Jul 2024
Link: https://www.cdc.gov/bmi/adult-calculator/bmi-categories.html


