In [None]:
# Install data processing libraries
!pip install pandas numpy scikit-learn

In [None]:
# Install Dash and Dash Bootstrap Components
!pip install dash dash-bootstrap-components

In [None]:
# Install LangChain
!pip install --upgrade langchain
!pip install langchain-community

In [None]:
# Install joblib for saving preprocessing objects
!pip install joblib

In [None]:
# Install GPT4All
!pip install gpt4all

In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from joblib import dump, load

In [8]:
# Read the heart-disease CSV; the path is relative to the notebook's
# working directory.
url = './heart.csv'
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows as the cell's rich output
df.head(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [9]:
# List the column labels of the loaded frame under a heading
print("Columns in the dataset:", df.columns, sep="\n")

Columns in the dataset:
Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')


In [10]:
# Count the null entries per column and print them under a heading
print("Missing values in each column:", df.isna().sum(), sep="\n")

Missing values in each column:
Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64


In [11]:
# Columns that will be label-encoded.
categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

# Every remaining column except the target is treated as numerical; the
# original column order of `df` is preserved.
_excluded_cols = set(categorical_cols) | {'HeartDisease'}
numerical_cols = [name for name in df.columns if name not in _excluded_cols]

In [12]:
# Fit one LabelEncoder per categorical column and keep it, keyed by column
# name, so the same category -> integer mapping can be reused later.
#
# This cell overwrites df[col] in place. If it were run a second time the
# encoders would be fitted on the integer codes instead of the original
# category labels, silently replacing the useful mappings. Check for that
# up front, before any existing `label_encoders` dict is reset.
already_encoded = [
    col for col in categorical_cols if pd.api.types.is_numeric_dtype(df[col])
]
if already_encoded:
    raise ValueError(
        f"Columns {already_encoded} are already numeric; reload the dataset "
        "before re-running the label-encoding cell."
    )

label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    # Replace the category labels with their integer codes
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [13]:
# Standardize the numerical columns and keep the fitted scaler for reuse.
# NOTE: the columns are overwritten in place, so running this cell again
# would refit the scaler on already-standardized values.
scaler = StandardScaler()
scaler.fit(df[numerical_cols])
df[numerical_cols] = scaler.transform(df[numerical_cols])

In [14]:
# Target vector: the HeartDisease column
y = df['HeartDisease']
# Feature matrix: every column except the target
X = df.drop(columns='HeartDisease')

In [15]:
# Persist the fitted per-column LabelEncoders to the working directory
dump(value=label_encoders, filename='label_encoders.joblib')

# Persist the fitted StandardScaler; as the last expression, the list of
# written filenames is echoed as the cell output
dump(value=scaler, filename='scaler.joblib')

['scaler.joblib']

In [None]:
# Testing the model
#
# Import from the split-out LangChain packages. The legacy `langchain.llms`
# and top-level `langchain` import paths are deprecated and are not guaranteed
# to exist in the release pulled in by `pip install --upgrade langchain`.
from langchain_community.llms import GPT4All
from langchain_core.prompts import PromptTemplate

# Location of the local GGUF model file and the number of CPU threads handed
# to GPT4All; adjust both to the machine running the notebook.
MODEL_PATH = "/models/qwen2-1_5b-instruct-q4_0.gguf"
N_THREADS = 8

model = GPT4All(model=MODEL_PATH, n_threads=N_THREADS)

# Define the prompt template; {patient_data} is filled in at invoke time
template = """
You are an AI assistant specialized in cardiology.

Given the patient data:
{patient_data}

Provide a medical summary, assess the risk of heart disease, and suggest recommendations.

Medical Summary:
"""

prompt = PromptTemplate(
    input_variables=["patient_data"],
    template=template,
)

# Compose the chain using the prompt and the model
chain = prompt | model

# Example patient data string
patient_data_str = """
Age: 55
Resting Blood Pressure: 140
Cholesterol: 250
Fasting Blood Sugar: 0
Max Heart Rate: 150
Oldpeak: 2.3
Sex: Male
Chest Pain Type: Typical Angina
Resting ECG: Normal
Exercise Angina: Yes
ST Slope: Flat
"""

# Generate the response
response = chain.invoke({"patient_data": patient_data_str})

print(response)