In [12]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd

In [13]:

from types import SimpleNamespace

ucirepo_ids = {
    "iris": 53,
    "heart_disease": 45,
    "molecular_biology": 69,
    "breast_cancer": 17,
    "adult": 2,
    "bank_marketing": 222,
    "student_performance": 320,
    "wine": 109,
    "air_quality": 360,
    "mushroom": 73
}

DATASET_NAME = "heart_disease"  # Example dataset name
TEST_SIZE = 0.2

def custom_data():
    
    data = {
            'age': ['youth', 'youth', 'middle aged', 'senior', 'senior', 'senior', 'middle aged',
                    'youth', 'youth', 'senior', 'youth', 'middle aged', 'middle aged', 'senior'],
            'income': ['high', 'high', 'high', 'medium', 'low', 'low', 'low',
                    'medium', 'low', 'medium', 'medium', 'medium', 'high', 'medium'],
            'student': ['no', 'no', 'no', 'no', 'yes', 'yes', 'yes',
                        'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no'],
            'credit_rating': ['fair', 'excellent', 'fair', 'fair', 'fair', 'excellent', 'excellent',
                            'fair', 'fair', 'fair', 'excellent', 'excellent', 'fair', 'excellent'],
            'buys_computer': ['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes',
                            'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no']
        }
        
    df = pd.DataFrame(data)
    
    # Split into features and target
    features = df.drop(columns='buys_computer')
    targets = df['buys_computer']
    targets= pd.DataFrame(targets.values.reshape(-1, 1), columns=['buys_computer'])

    
    # Variable info
    variable_info = {
        col: {
            'type': 'categorical',
            'unique_values': df[col].unique().tolist()
        } for col in df.columns
    }

    # Metadata
    metadata = {
        'source': 'Simulated AllElectronics dataset',
        'description': 'Customer attributes and their decision to buy a computer',
        'num_samples': len(df),
        'num_features': features.shape[1],
        'target_column': 'buys_computer',
        'class_labels': sorted(df['buys_computer'].unique().tolist())
    }

    # Build nested structure
    return SimpleNamespace(
        data=SimpleNamespace(
            features=features,
            targets=targets,
            feature_names=features.columns.tolist(),
            target_names=sorted(targets.iloc[:,0].unique()),
            # frame=df
        ),
        metadata=metadata,
        variables=variable_info
    )
    
    

def fetch_dataframe(dataframe_name):
    
    if dataframe_name == "custom_data":
        df = custom_data()
        
        # metadata 
        print(df.metadata) 
        
        # variable information 
        print(df.variables) 
    
        return df
    
    if dataframe_name in ucirepo_ids:
        # fetch dataset 
        df = fetch_ucirepo(id=ucirepo_ids[dataframe_name],) 

        # # data (as pandas dataframes) 
        X = df.data.features 
        y = df.data.targets 
        
        # metadata 
        print(df.metadata) 
        
        # variable information 
        print(df.variables) 
        
        return df
    else:
        raise ValueError(f"Dataset '{dataframe_name}' not found in UCI repository.")


In [14]:
def __train_test_split(X, y, test_size = 0.2, shuffle_and_stratify = True):
    
    if test_size < 0 or test_size > 1:
        raise ValueError("test_size must be between 0 and 1")
   
    if len(X) != len(y):
        raise ValueError("Features and targets must have the same length.")

    
    if shuffle_and_stratify == False:
    
        train_size = 1 - test_size
        train_index = int(len(X) * train_size)
        
        X_train = X[0: train_index]
        X_test = X[train_index:]
        
        y_train = y[0: train_index]
        y_test = y[train_index:]
        
        return X_train, X_test, y_train, y_test
    else:
        labels = y.iloc[:,0].unique()
        X_train = pd.DataFrame(columns=X.columns)
        y_train = pd.DataFrame(columns=y.columns)
        X_test = pd.DataFrame(columns=X.columns)
        y_test = pd.DataFrame(columns=y.columns)
        
        train_size = 1 - test_size
        

        for label in labels :
            y_rows = y[y.iloc[:,0] == label]            
            X_rows = X.loc[y_rows.index]
            
            train_index = int(len(X_rows) * train_size)
            
            X_train = pd.concat([X_train, X_rows.iloc[:train_index]], ignore_index=False)
            y_train = pd.concat([y_train, y_rows.iloc[:train_index]] , ignore_index=False)
            
            X_test = pd.concat([X_test, X_rows[train_index:]], ignore_index=False)
            y_test = pd.concat([y_test, y_rows[train_index:]], ignore_index=False)

        return X_train, X_test, y_train, y_test
    

In [15]:
df = fetch_dataframe(DATASET_NAME)

# print(df)

{'uci_id': 45, 'name': 'Heart Disease', 'repository_url': 'https://archive.ics.uci.edu/dataset/45/heart+disease', 'data_url': 'https://archive.ics.uci.edu/static/public/45/data.csv', 'abstract': '4 databases: Cleveland, Hungary, Switzerland, and the VA Long Beach', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 303, 'num_features': 13, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': ['Age', 'Sex'], 'target_col': ['num'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1989, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C52P4X', 'creators': ['Andras Janosi', 'William Steinbrunn', 'Matthias Pfisterer', 'Robert Detrano'], 'intro_paper': {'ID': 231, 'type': 'NATIVE', 'title': 'International application of a new probability algorithm for the diagnosis of coronary artery disease.', 'authors': 'R. Detrano, A. Jánosi, W. Steinbrunn, M

In [16]:

from sklearn.model_selection import train_test_split

X = df.data.features
y = df.data.targets

X_train, X_test, y_train, y_test = __train_test_split(X, y , test_size=TEST_SIZE, shuffle_and_stratify=True)



# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE)


print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (241, 13)
X_test shape: (62, 13)
y_train shape: (241, 1)
y_test shape: (62, 1)


  X_train = pd.concat([X_train, X_rows.iloc[:train_index]], ignore_index=False)
  X_test = pd.concat([X_test, X_rows[train_index:]], ignore_index=False)


In [None]:
X_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0
5,56,1,2,120,236,0,0,178,0,0.8,1,0.0,3.0
7,57,0,4,120,354,0,0,163,1,0.6,1,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,70,1,4,145,174,0,0,125,1,2.6,3,0.0,7.0
146,57,1,4,165,289,1,2,124,0,1.0,2,3.0,7.0
153,55,1,4,160,289,0,2,145,1,0.8,2,1.0,7.0
161,77,1,4,125,304,0,2,162,1,0.0,1,3.0,3.0


In [None]:
y_train

Unnamed: 0,num
0,0
3,0
4,0
5,0
7,0
...,...
136,4
146,4
153,4
161,4


In [None]:
# labels = y_train.iloc[:,0].unique()

# for label in labels:
    
#     label_rows = y_train[y_train.iloc[:,0] == label]
    
#     row_indices = label_rows.index
    
#     print(row_indices)
    
#     corresponding_rows = X_train.loc[label_rows.index]
    
#     for ind in row_indices:
        
        
        
#         if ind not in X_train.index:
#             print(f"{ind} not in X_train for label {label}")
    

In [25]:
from math import exp, sqrt, pi
import numpy as np
import pandas as pd

ALPHA = 1e-3


def _is_continous(X: pd.Series) -> bool:
    return np.issubdtype(X.dtype, np.number) and (len(X.unique()) / len(X) > 0.001)


def gaussian(x_k, mean, std):
    if std < 1e-9:
        return 1.0 if abs(x_k - mean) < 1e-9 else 1e-9
    coeff = 1 / (sqrt(2 * pi) * std)
    exponent = exp(-((x_k - mean) ** 2) / (2 * std ** 2))
    return coeff * exponent


def naive_bayes_classifier_preds(X_train, y_train, X_test):
    y_preds = []
    attributes = X_train.columns
    labels = y_train.iloc[:, 0].unique()

    label_probs = {}
    cond_probs = {}
    mean_store = {}
    std_store = {}

    # Calculate prior probabilities
    total_rows = len(y_train)
    for label in labels:
        count = (y_train.iloc[:, 0] == label).sum()
        label_probs[label] = (count + ALPHA) / (total_rows + ALPHA * len(labels))

        label_indices = y_train[y_train.iloc[:, 0] == label].index
        label_X = X_train.loc[label_indices]

        for attr in attributes:
            if _is_continous(X_train[attr]):
                mean = label_X[attr].mean()
                std = max(label_X[attr].std(ddof=0), 1e-3)
                mean_store[(attr, label)] = mean
                std_store[(attr, label)] = std
            else:
                value_counts = label_X[attr].value_counts()
                total = len(label_X)
                for val, count in value_counts.items():
                    cond_probs[(attr, val, label)] = (count + ALPHA) / (total + ALPHA * len(value_counts))

    # Predict
    for _, test_row in X_test.iterrows():
        class_probs = {}
        for label in labels:
            prob = label_probs[label]
            for attr in attributes:
                val = test_row[attr]
                if _is_continous(X_train[attr]):
                    mean = mean_store.get((attr, label), 0.0)
                    std = std_store.get((attr, label), 1e-3)
                    prob *= gaussian(val, mean, std)
                else:
                    prob *= cond_probs.get((attr, val, label), ALPHA)
            class_probs[label] = prob

        # Choose label with highest posterior probability
        max_label = max(class_probs, key=class_probs.get)
        y_preds.append(str(max_label))

    print("Predictions Length:", len(y_preds))
    return y_preds


In [26]:
y_preds = naive_bayes_classifier_preds(X_train, y_train, X_test)

Predictions Length: 62


In [27]:
def __classification_report(y_true, y_pred):
    # Ensure y_true is a Series
    if isinstance(y_true, pd.DataFrame):
        y_true = y_true.iloc[:, 0]

    if len(y_true) != len(y_pred):
        raise ValueError("prediction does not have same number of tuples as the true value set")
    
    labels = pd.Series(y_true).unique()
    
    for label in labels:
        P = N = TP = FP = TN = FN = 0
        
        for i in range(len(y_pred)):
            true_label = y_true.iloc[i] if hasattr(y_true, 'iloc') else y_true[i]
            pred_label = y_pred[i]
            
            if true_label == label:
                P += 1
            else:
                N += 1
                
            if true_label == label and pred_label == label:
                TP += 1
            elif true_label == label and pred_label != label:
                FN += 1
            elif true_label != label and pred_label == label:
                FP += 1
            elif true_label != label and pred_label != label:
                TN += 1
        
        accuracy = (TP + TN) / (P + N) if (P + N) > 0 else 0.0
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
        f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
        sensitivity = TP / P if P > 0 else 0.0
        specificity = TN / N if N > 0 else 0.0
        support = P + N

        print(f"Label: {label}")
        print(f"  Accuracy   : {accuracy:.2f}")
        print(f"  Precision  : {precision:.2f}")
        print(f"  Recall     : {recall:.2f}")
        print(f"  F1 Score   : {f1_score:.2f}")
        print(f"  Sensitivity: {sensitivity:.2f}")
        print(f"  Specificity: {specificity:.2f}")
        print(f"  Support    : {support}")
        print("-" * 30)
        
    #overall accuracy
    
    mathced = 0
    for i in range(len(y_pred)):
        true_label = y_true.iloc[i] if hasattr(y_true, 'iloc') else y_true[i]
        pred_label = y_pred[i]
        
        if true_label == pred_label:
            mathced += 1
    
    overall_accuracy = mathced / len(y_pred) if len(y_pred) > 0 else 0.0
    print(f"Overall Accuracy: {overall_accuracy:.2f}")


In [29]:
from sklearn.metrics import classification_report

# Convert to string labels (optional)
y_test = [str(y) for y in y_test]
y_preds = [str(y) for y in y_preds]

# Generate report
report = classification_report(y_test, y_preds)

print("Classification Report:")
print(report)


Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.88      0.81        33
           1       0.15      0.18      0.17        11
           2       0.20      0.12      0.15         8
           3       0.20      0.14      0.17         7
           4       0.00      0.00      0.00         3

    accuracy                           0.53        62
   macro avg       0.26      0.27      0.26        62
weighted avg       0.47      0.53      0.50        62



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [30]:
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder


X_sk = df.data.features
y_sk = df.data.targets

encoder = OrdinalEncoder()
X_encoded = encoder.fit_transform(X_sk)

# print(f"X_encoded {X_encoded}")

X_train_sk, X_test_sk, y_train_sk, y_test_sk = train_test_split(X_encoded, y_sk , test_size=0.2, random_state=42)

# print("X_train", X_train_sk)

model = CategoricalNB()  # Use CategoricalNB for categorical data
# model = GaussianNB()  # Use GaussianNB for numerical data

model.fit(X_train_sk, y_train_sk)

y_preds_sklearn = model.predict(X_test_sk)

report_sklearn = classification_report(y_test_sk, y_preds_sklearn)

print("Classification Report (sklearn):")
print(report_sklearn)

ValueError: Input X contains NaN.
CategoricalNB does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [32]:
import pandas as pd

# Convert y_test_sk to a DataFrame
y_test = pd.DataFrame(y_test_sk.values.reshape(-1, 1), columns=['buys_computer'])

# Create a comparison list
ys = [[y_test.iloc[i, 0], y_preds[i], y_preds_sklearn[i]] for i in range(len(y_test))]

# Print comparison
print("Comparison of Predictions:")
for i, (actual, naive_bayes_pred, sklearn_pred) in enumerate(ys):
    print(f"Sample {i+1}: Actual: {actual}, Naive Bayes: {naive_bayes_pred}, sklearn: {sklearn_pred}")


NameError: name 'y_preds_sklearn' is not defined

In [None]:
# train_test_split??