In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math


def accuracy_score(y_true, y_pred):
    return round(float(sum(y_pred == y_true)) / float(len(y_true)) * 100, 2)


def pre_processing(df):
    X = df.drop([df.columns[-1]], axis=1)
    y = df[df.columns[-1]]
    return X, y


def train_naive_bayes(X, y):
    features = list(X.columns)
    train_size = X.shape[0]
    num_feats = X.shape[1]
    likelihoods = {}
    class_priors = {}
    pred_priors = {}

    for feature in features:
        likelihoods[feature] = {}
        pred_priors[feature] = {}

        for feat_val in np.unique(X[feature]):
            pred_priors[feature].update({feat_val: 0})

            for outcome in np.unique(y):
                likelihoods[feature].update({feat_val + '_' + outcome: 0})
                class_priors.update({outcome: 0})

    class_priors = calc_class_prior(y, class_priors, train_size)
    likelihoods = calc_likelihoods(X, y, likelihoods, features)
    pred_priors = calc_predictor_prior(X, pred_priors, features, train_size)

    return features, likelihoods, class_priors, pred_priors


def calc_class_prior(y, class_priors, train_size):
    for outcome in np.unique(y):
        outcome_count = sum(y == outcome)
        class_priors[outcome] = outcome_count / train_size
    return class_priors


def calc_likelihoods(X, y, likelihoods, features):
    for feature in features:
        for outcome in np.unique(y):
            outcome_count = sum(y == outcome)
            feat_likelihood = X[feature][y[y == outcome].index.values.tolist(
            )].value_counts().to_dict()

            for feat_val, count in feat_likelihood.items():
                likelihoods[feature][feat_val + '_' +
                                     outcome] = count / outcome_count
    return likelihoods


def calc_predictor_prior(X, pred_priors, features, train_size):
    for feature in features:
        feat_vals = X[feature].value_counts().to_dict()

        for feat_val, count in feat_vals.items():
            pred_priors[feature][feat_val] = count / train_size
    return pred_priors


def predict_naive_bayes(X_test, features, likelihoods, class_priors, pred_priors):
    results = []
    for query in X_test:
        probs_outcome = {}
        for outcome in class_priors.keys():
            prior = class_priors[outcome]
            likelihood = 1
            evidence = 1

            for feat, feat_val in zip(features, query):
                likelihood *= likelihoods[feat].get(
                    feat_val + '_' + outcome, 1)
                evidence *= pred_priors[feat].get(feat_val, 1)

            posterior = (likelihood * prior) / (evidence)
            probs_outcome[outcome] = posterior

        result = max(probs_outcome, key=probs_outcome.get)
        results.append(result)

    return np.array(results)

In [6]:
if __name__ == "__main__":
    # Assuming df is your DataFrame loaded from a dataset, for example:
    # df = pd.read_csv("path_to_your_dataset.csv")
    # For demonstration, let's assume df is already loaded with appropriate data
    df = pd.read_csv("nb.csv")
    # Pre-process the data to separate features and target
    X, y = pre_processing(df)

    # Train the Naive Bayes model
    features, likelihoods, class_priors, pred_priors = train_naive_bayes(X, y)

    # Convert X to a format suitable for prediction (if not already in such format)
    # Assuming X is the training data; replace with actual test data as needed
    X_test = np.array(X)

    # Predict the classes for the test data
    y_pred = predict_naive_bayes(
        X_test, features, likelihoods, class_priors, pred_priors)

    # Calculate and print the accuracy
    print("Accuracy:", accuracy_score(y, y_pred))

    # Example queries
    # Note: The query should be in the same format as your training data features
    query1 = np.array([['Rain', 'Mild', 'Normal', 'Strong']])
    query2 = np.array([['Overcast', 'Cool', 'Normal', 'Strong']])
    query3 = np.array([['Sunny', 'Hot', 'High', 'Strong']])

    # Predict and print the results for the queries
    print("Query 1 Prediction:", predict_naive_bayes(
        query1, features, likelihoods, class_priors, pred_priors))
    print("Query 2 Prediction:", predict_naive_bayes(
        query2, features, likelihoods, class_priors, pred_priors))
    print("Query 3 Prediction:", predict_naive_bayes(
        query3, features, likelihoods, class_priors, pred_priors))

Accuracy: 100.0
Query 1 Prediction: ['Yes']
Query 2 Prediction: ['Yes']
Query 3 Prediction: ['Yes']


In [11]:
import numpy as np
import pandas as pd

# Define accuracy score function


def accuracy_score(y_true, y_pred):
    return round(float(sum(y_pred == y_true)) / float(len(y_true)) * 100, 2)

# Pre-processing data


def pre_processing(df):
    X = df.drop([df.columns[-1]], axis=1)
    y = df[df.columns[-1]]
    return X, y

# Calculate class prior probabilities


def calc_class_prior(y):
    class_priors = {}
    unique_classes = np.unique(y)
    for outcome in unique_classes:
        outcome_count = sum(y == outcome)
        class_priors[outcome] = outcome_count / len(y)
    return class_priors

# Calculate likelihoods


def calc_likelihoods(X, y):
    likelihoods = {}
    features = X.columns
    for feature in features:
        likelihoods[feature] = {}
        for feat_val in np.unique(X[feature]):
            for outcome in np.unique(y):
                likelihoods[feature].update({feat_val + '_' + outcome: 0})

    for feature in features:
        for outcome in np.unique(y):
            outcome_count = sum(y == outcome)
            feat_likelihood = X[feature][y[y == outcome].index.values.tolist(
            )].value_counts().to_dict()

            for feat_val, count in feat_likelihood.items():
                likelihoods[feature][feat_val + '_' +
                                     outcome] = count / outcome_count
    return likelihoods

# Calculate predictor prior probabilities


def calc_predictor_prior(X):
    pred_priors = {}
    features = X.columns
    for feature in features:
        pred_priors[feature] = {}
        feat_vals = X[feature].value_counts().to_dict()
        for feat_val, count in feat_vals.items():
            pred_priors[feature][feat_val] = count / len(X)
    return pred_priors

# Predict function


def predict(X, features, class_priors, likelihoods, pred_priors):
    results = []
    for query in np.array(X):
        probs_outcome = {}
        for outcome in class_priors.keys():
            prior = class_priors[outcome]
            likelihood = 1
            evidence = 1

            for feat, feat_val in zip(features, query):
                likelihood *= likelihoods[feat].get(
                    feat_val + '_' + outcome, 1)
                evidence *= pred_priors[feat].get(feat_val, 1)

            posterior = (likelihood * prior) / evidence
            probs_outcome[outcome] = posterior

        result = max(probs_outcome, key=lambda x: probs_outcome[x])
        results.append(result)

    return np.array(results)


# Main execution block
if __name__ == "__main__":
    # Assuming weather dataset is loaded correctly
    # Update the path according to your dataset location
    df = pd.read_csv("nb.csv")
    df=df.drop(['ID'],axis=1)
    X, y = pre_processing(df)

    features = list(X.columns)
    class_priors = calc_class_prior(y)
    likelihoods = calc_likelihoods(X, y)
    pred_priors = calc_predictor_prior(X)

    y_pred = predict(X, features, class_priors, likelihoods, pred_priors)
    print("Train Accuracy: {}".format(accuracy_score(y, y_pred)))

    query1 = pd.DataFrame(
        [['Rain', 'Mild', 'Normal', 'Strong']], columns=features)
    print("Query 1: {} ---> {}".format(query1.values,
          predict(query1, features, class_priors, likelihoods, pred_priors)))
    query1 = pd.DataFrame(
        [['Overcast', 'Cool', 'Normal', 'Strong']], columns=features)
    print("Query 2: {} ---> {}".format(query1.values,
          predict(query2, features, class_priors, likelihoods, pred_priors)))
    query1 = pd.DataFrame(
        [['Sunny', 'Hot', 'High', 'Strong']], columns=features)
    print("Query 3: {} ---> {}".format(query1.values,
          predict(query3, features, class_priors, likelihoods, pred_priors)))

    

Train Accuracy: 92.86
Query 1: [['Rain' 'Mild' 'Normal' 'Strong']] ---> ['Yes']
Query 2: [['Overcast' 'Cool' 'Normal' 'Strong']] ---> ['Yes']
Query 3: [['Sunny' 'Hot' 'High' 'Strong']] ---> ['No']


In [None]:

        result = max(probs_outcome, key=lambda x: probs_outcome[x])
        results.append(result)

    return np.array(results)


# Main execution block
if __name__ == "__main__":
    # Assuming weather dataset is loaded correctly
    # Update the path according to your dataset location
    df = pd.read_csv("nb.csv")
    df=df.drop(['ID'],axis=1)
    X, y = pre_processing(df)

    features = list(X.columns)
    class_priors = calc_class_prior(y)
    likelihoods = calc_likelihoods(X, y)
    pred_priors = calc_predictor_prior(X)

    y_pred = predict(X, features, class_priors, likelihoods, pred_priors)
    print("Train Accuracy: {}".format(accuracy_score(y, y_pred)))

    query1 = pd.DataFrame(
        [['Rain', 'Mild', 'Normal', 'Strong']], columns=features)
    print("Query 1: {} ---> {}".format(query1.values,
          predict(query1, features, class_priors, likelihoods, pred_priors)))
    query1 = pd.DataFrame(
        [['Overcast', 'Cool', 'Normal', 'Strong']], columns=features)
    print("Query 2: {} ---> {}".format(query1.values,
          predict(query2, features, class_priors, likelihoods, pred_priors)))
    query1 = pd.DataFrame(
        [['Sunny', 'Hot', 'High', 'Strong']], columns=features)
    print("Query 3: {} ---> {}".format(query1.values,
          predict(query3, features, class_priors, likelihoods, pred_priors)))

    

In [4]:
df=pd.read_csv('nb.csv')
df.head()

Unnamed: 0,ID,Outlook,Temperature,Humidity,Wind,Play Ball
0,D1,Sunny,Hot,High,Weak,No
1,D2,Sunny,Hot,High,Strong,No
2,D3,Overcast,Hot,High,Weak,Yes
3,D4,Rain,Mild,High,Weak,Yes
4,D5,Rain,Cool,Normal,Weak,Yes


In [8]:
X=df.drop(['ID','Play Ball'],axis=1)
Y=df['Play Ball']

Find P(Ci) of both classes// Prior Probability of Classes P(y)


In [13]:
def calc_class_prior():
    total=len(Y)
    class_counts=df['Play Ball'].value_counts()
    prior_prob=class_counts/total
    prior_prob.to_dict()
    return prior_prob

In [15]:
 class_priors = calc_class_prior()
 print(class_priors)

Yes    0.642857
No     0.357143
Name: Play Ball, dtype: float64


Find P(X/Ci) for all features /Likelihood table for all features

In [None]:
def calc_likelihoods():
    likelihoods={}
    for col in X.columns:
        likelihoods[col]={}
        unique_vals=X[col].unique()
        classes=Y.unique()

        for unique in unique_vals:
            for cls in classes:
                num=((X[col]==unique)&(Y==cls)).sum()
                denom=(Y==cls).sum()
                likelihood=num/denom if denom>0 else 0
                likelihoods[col][f"{value}|{cls}"] = likelihood

   return likelihoods
    

In [19]:
likelihoods=calc_likelihoods()
print(likelihoods)

<class 'NameError'>: name 'likelikhoods' is not defined