In [None]:
import pandas as pd
import numpy as np

### Create dummy dataset


In [None]:
def generate_random_dataset(filename, length=15, seed=None):
    """Write a random categorical employee dataset to a CSV file.

    Each row holds a sequential ``empid`` (1..length) followed by six
    categorical columns. Every categorical column is sampled independently
    with ``choice`` from its own fixed list of allowed values.

    Parameters
    ----------
    filename : str or path-like
        Destination handed to ``DataFrame.to_csv``; the index is not written.
    length : int, default 15
        Number of rows to generate.
    seed : int or None, default None
        When given, sampling uses a private ``np.random.RandomState(seed)`` so
        the same seed always yields the same file. When ``None`` the global
        ``np.random`` state is used, which is the previous behaviour.
    """
    # Column name -> allowed values. Insertion order fixes both the CSV column
    # order and the order in which random draws are made.
    categorical_values = {
        'department': ['HR', 'Marketing', 'Finance', 'IT'],
        'job role': ['HR', 'Marketing', 'Finance', 'Operations'],
        'overtime': ['Yes', 'No'],
        'performance': ['Excellent', 'Good', 'Bad'],
        'attrition': ['Yes', 'No'],
        'job satisfaction': ['Very Satisfied', 'Satisfied', 'Neutral', 'Dissatisfied'],
    }

    # A private generator keeps seeded runs reproducible without disturbing
    # the global NumPy random state used elsewhere in the notebook.
    rng = np.random if seed is None else np.random.RandomState(seed)

    empids = np.arange(1, 1 + length)
    data = {'empid': empids}
    for column, allowed in categorical_values.items():
        data[column] = rng.choice(allowed, len(empids))

    pd.DataFrame(data).to_csv(filename, index=False)


# Uncomment to (re)generate data.csv, the file the driver code below reads:
# generate_random_dataset('data.csv', 20)

### Naive Bayes Classifier

In [265]:
class NaiveBayesClassifier:
    """Naive Bayes classifier for categorical features with Laplace smoothing.

    ``fit`` estimates, from counts, the prior probability of every class and
    the smoothed conditional probability of every feature value given each
    class. ``predict`` multiplies the prior by the per-feature conditionals
    and returns the class with the largest product.

    Attributes
    ----------
    prior_probabilities : dict
        class -> P(class).
    conditional_probabilities : list of dict
        One entry per feature: class -> value -> P(value | class).
    features : int
        Number of feature columns seen by ``fit``.
    x, y : array
        The training data most recently passed to ``fit``.
    laplace_correction : number
        Pseudo-count added to every (value, class) count.
    debug : bool
        When true, ``fit`` prints the probabilities it computes.
    """

    def __init__(self, laplace_correction=1, debug=False):
        """Create an unfitted classifier.

        Parameters
        ----------
        laplace_correction : number, default 1
            Pseudo-count added to each value count (0 disables smoothing).
        debug : bool, default False
            Print intermediate probabilities while fitting.
        """
        # Containers start with the same types fit() produces, so helpers such
        # as print_probabilities_table() also work on an unfitted instance.
        self.prior_probabilities = {}
        self.conditional_probabilities = []
        # Per feature: class -> probability used for a value never seen in fit().
        self._unseen_probabilities = []
        self.features = 0
        self.x = []
        self.y = []
        self.laplace_correction = laplace_correction
        self.debug = debug

    def fit(self, x, y):
        """Estimate class priors and per-feature conditional probabilities.

        Parameters
        ----------
        x : 2-D array-like
            Training samples; rows are samples, columns are categorical features.
        y : array-like
            One label per sample, either flat or as an (n, 1) column.

        Raises
        ------
        ValueError
            If the training data is empty or ``x`` and ``y`` differ in length.
        """
        # Column slicing below needs ndarrays. dtype=object keeps the original
        # Python values so the learned dictionary keys match what predict() sees.
        if not isinstance(x, np.ndarray):
            x = np.asarray(x, dtype=object)
        if not isinstance(y, np.ndarray):
            y = np.asarray(y, dtype=object)
        if len(x) == 0 or len(y) == 0:
            raise ValueError("cannot fit on an empty dataset")
        if len(x) != len(y):
            raise ValueError(
                f"x and y must have the same number of rows, got {len(x)} and {len(y)}")

        self.x = x
        self.y = y

        classes, class_counts = np.unique(y, return_counts=True)
        self.prior_probabilities = {
            cond: count / len(y) for cond, count in zip(classes, class_counts)
        }

        # Row indices of each class do not depend on the feature, so compute once.
        class_rows = [np.where(y == cond)[0] for cond in classes]

        self.conditional_probabilities = []
        self._unseen_probabilities = []
        self.features = len(x[0])
        for i in range(self.features):

            if self.debug:
                print(f'\nFeature {i}')

            feature_probability = {}
            unseen_probability = {}
            unique_values = np.unique(x[:, i])
            for cond, cond_indices in zip(classes, class_rows):
                cond_count = len(cond_indices)
                # Smoothed denominator: class size plus one pseudo-count per known value.
                denominator = cond_count + len(unique_values) * self.laplace_correction

                feature_probability[cond] = {}
                for val in unique_values:
                    cond_with_feature_count = len(np.where(x[cond_indices, i] == val)[0])
                    prob_with_laplace = (cond_with_feature_count + self.laplace_correction) / denominator

                    if self.debug:
                        prob_without_laplace = (cond_with_feature_count) / (cond_count)
                        print(f"\t - {val} | {cond} : [{prob_without_laplace} == {prob_with_laplace}]")

                    feature_probability[cond][val] = prob_with_laplace

                # A value never observed in training is scored as a zero-count
                # value under the same denominator instead of raising KeyError.
                unseen_probability[cond] = self.laplace_correction / denominator

            self.conditional_probabilities.append(feature_probability)
            self._unseen_probabilities.append(unseen_probability)

        if self.debug:
            print("\n\n")
            print('Prior Probabilities', self.prior_probabilities)
            print('Conditional Probabilities', self.conditional_probabilities)

    def predict(self, x):
        """Predict a label for every row of ``x``.

        Returns
        -------
        list of list
            One single-element list ``[label]`` per input row, matching the
            (n, 1) label layout used for training in this notebook.
        """
        return [self.__predict(row) for row in x]

    def print_probabilities_table(self):
        """Print the class priors, then the conditional table of every feature."""
        print("Prior Probabilities")
        for (cond, prob) in self.prior_probabilities.items():
            print(f"{cond} : {prob}")

        print("\nConditional Probabilities")
        for i in range(self.features):
            print(f"Feature {i+1}")
            for (cond, prob) in self.conditional_probabilities[i].items():
                print(f"{cond} : {prob}")
            print("\n")

    def __predict(self, x):
        """Return ``[label]`` for one sample ``x`` (a sequence of feature values).

        Raises
        ------
        ValueError
            If the classifier has not been fitted.
        """
        if not self.prior_probabilities:
            raise ValueError("fit() must be called before predict()")

        results = {}
        for cond, prior in self.prior_probabilities.items():
            score = prior
            for j in range(self.features):
                known = self.conditional_probabilities[j][cond]
                # Fall back to the smoothed zero-count probability for values
                # that did not occur in this feature during training.
                score *= known.get(x[j], self._unseen_probabilities[j][cond])
            results[cond] = score

        return [max(results, key=results.get)]


### Helper Functions

In [266]:
def train_test_split(X, Y, frac, seed=None):
    """Randomly split paired arrays into train and test parts.

    Parameters
    ----------
    X, Y : numpy.ndarray
        Arrays with the same number of rows; both are indexed with the same
        shuffled row indices so samples stay aligned with their labels.
    frac : float
        Fraction of rows (0..1) placed in the test part; the test size is
        ``int(len(Y) * frac)``, i.e. rounded down.
    seed : int or None, default None
        When given, the shuffle uses a private ``np.random.RandomState(seed)``
        and is reproducible. When ``None`` the global ``np.random`` state is
        used, which is the previous behaviour.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)``.

    Raises
    ------
    ValueError
        If ``frac`` is outside [0, 1] or ``X`` and ``Y`` differ in length.
    """
    # A negative frac would turn the slices below into "all but the last k",
    # silently swapping the roles of the two parts; reject it up front.
    if not 0 <= frac <= 1:
        raise ValueError(f"frac must be between 0 and 1, got {frac}")
    if len(X) != len(Y):
        raise ValueError(
            f"X and Y must have the same number of rows, got {len(X)} and {len(Y)}")

    rng = np.random if seed is None else np.random.RandomState(seed)

    num_test = int(len(Y) * frac)
    indices = rng.permutation(len(Y))

    test_indices = indices[:num_test]
    train_indices = indices[num_test:]

    return X[train_indices], X[test_indices], Y[train_indices], Y[test_indices]

def accuracy_score(Y_true, Y_pred):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    Y_true, Y_pred : array-like
        True and predicted labels. Each may be flat or an (n, 1) column; both
        are flattened before comparison.

    Returns
    -------
    float
        Number of matching positions divided by the number of labels.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of labels.
    """
    # Flatten first: comparing an (n, 1) array with an (n,) array would
    # broadcast to (n, n), and comparing two plain lists yields a single bool.
    y_true = np.asarray(Y_true).ravel()
    y_pred = np.asarray(Y_pred).ravel()
    if y_true.size != y_pred.size:
        raise ValueError(
            f"Y_true and Y_pred must have the same number of labels, "
            f"got {y_true.size} and {y_pred.size}")
    return np.sum(y_true == y_pred) / y_true.size


### Driver Code

In [267]:
# Load the employee dataset from data.csv in the working directory.
df = pd.read_csv('data.csv')
# Last expression of the cell: the notebook renders the first rows as a preview.
df.head()


Unnamed: 0,empid,department,job role,overtime,performance,attrition,job satisfaction
0,1,Finance,Finance,Yes,Bad,No,Neutral
1,2,Marketing,Operations,Yes,Bad,Yes,Very Satisfied
2,3,HR,HR,No,Bad,No,Neutral
3,4,Marketing,Finance,Yes,Bad,Yes,Satisfied
4,5,Marketing,Finance,No,Good,Yes,Dissatisfied


In [268]:
# Features: every column except the first (empid in the preview above) and the last.
X = df.iloc[:, 1:-1].values
# Labels: the last column (job satisfaction), reshaped to an (n, 1) column vector.
Y = df.iloc[:, -1].values.reshape(-1, 1)
# Sanity check: show the first feature row next to its label.
print(X[0], Y[0])

['Finance' 'Finance' 'Yes' 'Bad' 'No'] ['Neutral']


In [269]:
# Seed NumPy's global RNG so the shuffle inside train_test_split, and with it
# every probability and accuracy figure below, is identical on each
# Restart & Run All.
np.random.seed(42)
# Hold out 20% of the rows for testing.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, 0.2)
print(f'Train Size: {len(X_train)}')
print(f'Test Size: {len(X_test)}')

Train Size: 16
Test Size: 4


### Fit the model

In [270]:
# debug=True makes fit() print, for every feature value and class, the raw and
# the Laplace-smoothed conditional probability, followed by the full tables.
clf = NaiveBayesClassifier(laplace_correction=1, debug=True)
clf.fit(X_train, Y_train)


Feature 0
	 - Finance | Dissatisfied : [0.0 == 0.14285714285714285]
	 - HR | Dissatisfied : [0.3333333333333333 == 0.2857142857142857]
	 - IT | Dissatisfied : [0.0 == 0.14285714285714285]
	 - Marketing | Dissatisfied : [0.6666666666666666 == 0.42857142857142855]
	 - Finance | Neutral : [0.25 == 0.25]
	 - HR | Neutral : [0.5 == 0.375]
	 - IT | Neutral : [0.25 == 0.25]
	 - Marketing | Neutral : [0.0 == 0.125]
	 - Finance | Satisfied : [0.16666666666666666 == 0.2]
	 - HR | Satisfied : [0.3333333333333333 == 0.3]
	 - IT | Satisfied : [0.16666666666666666 == 0.2]
	 - Marketing | Satisfied : [0.3333333333333333 == 0.3]
	 - Finance | Very Satisfied : [0.6666666666666666 == 0.42857142857142855]
	 - HR | Very Satisfied : [0.0 == 0.14285714285714285]
	 - IT | Very Satisfied : [0.0 == 0.14285714285714285]
	 - Marketing | Very Satisfied : [0.3333333333333333 == 0.2857142857142857]

Feature 1
	 - Finance | Dissatisfied : [0.6666666666666666 == 0.42857142857142855]
	 - HR | Dissatisfied : [0.0 == 0

In [None]:
# Print the fitted class priors and, per feature, the conditional probabilities of each class.
clf.print_probabilities_table()

### Test the model

In [None]:
# predict() returns one single-element list per test row.
y_pred = clf.predict(X_test)
# Unwrap the single-element rows so predictions and true labels print as flat lists.
print(list(y[0] for y in y_pred))
print(list(y[0] for y in Y_test))
# Share of test rows whose predicted label equals the true label.
acc = accuracy_score(Y_test, y_pred)
print(f'Accuracy: {acc}')


### Make Predictions

In [None]:
# One hand-written sample with five values, in the same column order as X
# (department, job role, overtime, performance, attrition per the preview above).
X_IN = np.array(['HR', 'HR', 'Yes', 'Excellent', 'No'])
# predict() iterates over rows, so the single sample is wrapped in a list.
print(clf.predict([X_IN]))


In [None]:
# Re-prints the accuracy using Y_test and y_pred from the "Test the model" cell.
# This repeats that cell's final two lines, so that cell must have been run first.
acc = accuracy_score(Y_test, y_pred)
print(f'Accuracy: {acc}')