<a href="https://colab.research.google.com/github/hahajjjun/Machine_Learning_Toy_Projects/blob/main/Project_4_TGILAB_Naive_Bayes_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Setting.py

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import random
from sklearn.datasets import load_iris
# Load the Iris feature matrix (X) and the target labels (y).
X, y = load_iris(return_X_y=True)
# Build one frame: the four named feature columns followed by the label column.
iris_df = pd.DataFrame(X, columns=['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width'])
iris_df['output'] = y

### Split2Separate.py

In [3]:
# Train/Test Split
def splitDataset(dataset, splitRatio):
  """Randomly split a DataFrame into a training part and a test part.

  Rows are selected purely by position, so the split works for any index
  (non-default, non-contiguous, or containing duplicate labels).

  Args:
    dataset: pandas DataFrame to split.
    splitRatio: fraction of rows assigned to the training set; the training
      size is int(len(dataset) * splitRatio).

  Returns:
    (trainSet, testSet): trainSet holds the sampled rows in sampled order,
    testSet holds every remaining row in original order.

  Raises:
    ValueError: from random.sample when the training size is negative or
      larger than the number of rows.
  """
  trainSize = int(len(dataset) * splitRatio)
  trainPositions = random.sample(range(len(dataset)), trainSize)
  # Build the complement by position as well. DataFrame.drop would interpret
  # these integers as index LABELS, which only coincides with positions when
  # the index happens to be 0..n-1.
  chosen = set(trainPositions)
  testPositions = [pos for pos in range(len(dataset)) if pos not in chosen]
  trainSet = dataset.iloc[trainPositions]
  testSet = dataset.iloc[testPositions]
  return trainSet, testSet

In [4]:
def separate_by_class(dataset):
  """Group the rows of a DataFrame by the value in its last column.

  Args:
    dataset: pandas DataFrame whose last column holds the class label.

  Returns:
    dict mapping each distinct label to the list of rows (pandas Series, label
    entry included) that carry that label, in original row order.
  """
  separated = dict()
  for i in range(len(dataset)):
    vector = dataset.iloc[i, :]
    # Read the label by position. vector[-1] is a label-based lookup: it raises
    # KeyError when the column names are integers and relies on a deprecated
    # positional fallback when they are strings.
    target = vector.iloc[-1]
    separated.setdefault(target, []).append(vector)
  return separated

In [5]:
# Seed the RNG so the random 80/20 split (and therefore the accuracy reported
# further down) is identical on every Restart & Run All.
random.seed(42)
train_df, test_df = splitDataset(iris_df, 0.8)
# Put both parts back into index order.
train_df = train_df.sort_index(axis = 0)
test_df = test_df.sort_index(axis = 0)

### Stats.py

In [20]:
def mean(numbers):
  """Return the arithmetic mean of a sized collection of numbers."""
  total = sum(numbers)
  count = len(numbers)
  return total / count

def stdev(numbers):
  """Return the sample standard deviation of numbers (divides by n - 1)."""
  center = mean(numbers)
  sumOfSquares = sum((value - center) ** 2 for value in numbers)
  degreesOfFreedom = len(numbers) - 1
  return math.sqrt(sumOfSquares / degreesOfFreedom)

# Python asterisk (*): unpacks container-type data, e.g. zip(*rows) transposes rows into columns

def summarize(dataset, mode = "list"):
  """Compute (mean, stdev) for every column of dataset except the last one.

  The last column is treated as the class label and is dropped BEFORE any
  statistics are computed, so non-numeric labels (e.g. strings) never reach
  the arithmetic.

  Args:
    dataset: with mode="df", a pandas DataFrame; with mode="list", an iterable
      of equal-length rows (lists, tuples or pandas Series).
    mode: "list" (default) or "df"; selects how dataset is traversed.

  Returns:
    list of (mean, stdev) tuples, one per feature column, in column order.
    The list is empty when there is no feature column.

  Raises:
    ValueError: if mode is neither "df" nor "list".
  """
  if mode == "df":
    featureColumns = [dataset[column] for column in dataset.columns[:-1]]
  elif mode == "list":
    # zip(*rows) transposes rows into columns; the slice drops the label column.
    featureColumns = list(zip(*dataset))[:-1]
  else:
    # Fail loudly instead of silently returning None for an unknown mode.
    raise ValueError('mode must be "df" or "list", got {!r}'.format(mode))
  return [(mean(values), stdev(values)) for values in featureColumns]

def summarizeByClass(dataset):
  """Return, for each class label, the (mean, stdev) pairs of its feature columns.

  The rows are first grouped by class; each group is then summarized on its
  own. These per-class statistics are the parameters later plugged into the
  Gaussian density.
  """
  return {
    classValue: summarize(instances, mode = "list")
    for classValue, instances in separate_by_class(dataset).items()
  }

In [48]:
# Create Gaussian PDF
def calculateProbability(x, mean, stdev):
  """Evaluate the Gaussian density with the given mean and stdev at x."""
  squaredDistance = (x - mean) ** 2
  exponent = math.exp(-squaredDistance / (2 * stdev ** 2))
  normalizer = 1 / (math.sqrt(2 * math.pi) * stdev)
  return normalizer * exponent

def calculateClassProbabilities(summaries, inputVector):
  """Compute the class-conditional likelihood of inputVector for every class.

  Under the naive (feature-independence) assumption this is the product of the
  per-feature Gaussian densities, i.e. P(X=x | Y=y). No class prior P(Y=y) is
  multiplied in, so choosing the largest value is a maximum-likelihood
  decision (equivalent to assuming equal priors for all classes).

  Args:
    summaries: dict mapping class label -> list of (mean, stdev) tuples, one
      per feature.
    inputVector: feature values in the same order as the summaries; a list,
      tuple or pandas Series. Entries beyond the summarized features (such as
      a trailing label) are ignored.

  Returns:
    dict mapping class label -> likelihood of inputVector under that class.
  """
  # Read features strictly by position: integer [] on a label-indexed pandas
  # Series is a label lookup (deprecated positional fallback, or KeyError).
  values = inputVector.iloc if hasattr(inputVector, "iloc") else inputVector
  probabilities = {}
  for classValue, classSummaries in summaries.items():
    likelihood = 1
    for i, (featureMean, featureStdev) in enumerate(classSummaries):
      # Naive assumption: multiply the independent per-feature densities.
      likelihood *= calculateProbability(values[i], featureMean, featureStdev)
    probabilities[classValue] = likelihood
  return probabilities

### Run.py

In [81]:
def predict(summaries, inputVector):
  """Return the class label with the highest probability for inputVector.

  Ties go to the class encountered first; None is returned when there are no
  classes at all.
  """
  probabilities = calculateClassProbabilities(summaries, inputVector)
  # probabilities looks like {'class1': 0.01, 'class2': 0.02, ..., 'classN': 0.01};
  # pick the key with the largest value (argmax).
  return max(probabilities, key=probabilities.get, default=None)

In [82]:
def getPredictions(summaries, testSet):
  """Predict a class label for every row of testSet, in row order.

  Each row is fetched by position and handed to predict together with the
  per-class summaries; the results are collected into a list.
  """
  return [predict(summaries, testSet.iloc[rowPos]) for rowPos in range(len(testSet))]

In [83]:
def getAccuracy(testSet, predictions):
  """Return the percentage (0-100) of rows whose true label equals the prediction.

  Args:
    testSet: pandas DataFrame whose last column holds the true class label.
    predictions: sequence of predicted labels, aligned with testSet by row
      position.

  Returns:
    float percentage of correct predictions.

  Raises:
    ZeroDivisionError: if testSet has no rows.
  """
  # Read the label with a purely positional lookup. testSet.iloc[x][-1] treats
  # -1 as a column LABEL: it raises KeyError for integer column names and
  # relies on a deprecated positional fallback for string column names.
  correct = sum(
    1 for x in range(len(testSet)) if testSet.iloc[x, -1] == predictions[x]
  )
  return (correct/float(len(testSet)))*100.0

In [87]:
# Fit: estimate the per-class (mean, stdev) statistics from the training rows.
summaries = summarizeByClass(dataset=train_df)
# Evaluate: classify every held-out row, then score against the true labels.
predictions = getPredictions(summaries=summaries, testSet=test_df)
accuracy = getAccuracy(testSet=test_df, predictions=predictions)

print('Accuracy: {0}%'.format(accuracy))

Accuracy: 96.66666666666667%


In [None]:
# Side-by-side table of the true labels ('answer') and the predicted labels
# ('prediction') for the test rows, keeping the test rows' index.
TFTable = pd.DataFrame({'answer': test_df['output'], 'prediction': predictions})
TFTable