In [55]:
import numpy as np
import pandas as pd
import json
from os.path import exists

In [269]:
class NaiveBayes:
  def __init__(self, ngram = 1):
    self.ngram = ngram
    self.dataset = np.empty(0)
    self.words = np.empty(0)
    self.labels = np.empty(0)
    self.totalData = np.empty(0)
    self.frequentWord = np.empty(0)
    self.totalWord = np.empty(0)
    self.probabilityWord = np.empty(0)
    self.probabilityLabel = np.empty(0)
    self.model = np.empty(0)
  
  def addDataset(self, path):
    if exists(path):
      self.dataset = np.append(self.dataset, path)
    else:
      raise Exception('Dataset not found')
  
  def processDataset(self):
    for item in self.dataset:
      self.extractDataset(item)
  
  def extractDataset(self, path):
    data = pd.read_csv(path)
    data = pd.DataFrame(data)
    words = {}
    totalData = {}

    for index, row in data.iterrows():
      self.labels = np.append(self.labels, row[0])
      words[row[0].lower()] = words[row[0].lower()] + ' ' + row[1].lower() if row[0].lower() in words else row[1].lower()
    
    for label in self.labels:
      totalData[label] = int(data.groupby(['label']).size()[label])

    totalData['__total__'] = int(data.shape[0])

    self.labels = np.unique(self.labels)
    self.words = words
    self.totalData = totalData

    self.getFrequentData()
  
  def getFrequentData(self):
    frequentData = {}
    totalWord = {}
    
    for item in self.labels:
      frequentData[item] = {}
      for word in self.words[item].split():
        frequentData[item][word] = frequentData[item][word] + 1 if word in frequentData[item] else 1
      totalWord[item] = len(frequentData[item])
    
    self.frequentWord = frequentData
    self.totalWord = totalWord
    self.processWordAccumulative()

  def processWordAccumulative(self):
    for label in self.labels:
      for word in self.words[label].split():
        for checkLabel in self.labels:
          if word not in self.frequentWord[checkLabel]:
            self.frequentWord[checkLabel][word] = 0
    
    for label in self.labels:
      for word in self.frequentWord[label]:
        self.frequentWord[label][word] = self.frequentWord[label][word] + 1
    
    self.defineProbabilities()
  
  def defineProbabilities(self):
    probability = {}
    labelProbability = {}
    for label in self.labels:
      probability[label] = {}
      labelProbability[label] = self.countProbability(self.totalData[label], self.totalData['__total__'])
      for key, value in self.frequentWord[label].items():
        probability[label][key] = self.countProbability(value, self.totalWord[label])
    
    self.probabilityWord = probability
    self.probabilityLabel = labelProbability
  
  def saveModel(self, path, filename):
    data = {
      'totalData': self.totalData,
      'labels': dict(enumerate(self.labels.flatten(), 0)),
      'originalWord': self.words,
      'frequent': self.frequentWord,
      'probability': self.probabilityWord,
      'totalProbability': self.probabilityLabel,
      'totalWord': self.totalWord,
    }

    with open('./' + path + '/' + filename + '.json', 'w') as file:
      json.dump(data, file)

    print('Model berhasil disimpan di ' + path + '/' + filename + '.json')
  
  def loadModel(self, path):
    if not exists(path):
      raise Exception('Model not found')

    with open(path, 'r') as file:
      data = json.load(file)
      self.model = data
    
  def countProbability(self, frequent, total):
    return frequent / total
  
  def classify(self, text):
    detail = {}
    total = {}
    text = text.split()

    for index, label in self.model['labels'].items():
      detail[label] = {}
      total[label] = 1
      for word in text:
        detail[label][word] = self.model['probability'][label][word] if word in self.model['probability'][label] else 0
        total[label] = total[label] * detail[label][word]
      total[label] = total[label] * self.model['totalProbability'][label]
    
    ordering = ((value, key) for (key, value) in total.items())
    sortedOrdering = sorted(ordering, reverse=True)
    result = {k: v for v, k in sortedOrdering}
    
    return result, detail

  def debug(self):
    print('NGRAM')
    print(self.ngram)
    print('Dataset')
    print(self.dataset)
    print('Labels')
    print(self.labels)
    print('Words')
    print(self.words)
    print('Frequent word')
    print(self.frequentWord)

In [273]:
nb = NaiveBayes()
nb.addDataset('./../dataset/emot.csv')
nb.processDataset()
nb.saveModel('models', 'datamodel')
# nb.debug()

Model berhasil disimpan di models/datamodel.json


In [276]:
ts = NaiveBayes()
ts.loadModel('./models/datamodel.json')
result, detail = ts.classify('ini kok susah sekali ya')
print(result)
print(detail)


{'anger': 1.585988810930055e-12, 'sadness': 1.3497802752546083e-12, 'love': 5.381588066222028e-13, 'fear': 3.6097228682985025e-13, 'happy': 1.4040373788515135e-13}
{'anger': {'ini': 0.02176529588766299, 'kok': 0.008525576730190571, 'susah': 0.0011033099297893681, 'sekali': 0.001905717151454363, 'ya': 0.01624874623871615}, 'fear': {'ini': 0.022429316063812984, 'kok': 0.003632917390617596, 'susah': 0.001105670510187964, 'sekali': 0.001737482230295372, 'ya': 0.015637340072658348}, 'happy': {'ini': 0.02357358923542297, 'kok': 0.0014603108375925732, 'susah': 0.0008344633357671847, 'sekali': 0.0018775425054761655, 'ya': 0.011265255032856993}, 'love': {'ini': 0.017509025270758122, 'kok': 0.0034296028880866428, 'susah': 0.001263537906137184, 'sekali': 0.002888086642599278, 'ya': 0.01696750902527076}, 'sadness': {'ini': 0.020925597874224978, 'kok': 0.002435783879539415, 'susah': 0.002435783879539415, 'sekali': 0.003210806023029229, 'ya': 0.014946855624446413}}
