In [55]:
import numpy as np
import pandas as pd
import json
from os.path import exists

In [69]:
class NaiveBayes:
  """Builds per-label word-frequency tables from labelled text datasets.

  Each dataset is a CSV file with a header row whose first column holds the
  label and whose second column holds the text. Labels and text are
  lower-cased, the text of every label is concatenated, and the number of
  occurrences of each whitespace-separated word is counted per label. The
  result can be written to a JSON file with ``saveModel``.

  Attributes:
    ngram: value passed to the constructor; stored but not used by any method
      of this class.
    dataset: numpy array of registered dataset paths.
    words: dict mapping lower-cased label -> all text seen for that label,
      joined with single spaces.
    labels: sorted numpy array of the unique lower-cased labels.
    frequentWord: dict mapping label -> {word: count}; every label's table
      contains every word of the vocabulary (count 0 when never seen).
  """

  def __init__(self, ngram = 1):
    """Create an empty model.

    Args:
      ngram: stored on the instance as ``self.ngram``.
    """
    self.ngram = ngram
    self.dataset = np.empty(0)
    # Dicts from the start (they are dicts after training as well), so an
    # untrained model can still be inspected or saved.
    self.words = {}
    self.labels = np.empty(0)
    self.frequentWord = {}

  def addDataset(self, path):
    """Register a dataset file to be read by ``processDataset``.

    Args:
      path: path of the CSV file.

    Raises:
      FileNotFoundError: if ``path`` does not exist. (It is a subclass of
        ``Exception``, so callers catching ``Exception`` keep working.)
    """
    if not exists(path):
      raise FileNotFoundError('Dataset not found')
    self.dataset = np.append(self.dataset, path)

  def processDataset(self):
    """Rebuild the model from every registered dataset.

    The training state is reset first, so calling this method repeatedly
    never double-counts words.
    """
    self.words = {}
    self.labels = np.empty(0)
    self.frequentWord = {}

    for item in self.dataset:
      self.extractDataset(item)

  def extractDataset(self, path):
    """Read one CSV file and merge its text into the current model.

    Text is merged into whatever ``self.words`` already holds, so several
    datasets accumulate; use ``processDataset`` for a clean rebuild. Rows
    that miss either the label or the text are skipped.

    Args:
      path: path of the CSV file (first column label, second column text).

    Raises:
      ValueError: if the file has fewer than two columns.
    """
    # dtype=str keeps numeric-looking labels/text as text so .lower() works.
    data = pd.read_csv(path, dtype=str)
    if data.shape[1] < 2:
      raise ValueError('Dataset must have at least two columns (label, text)')

    # Positional access through .iloc: plain row[0] on a string-indexed
    # Series relies on a deprecated positional fallback.
    pairs = data.iloc[:, :2].dropna()

    words = dict(self.words)
    for label, text in zip(pairs.iloc[:, 0], pairs.iloc[:, 1]):
      label = label.lower()
      text = text.lower()
      words[label] = words[label] + ' ' + text if label in words else text

    self.words = words
    # Labels are exactly the keys of `words`, so every later
    # self.words[label] lookup is guaranteed to succeed.
    self.labels = np.unique(np.array(list(words), dtype=str))

    self.getFrequentData()

  def getFrequentData(self):
    """Count word occurrences per label and store them in ``frequentWord``.

    Afterwards calls ``processWordAccumulative`` so that every label's table
    covers the whole vocabulary.
    """
    frequentData = {}

    for item in self.labels:
      counts = {}
      for word in self.words[item].split():
        counts[word] = counts.get(word, 0) + 1
      # Plain str key (not numpy.str_) keeps the table JSON-friendly.
      frequentData[str(item)] = counts

    self.frequentWord = frequentData
    self.processWordAccumulative()

  def processWordAccumulative(self):
    """Give every label a zero count for each word it never contained."""
    # A dict is used as an insertion-ordered set of the whole vocabulary.
    vocabulary = {}
    for label in self.labels:
      for word in self.words[label].split():
        vocabulary[word] = None

    for label in self.labels:
      counts = self.frequentWord[label]
      for word in vocabulary:
        counts.setdefault(word, 0)

  def saveModel(self, path, filename):
    """Write the model to ``./<path>/<filename>.json``.

    The JSON object has three keys: ``labels`` (index -> label),
    ``originalWord`` (label -> concatenated text) and ``frequent``
    (label -> {word: count}). The target directory is created when missing.

    Args:
      path: directory, relative to the current working directory.
      filename: file name without the ``.json`` extension.
    """
    import os  # local import: only os.path.exists is imported at file level

    data = {
      'labels': {index: str(label) for index, label in enumerate(np.asarray(self.labels).flatten())},
      'originalWord': self.words,
      'frequent': self.frequentWord
    }

    directory = './' + path
    os.makedirs(directory, exist_ok=True)
    with open(directory + '/' + filename + '.json', 'w') as file:
      json.dump(data, file)

    print('Model berhasil disimpan di ' + path + '/' + filename + '.json')

  def debug(self):
    """Print every attribute of the model, each preceded by a heading."""
    print('NGRAM')
    print(self.ngram)
    print('Dataset')
    print(self.dataset)
    print('Labels')
    print(self.labels)
    print('Words')
    print(self.words)
    print('Frequent word')
    print(self.frequentWord)

In [70]:
# Train on the sample dataset and persist the resulting model as JSON.
DATASET_PATH = './../dataset/sample.csv'
MODEL_DIR = 'models'
MODEL_NAME = 'datamodel'

nb = NaiveBayes()
nb.addDataset(DATASET_PATH)
nb.processDataset()
nb.saveModel(MODEL_DIR, MODEL_NAME)
# Uncomment to inspect the trained state: nb.debug()

Model berhasil disimpan di models/datamodel.json
