In [None]:
import numpy as np
import pandas as pd
from os.path import exists

In [70]:
class NaiveBayes:
  """Word-frequency store for a Naive Bayes text classifier.

  Each dataset is a CSV file (first row is the header) whose first column
  holds the class label and whose second column holds the text. Labels and
  text are lower-cased before they are stored.

  Attributes (all NumPy arrays):
    ngram: n-gram size passed to the constructor. It is stored but not used
      by any method of this class.
    dataset: paths of the CSV files that were added.
    words: one dict per extracted dataset, mapping label -> all text of that
      label joined by single spaces.
    labels: sorted, unique, lower-cased labels seen so far.
    frequentWord: single-element array holding {label: {word: count}}
      aggregated over every entry of ``words``.
  """

  def __init__(self, ngram = 1):
    self.ngram = ngram
    self.dataset = np.empty(0)
    self.words = np.empty(0)
    self.labels = np.empty(0)
    self.frequentWord = np.empty(0)

  def addDataset(self, path):
    """Register the CSV file at ``path`` and fold its rows into the state.

    Raises:
      Exception: if ``path`` does not exist.
    """
    if not exists(path):
      raise Exception('Dataset not found')

    self.dataset = np.array(list(self.dataset) + [path])
    # Read only the new file: re-processing every registered dataset here
    # would duplicate the earlier files in self.words and inflate the counts.
    self.extractDataset(path)

  def processDataset(self):
    """Rebuild labels, words and frequencies from every registered dataset.

    The derived state is reset first so that calling this repeatedly does not
    accumulate duplicates.
    """
    self.words = np.empty(0)
    self.labels = np.empty(0)
    self.frequentWord = np.empty(0)

    for path in self.dataset:
      self.extractDataset(path)

  def extractDataset(self, path):
    """Read one CSV file and append its per-label text to ``self.words``.

    Rows with a missing label or text are skipped. ``self.labels`` and
    ``self.frequentWord`` are refreshed afterwards.
    """
    data = pd.read_csv(path)

    # Select the two columns by position with .iloc (header names are not
    # assumed) and drop incomplete rows, since NaN has no .lower().
    rows = data.iloc[:, :2].dropna()
    rowLabels = rows.iloc[:, 0].astype(str).str.lower()
    rowTexts = rows.iloc[:, 1].astype(str).str.lower()

    grouped = {}
    for label, text in zip(rowLabels, rowTexts):
      grouped.setdefault(label, []).append(text)
    words = {label: ' '.join(parts) for label, parts in grouped.items()}

    # Labels are stored lower-cased so they match the keys used in `words`.
    self.labels = np.unique(np.array(list(self.labels) + list(words)))
    self.words = np.append(self.words, words)

    self.getFrequentData()

  def getFrequentData(self):
    """Recompute per-label word counts over every dataset in ``self.words``.

    The result replaces ``self.frequentWord`` (instead of being appended), so
    the attribute always holds exactly one up-to-date aggregate dict.
    """
    frequentData = {}

    for item in self.labels:
      counts = {}
      for data in self.words:
        # A dataset may lack a label that another dataset introduced.
        for word in data.get(item, '').split():
          counts[word] = counts.get(word, 0) + 1
      # Plain str keys keep the printed dict independent of the NumPy version.
      frequentData[str(item)] = counts

    holder = np.empty(1, dtype=object)
    holder[0] = frequentData
    self.frequentWord = holder

  def debug(self):
    """Print every attribute of the instance, each under a heading."""
    print('NGRAM')
    print(self.ngram)
    print('Dataset')
    print(self.dataset)
    print('Labels')
    print(self.labels)
    print('Words')
    print(self.words)
    print('Frequent word')
    print(self.frequentWord)

In [71]:
# Build a model from the sample CSV and print its internal state for inspection.
nb = NaiveBayes()
nb.addDataset('./dataset/sample.csv')  # raises Exception if the file does not exist
nb.debug()


NGRAM
1
Dataset
['./dataset/sample.csv']
Labels
['negatif' 'positif']
Words
[{'positif': 'kamu memang yang terbaik buat aku memang kamu itu hebat ya luar biasa kamu ini', 'negatif': 'kamu sangat tidak baik dasar bisanya hanya merepotkan bisa tidak sih jangan dekat-dekat kamu ini tidak bisa apa-apa'}]
Frequent word
[{'negatif': {'kamu': 2, 'sangat': 1, 'tidak': 3, 'baik': 1, 'dasar': 1, 'bisanya': 1, 'hanya': 1, 'merepotkan': 1, 'bisa': 2, 'sih': 1, 'jangan': 1, 'dekat-dekat': 1, 'ini': 1, 'apa-apa': 1}, 'positif': {'kamu': 3, 'memang': 2, 'yang': 1, 'terbaik': 1, 'buat': 1, 'aku': 1, 'itu': 1, 'hebat': 1, 'ya': 1, 'luar': 1, 'biasa': 1, 'ini': 1}}]
