In [1]:
import numpy as np
import pandas as pd
import json
from os.path import exists
import helpers.stringHelper as shp
import re
from progress.bar import Bar


In [8]:
class NaiveBayes:
  def __init__(self, ngram = 1):
    self.ngram = ngram
    self.dataset = np.empty(0)
    self.words = np.empty(0)
    self.labels = np.empty(0)
    self.totalData = np.empty(0)
    self.frequentWord = np.empty(0)
    self.totalWord = np.empty(0)
    self.probabilityWord = np.empty(0)
    self.probabilityLabel = np.empty(0)
    self.model = np.empty(0)
  
  def addDataset(self, path):
    if exists(path):
      self.dataset = np.append(self.dataset, path)
    else:
      raise Exception('Dataset not found')
  
  def processDataset(self):
    for item in self.dataset:
      self.extractDataset(item)
  
  def extractDataset(self, path):
    data = pd.read_csv(path)
    data = pd.DataFrame(data)
    words = {}
    totalData = {}
    bar = Bar('Memproses data', max=479)

    for index, row in data.iterrows():
      self.labels = np.append(self.labels, row[0])
      text = re.sub('[\W_]|(URL)|(USERNAME)', ' ', row[1])
      text = shp.stemString(text)
      words[row[0].lower()] = words[row[0].lower()] + ' ' + text.lower() if row[0].lower() in words else text.lower()
      bar.next()
    bar.finish()
    
    for label in self.labels:
      totalData[label] = int(data.groupby(['label']).size()[label])

    totalData['__total__'] = int(data.shape[0])

    self.labels = np.unique(self.labels)
    self.words = words
    self.totalData = totalData

    self.getFrequentData()
  
  def getFrequentData(self):
    frequentData = {}
    totalWord = {}
    
    for item in self.labels:
      frequentData[item] = {}
      for word in self.words[item].split():
        frequentData[item][word] = frequentData[item][word] + 1 if word in frequentData[item] else 1
      totalWord[item] = len(frequentData[item])
    
    self.frequentWord = frequentData
    self.totalWord = totalWord
    self.processWordAccumulative()

  def processWordAccumulative(self):
    for label in self.labels:
      for word in self.words[label].split():
        for checkLabel in self.labels:
          if word not in self.frequentWord[checkLabel]:
            self.frequentWord[checkLabel][word] = 0
    
    for label in self.labels:
      for word in self.frequentWord[label]:
        self.frequentWord[label][word] = self.frequentWord[label][word] + 1
    
    self.defineProbabilities()
  
  def defineProbabilities(self):
    probability = {}
    labelProbability = {}
    for label in self.labels:
      probability[label] = {}
      labelProbability[label] = self.countProbability(self.totalData[label], self.totalData['__total__'])
      for key, value in self.frequentWord[label].items():
        probability[label][key] = self.countProbability(value, self.totalWord[label])
    
    self.probabilityWord = probability
    self.probabilityLabel = labelProbability
  
  def saveModel(self, path, filename):
    data = {
      'totalData': self.totalData,
      'labels': dict(enumerate(self.labels.flatten(), 0)),
      'originalWord': self.words,
      'frequent': self.frequentWord,
      'probability': self.probabilityWord,
      'totalProbability': self.probabilityLabel,
      'totalWord': self.totalWord,
    }

    with open('./' + path + '/' + filename + '.json', 'w') as file:
      json.dump(data, file)

    print('Model berhasil disimpan di ' + path + '/' + filename + '.json')
  
  def loadModel(self, path):
    if not exists(path):
      raise Exception('Model not found')

    with open(path, 'r') as file:
      data = json.load(file)
      self.model = data
    
  def countProbability(self, frequent, total):
    return frequent / total
  
  def classify(self, text):
    detail = {}
    total = {}
    text = re.sub('[\W_]|(URL)|(USERNAME)', ' ', text)
    text = shp.stemString(text)
    text = text.split()

    for index, label in self.model['labels'].items():
      detail[label] = {}
      total[label] = self.model['totalProbability'][label]
      for word in text:
        if word in self.model['probability'][label]:
          detail[label][word] = self.model['probability'][label][word]
          total[label] = total[label] * detail[label][word]
    
    ordering = ((value, key) for (key, value) in total.items())
    sortedOrdering = sorted(ordering, reverse=True)
    percent = {k: v for v, k in sortedOrdering}
    result = list(percent.keys())[0]

    return result, percent, detail
  
  def testing(self, path):
    if not exists(path):
      raise Exception('Path tidak ditemukan')
    
    data = pd.read_csv(path)
    data = pd.DataFrame(data)
    correct = 0
    miss = 0
    bar = Bar('Testing data', max=479)

    for index, row in data.iterrows():
      text = re.sub('[\W_]|(URL)|(USERNAME)', ' ', row[1])
      text = shp.stemString(text)
      result, percent, detail = self.classify(text)
      if result == row[0]:
        correct += 1
      else:
        miss += 1
      bar.next()
    bar.finish()

    print('Correct ' + str(correct))
    print('Miss ' + str(miss))
    print('Correct Percentage ' + str(correct / len(data.index) * 100))
    print('Miss Percentage ' + str(miss / len(data.index) * 100))
    
  def debug(self):
    print('NGRAM')
    print(self.ngram)
    print('Dataset')
    print(self.dataset)
    print('Labels')
    print(self.labels)
    print('Words')
    print(self.words)
    print('Frequent word')
    print(self.frequentWord)

In [3]:
nb = NaiveBayes()
nb.addDataset('./../dataset/emot.csv')
nb.processDataset()
nb.saveModel('models', 'datamodelemot')
# nb.debug()

Model berhasil disimpan di models/datamodelemot.json


In [149]:
ts = NaiveBayes()
ts.loadModel('./models/datamodelemot.json')
text = 'Rabu, 06-06-2018, anakbujangku selesai menempuh pendidikan di TK Bhakti Ibu, Bakauheni. Eit, ternyata ia dapat piala jugak'
result, percent, detail = ts.classify(text)
print(result)
print(percent)


love
{'love': 4.1095803055357712e-50, 'happy': 1.6976075842492336e-54, 'fear': 3.1185305690421436e-55, 'sadness': 4.486274788145623e-57, 'anger': 6.85941977344497e-61}


In [9]:
ts = NaiveBayes()
ts.loadModel('./models/datamodelemot.json')
path = './../dataset/emot.csv'
ts.testing(path)

Correct 325
Miss 424
Correct Percentage 43.39118825100133
Miss Percentage 56.60881174899867
