In [385]:
import numpy as np
import pandas as pd
import json
from os.path import exists
import helpers.stringHelper as shp
import re

In [386]:
class NaiveBayes:
  def __init__(self, ngram = 1):
    self.ngram = ngram
    self.dataset = np.empty(0)
    self.words = np.empty(0)
    self.labels = np.empty(0)
    self.totalData = np.empty(0)
    self.frequentWord = np.empty(0)
    self.totalWord = np.empty(0)
    self.probabilityWord = np.empty(0)
    self.probabilityLabel = np.empty(0)
    self.model = np.empty(0)
  
  def addDataset(self, path):
    if exists(path):
      self.dataset = np.append(self.dataset, path)
    else:
      raise Exception('Dataset not found')
  
  def processDataset(self):
    for item in self.dataset:
      self.extractDataset(item)
  
  def extractDataset(self, path):
    data = pd.read_csv(path)
    data = pd.DataFrame(data)
    words = {}
    totalData = {}

    for index, row in data.iterrows():
      self.labels = np.append(self.labels, row[0])
      text = re.sub('[\W_]+', ' ', row[1])
      text = shp.stemString(text)
      words[row[0].lower()] = words[row[0].lower()] + ' ' + text.lower() if row[0].lower() in words else text.lower()
    
    for label in self.labels:
      totalData[label] = int(data.groupby(['label']).size()[label])

    totalData['__total__'] = int(data.shape[0])

    self.labels = np.unique(self.labels)
    self.words = words
    self.totalData = totalData

    self.getFrequentData()
  
  def getFrequentData(self):
    frequentData = {}
    totalWord = {}
    
    for item in self.labels:
      frequentData[item] = {}
      for word in self.words[item].split():
        frequentData[item][word] = frequentData[item][word] + 1 if word in frequentData[item] else 1
      totalWord[item] = len(frequentData[item])
    
    self.frequentWord = frequentData
    self.totalWord = totalWord
    self.processWordAccumulative()

  def processWordAccumulative(self):
    for label in self.labels:
      for word in self.words[label].split():
        for checkLabel in self.labels:
          if word not in self.frequentWord[checkLabel]:
            self.frequentWord[checkLabel][word] = 0
    
    for label in self.labels:
      for word in self.frequentWord[label]:
        self.frequentWord[label][word] = self.frequentWord[label][word] + 1
    
    self.defineProbabilities()
  
  def defineProbabilities(self):
    probability = {}
    labelProbability = {}
    for label in self.labels:
      probability[label] = {}
      labelProbability[label] = self.countProbability(self.totalData[label], self.totalData['__total__'])
      for key, value in self.frequentWord[label].items():
        probability[label][key] = self.countProbability(value, self.totalWord[label])
    
    self.probabilityWord = probability
    self.probabilityLabel = labelProbability
  
  def saveModel(self, path, filename):
    data = {
      'totalData': self.totalData,
      'labels': dict(enumerate(self.labels.flatten(), 0)),
      'originalWord': self.words,
      'frequent': self.frequentWord,
      'probability': self.probabilityWord,
      'totalProbability': self.probabilityLabel,
      'totalWord': self.totalWord,
    }

    with open('./' + path + '/' + filename + '.json', 'w') as file:
      json.dump(data, file)

    print('Model berhasil disimpan di ' + path + '/' + filename + '.json')
  
  def loadModel(self, path):
    if not exists(path):
      raise Exception('Model not found')

    with open(path, 'r') as file:
      data = json.load(file)
      self.model = data
    
  def countProbability(self, frequent, total):
    return frequent / total
  
  def classify(self, text):
    detail = {}
    total = {}
    text = re.sub('[\W_]+', ' ', text)
    text = shp.stemString(text)
    text = text.split()

    for index, label in self.model['labels'].items():
      detail[label] = {}
      total[label] = 1
      for word in text:
        detail[label][word] = self.model['probability'][label][word] if word in self.model['probability'][label] else 0
        total[label] = total[label] * detail[label][word]
      total[label] = total[label] * self.model['totalProbability'][label]
    
    ordering = ((value, key) for (key, value) in total.items())
    sortedOrdering = sorted(ordering, reverse=True)
    percent = {k: v for v, k in sortedOrdering}
    result = list(percent.keys())[0]

    return result, percent, detail
    
  def debug(self):
    print('NGRAM')
    print(self.ngram)
    print('Dataset')
    print(self.dataset)
    print('Labels')
    print(self.labels)
    print('Words')
    print(self.words)
    print('Frequent word')
    print(self.frequentWord)

In [387]:
nb = NaiveBayes()
nb.addDataset('./../dataset/emot.csv')
nb.processDataset()
nb.saveModel('models', 'datamodelemot')
# nb.debug()

soal jln jatibaru polisi tdk bs gertak gubernur emangny polisi tdk ikut pmbhasan jgn politik atur wilayah hak gubernur soal tn abang soal turun turun pelik perlu sabar username username url
sama cewe lho kayak harus bisa lebih rasain lah yang harus sibuk jaga diri rasain sakit haid dan panik pulang malem sendiri gimana orang asing wajar banyak korban yang takut curhat bukan bela malah hujat
kepingin gudeg mbarek bu hj amad foto dari google sengaja biar teman teman jg bayang bagi itu indah
jln jatibaru bagi dari wilayah tn abang atur wilayah tgg jwb dan wwnang gub tng abng soal rumit sejak gub2 trdahulu skrg sedng benah agr bermnfaat semua pihak mohon yg punya otak pikir dgn wajar kecuali otak butek ya kamu url
sharing alam aja kemarin jam 18 00 batalin tiket di stasiun pasar senen lancar antri tidak terlalu rame 15 menit dan beress semua mungkin bisa coba twips di jam jam segitu cc username
dari sekian banyak thread yang aku baca thread ini paling aneh sih dalam tulis sumpah aneh bgt m

In [394]:
ts = NaiveBayes()
ts.loadModel('./models/datamodelemot.json')
result, percent, detail = ts.classify('hari ini senang')
print(result)
print(percent)
print(detail)


hari ini senang
happy
{'happy': 1.6907036477546495e-07, 'sadness': 7.862507285711095e-08, 'anger': 3.1443776469771614e-08, 'fear': 8.084519123898761e-09, 'love': 5.464811755383808e-09}
{'anger': {'hari': 0.00472212132219397, 'ini': 0.022884126407555393, 'senang': 0.000726480203414457}, 'fear': {'hari': 0.004608294930875576, 'ini': 0.020737327188940093, 'senang': 0.001152073732718894}, 'happy': {'hari': 0.010344827586206896, 'ini': 0.02807881773399015, 'senang': 0.0024630541871921183}, 'love': {'hari': 0.007075471698113208, 'ini': 0.009433962264150943, 'senang': 0.0023584905660377358}, 'sadness': {'hari': 0.006809848088004191, 'ini': 0.028810895756940808, 'senang': 0.001571503404924044}}
