### **load data pos & neg**

In [1]:
import os

In [2]:
path_pos = "./data/pos/"
path_neg = "./data/neg/"

In [3]:
list_pos = os.listdir(path_pos)
list_neg = os.listdir(path_neg)

In [4]:
word_pos=[]
word_neg=[]

In [5]:
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

### **extract pos & neg Stemmed words**

In [6]:
for p in list_pos:
    f = open(os.path.join(path_pos, p), 'r')
    lines = f.readlines()
    for l in lines:
        lw = wordpunct_tokenize(l)     
        for w in lw:
            word_pos.append(stemmer.stem(w))
    

In [7]:
len(word_pos)

832564

In [8]:
for n in list_neg:
    f = open(os.path.join(path_neg, n), 'r')
    lines = f.readlines()
    for l in lines:
        lw = wordpunct_tokenize(l)
        for w in lw:
            word_neg.append(stemmer.stem(w))

In [9]:
len(word_neg)

751256

### **remove stop words and punctuation**

In [10]:
from nltk.corpus import stopwords # Text data
import string

In [11]:
stop_words = stopwords.words("english")
stop_words.append(list(string.punctuation))
stop_words = (stop_words)
filtered_word_pos = [ word for word in word_pos if word not in stop_words and not w.isnumeric()]
filtered_word_neg = [ word for word in word_neg if word not in stop_words and not w.isnumeric()]

In [12]:
n_pos = len(filtered_word_pos)
n_neg = len(filtered_word_neg)
n_pos,n_neg

(523864, 472592)

### **extract word freqs in pos&neg**

In [13]:
from collections import Counter
import pandas as pd
import numpy as np

In [14]:
vocap1 = set(filtered_word_pos)
vocap2 = set(filtered_word_neg)
vocap = vocap1
vocap = vocap.union(vocap2)
len(vocap),len(vocap1),len(vocap2)

(26109, 20148, 19000)

In [15]:
x_pos=Counter(filtered_word_pos)
np.log(x_pos["fun"])-np.log(n_pos)

-7.357846394654895

In [16]:
x_neg=Counter(filtered_word_neg)
np.log(x_neg["fun"]/n_neg)

-7.5445267980950605

### **get probability of words**

In [17]:
logprior = np.log(n_pos/n_neg)

In [18]:
vocap_prob = {}
for v in vocap: 
  vocap_prob[v] = np.log((x_pos.get(v,0)+1)/(n_pos+len(vocap1))) - np.log((x_neg.get(v,0)+1)/(n_neg+len(vocap2))) 

In [19]:
vocap_prob,logprior

({'tolkien': 1.2849721598501755,
  'propaganda': 0.09934849419243719,
  'doreen': -0.7944693818296606,
  'suck': -0.9342313242048199,
  '1994': -0.01431082428008601,
  'gingerbread': 2.463627156191821,
  'pixi': -0.506787309377879,
  'lazard': -0.9122524174860427,
  'howlingli': -0.7944693818296606,
  'forti': -0.0500289068821651,
  'awkward': -0.021279493596178867,
  'frail': 0.5918249792902301,
  'pratt': 0.30414290683844847,
  'pen': -0.375759046971476,
  'lambskin': -0.7944693818296606,
  'intak': -0.7944693818296606,
  'chic': -0.6121478250357058,
  'ebb': 0.5918249792902301,
  'maximum': 0.3911542838280777,
  'gander': -0.7944693818296606,
  'trevil': -0.7944693818296606,
  'hotcak': -0.10132220126971525,
  'hutt': 2.2012628917243298,
  'unhappi': 0.7588790639533958,
  'barri': -0.0731513243030193,
  'walki': 0.5918249792902301,
  'preming': 0.9972900873983939,
  'humil': 0.5918249792902301,
  '_pollock_': 1.9781193404101192,
  'landmark': 0.2351500353514986,
  'drabbi': -1.19993

### **Test**

In [27]:
acc_NB_pos = 0
for p in list_pos:
    f = open(path_pos+p,'r')
    doc_pos = f.readlines()
    for l in doc_pos:
        lw = wordpunct_tokenize(l)
        words = []
        for w in lw:
            words.append(stemmer.stem(w))
        filtered_words = [ word for word in words if word not in stop_words and not w.isnumeric()]
        sum = logprior
        for w in filtered_words:
            sum += vocap_prob.get(w, 0)

    acc_NB_pos += 1 if sum > 0 else 0
    
acc_NB_pos/10

71.5

In [28]:
acc_NB_neg = 0
for n in list_neg:
    f = open(path_neg+n,'r')
    doc_pos = f.readlines()
    for l in doc_pos:
        lw = wordpunct_tokenize(l)
        words = []
        for w in lw:
            words.append(stemmer.stem(w))
        filtered_words = [ word for word in words if word not in stop_words and not w.isnumeric()]
        sum = logprior
        for w in filtered_words:
            sum += vocap_prob.get(w, 0)

    acc_NB_neg += 1 if sum < 0 else 0
    
acc_NB_neg/10

79.0

In [30]:
acc_NB_pos/1000,acc_NB_neg/1000

(0.715, 0.79)