In [60]:
import pickle
import random
from typing import List

from gensim.corpora import Dictionary
import numpy as np

In [61]:
with open(r"data/interim/positive_words.pkl", "rb") as input_file:
    positive_docs = pickle.load(input_file)

In [62]:
with open(r"data/interim/negative_words.pkl", "rb") as input_file:
    negative_docs = pickle.load(input_file)

In [63]:
negative_docs

[['conveniently',
  'locate',
  'bed',
  'breakfast',
  'place',
  'minute',
  'walking',
  'distance',
  'fact',
  'give',
  'central',
  'location',
  'attraction',
  'town',
  'accessible',
  'walk',
  'place',
  'cosy',
  'nook',
  'room',
  'say',
  'almost',
  'little',
  'lounge',
  'room',
  'tastefully',
  'do',
  'textile',
  'clean',
  'bed',
  'comfortable',
  'issue',
  'lack',
  'ventilation',
  'room',
  'window',
  'open',
  'step',
  'room',
  'compromise',
  'privacy',
  'mention',
  'room',
  'less',
  'people',
  'property',
  'fine',
  'highlight',
  'place',
  'breakfast',
  'cook'],
 ['good',
  'location',
  'town',
  'especially',
  'go',
  'spanish',
  'school',
  'good',
  'sized',
  'private',
  'twin',
  'room',
  'shared',
  'bathroom',
  'great',
  'price',
  'nice',
  'ish',
  'garden',
  'swing',
  'hamock',
  'basic',
  'bathroom',
  'tatty',
  'hot',
  'shower',
  'always',
  'keep',
  'water',
  'flow',
  'slow',
  'super',
  'basic',
  'pretty',
  'd

In [64]:
negative_words = [item for sublist in negative_docs for item in sublist]
positive_words = [item for sublist in positive_docs for item in sublist]

In [65]:
def create_dictionary(documents: List[List[str]]):
    return Dictionary(documents)

In [66]:
dictionary = create_dictionary([negative_words, positive_words])

In [67]:
len(dictionary)

13120

In [68]:
for entry in dictionary.items():
    print(entry)

(0, 'aa')
(1, 'aanduiding')
(2, 'aback')
(3, 'abandon')
(4, 'abandoned')
(5, 'abhor')
(6, 'abhorrent')
(7, 'ability')
(8, 'able')
(9, 'ably')
(10, 'abound')
(11, 'abrieron')
(12, 'abroad')
(13, 'abrupt')
(14, 'absence')
(15, 'absent')
(16, 'absolute')
(17, 'absolutely')
(18, 'absorbe')
(19, 'absoultely')
(20, 'abstract')
(21, 'absurd')
(22, 'absurdly')
(23, 'abundance')
(24, 'abundant')
(25, 'abuntante')
(26, 'abuse')
(27, 'abut')
(28, 'abysmal')
(29, 'ac')
(30, 'acatenango')
(31, 'acc')
(32, 'accent')
(33, 'accentuate')
(34, 'accept')
(35, 'acceptable')
(36, 'acceptance')
(37, 'accepted')
(38, 'acception')
(39, 'accesible')
(40, 'acceso')
(41, 'access')
(42, 'accessibility')
(43, 'accessible')
(44, 'accessory')
(45, 'accident')
(46, 'accidentally')
(47, 'acclaim')
(48, 'accoglierci')
(49, 'accomdate')
(50, 'accomidation')
(51, 'accommodate')
(52, 'accommodating')
(53, 'accommodation')
(54, 'accommodative')
(55, 'accomodate')
(56, 'accomodated')
(57, 'accomodating')
(58, 'accomodation'

In [69]:
def split_data(data: List, weights: List = (0.7, 0.15, 0.15)):
    split = {
        'train': [],
        'test': [],
        'validation': [],
    }
    for word in data:
        subset = random.choices(['train', 'test', 'validation'], weights=weights)[0]
        split[subset].append(word)

    return split

In [70]:
negative_docs_split = split_data(negative_docs)
positive_docs_split = split_data(positive_docs)

In [71]:
negative_words_split = {key: [item for sublist in value for item in sublist] for key, value in negative_docs_split.items()}
positive_words_split = {key: [item for sublist in value for item in sublist] for key, value in positive_docs_split.items()}

In [72]:
negative_words_split

{'train': ['conveniently',
  'locate',
  'bed',
  'breakfast',
  'place',
  'minute',
  'walking',
  'distance',
  'fact',
  'give',
  'central',
  'location',
  'attraction',
  'town',
  'accessible',
  'walk',
  'place',
  'cosy',
  'nook',
  'room',
  'say',
  'almost',
  'little',
  'lounge',
  'room',
  'tastefully',
  'do',
  'textile',
  'clean',
  'bed',
  'comfortable',
  'issue',
  'lack',
  'ventilation',
  'room',
  'window',
  'open',
  'step',
  'room',
  'compromise',
  'privacy',
  'mention',
  'room',
  'less',
  'people',
  'property',
  'fine',
  'highlight',
  'place',
  'breakfast',
  'cook',
  'good',
  'location',
  'town',
  'especially',
  'go',
  'spanish',
  'school',
  'good',
  'sized',
  'private',
  'twin',
  'room',
  'shared',
  'bathroom',
  'great',
  'price',
  'nice',
  'ish',
  'garden',
  'swing',
  'hamock',
  'basic',
  'bathroom',
  'tatty',
  'hot',
  'shower',
  'always',
  'keep',
  'water',
  'flow',
  'slow',
  'super',
  'basic',
  'prett

In [73]:
negative_bow = dictionary.doc2bow(negative_words_split['train'])
positive_bow = dictionary.doc2bow(positive_words_split['train'])

In [74]:
positive_bow

[(3, 1),
 (7, 10),
 (8, 226),
 (10, 4),
 (12, 6),
 (15, 1),
 (16, 51),
 (17, 374),
 (19, 1),
 (23, 7),
 (24, 12),
 (29, 16),
 (32, 8),
 (34, 12),
 (35, 9),
 (39, 4),
 (41, 130),
 (42, 2),
 (43, 24),
 (44, 4),
 (45, 7),
 (46, 2),
 (51, 232),
 (52, 119),
 (53, 167),
 (54, 2),
 (55, 36),
 (57, 8),
 (58, 29),
 (60, 10),
 (62, 6),
 (64, 1),
 (65, 1),
 (66, 9),
 (70, 11),
 (71, 1),
 (73, 1),
 (74, 1),
 (78, 1),
 (82, 2),
 (83, 2),
 (86, 2),
 (89, 2),
 (92, 10),
 (93, 3),
 (95, 34),
 (97, 40),
 (98, 1),
 (101, 181),
 (102, 13),
 (105, 128),
 (106, 3),
 (107, 2),
 (109, 144),
 (112, 60),
 (113, 45),
 (114, 14),
 (115, 9),
 (118, 41),
 (119, 5),
 (121, 1),
 (123, 16),
 (124, 6),
 (125, 8),
 (126, 3),
 (127, 2),
 (129, 4),
 (131, 2),
 (133, 7),
 (135, 9),
 (136, 3),
 (137, 1),
 (138, 4),
 (139, 21),
 (140, 3),
 (141, 7),
 (143, 33),
 (144, 30),
 (145, 1),
 (146, 33),
 (147, 46),
 (148, 3),
 (149, 2),
 (150, 9),
 (152, 3),
 (153, 1),
 (154, 88),
 (155, 1),
 (156, 35),
 (157, 27),
 (161, 8),
 (162

In [75]:
total_negative_words = len(negative_words_split['train']) + len(dictionary)
total_positive_words = len(positive_words_split['train']) + len(dictionary)

In [76]:
total_positive_words

283471

In [77]:
total_negative_words

173707

In [78]:
negative_word_probs = {}
for id, count in negative_bow:
    negative_word_probs[dictionary[id]] = {
        'id': id,
        'logprob': np.log((count + 1)/total_negative_words),
    }

negative_word_probs[-1] = {
    'id': -1,
    'logprob': np.log(1/total_negative_words)
}

In [79]:
positive_word_probs = {}
for id, count in positive_bow:
    positive_word_probs[dictionary[id]] = {
        'id': id,
        'logprob': np.log((count + 1)/total_positive_words),
    }
positive_word_probs[-1] = {
    'id': -1,
    'logprob': np.log(1/total_positive_words)
}

In [80]:
negative_prob = len(negative_docs_split["train"]) / (len(negative_docs_split["train"]) + len(positive_docs_split["train"]))
positive_prob = len(positive_docs_split["train"]) / (len(negative_docs_split["train"]) + len(positive_docs_split["train"]))

In [81]:
negative_prob

0.34760476320334516

In [82]:
positive_prob

0.6523952367966549

In [83]:
negative_word_probs

{'aa': {'id': 0, 'logprob': -11.371978070223978},
 'aback': {'id': 2, 'logprob': -11.371978070223978},
 'abandon': {'id': 3, 'logprob': -10.966512962115813},
 'abandoned': {'id': 4, 'logprob': -11.371978070223978},
 'abhorrent': {'id': 6, 'logprob': -11.371978070223978},
 'ability': {'id': 7, 'logprob': -10.273365781555867},
 'able': {'id': 8, 'logprob': -7.081518629075586},
 'ably': {'id': 9, 'logprob': -10.966512962115813},
 'abound': {'id': 10, 'logprob': -11.371978070223978},
 'abrieron': {'id': 11, 'logprob': -11.371978070223978},
 'abroad': {'id': 12, 'logprob': -10.455687338349822},
 'abrupt': {'id': 13, 'logprob': -11.371978070223978},
 'absence': {'id': 14, 'logprob': -10.678830889664033},
 'absent': {'id': 15, 'logprob': -11.371978070223978},
 'absolute': {'id': 16, 'logprob': -10.455687338349822},
 'absolutely': {'id': 17, 'logprob': -7.734391910497592},
 'abstract': {'id': 20, 'logprob': -11.371978070223978},
 'absurd': {'id': 21, 'logprob': -11.371978070223978},
 'abundanc

In [84]:
positive_word_probs

{'abandon': {'id': 3, 'logprob': -11.861717923588778},
 'ability': {'id': 7, 'logprob': -10.156969831350352},
 'able': {'id': 8, 'logprob': -7.12991508666732},
 'abound': {'id': 10, 'logprob': -10.945427191714622},
 'abroad': {'id': 12, 'logprob': -10.60895495509341},
 'absent': {'id': 15, 'logprob': -11.861717923588778},
 'absolute': {'id': 16, 'logprob': -8.603621385567294},
 'absolutely': {'id': 17, 'logprob': -6.627939078178311},
 'absoultely': {'id': 19, 'logprob': -11.861717923588778},
 'abundance': {'id': 23, 'logprob': -10.475423562468887},
 'abundant': {'id': 24, 'logprob': -9.989915746687185},
 'ac': {'id': 29, 'logprob': -9.721651760092506},
 'accent': {'id': 32, 'logprob': -10.357640526812503},
 'accept': {'id': 34, 'logprob': -9.989915746687185},
 'acceptable': {'id': 35, 'logprob': -10.252280011154676},
 'accesible': {'id': 39, 'logprob': -10.945427191714622},
 'access': {'id': 41, 'logprob': -7.679667780947571},
 'accessibility': {'id': 42, 'logprob': -11.456252815480612

In [85]:
positive_word_probs["abandon"]

{'id': 3, 'logprob': -11.861717923588778}

In [86]:
model = {
    'POS_PROB': np.log(positive_prob),
    'NEG_PROB': np.log(negative_prob),
    'COND_POS_PROBS': positive_word_probs,
    'COND_NEG_PROBS': negative_word_probs
}

In [87]:
model

{'POS_PROB': -0.42710470930936373,
 'NEG_PROB': -1.0566891825681437,
 'COND_POS_PROBS': {'abandon': {'id': 3, 'logprob': -11.861717923588778},
  'ability': {'id': 7, 'logprob': -10.156969831350352},
  'able': {'id': 8, 'logprob': -7.12991508666732},
  'abound': {'id': 10, 'logprob': -10.945427191714622},
  'abroad': {'id': 12, 'logprob': -10.60895495509341},
  'absent': {'id': 15, 'logprob': -11.861717923588778},
  'absolute': {'id': 16, 'logprob': -8.603621385567294},
  'absolutely': {'id': 17, 'logprob': -6.627939078178311},
  'absoultely': {'id': 19, 'logprob': -11.861717923588778},
  'abundance': {'id': 23, 'logprob': -10.475423562468887},
  'abundant': {'id': 24, 'logprob': -9.989915746687185},
  'ac': {'id': 29, 'logprob': -9.721651760092506},
  'accent': {'id': 32, 'logprob': -10.357640526812503},
  'accept': {'id': 34, 'logprob': -9.989915746687185},
  'acceptable': {'id': 35, 'logprob': -10.252280011154676},
  'accesible': {'id': 39, 'logprob': -10.945427191714622},
  'access'

In [88]:
with open(r"models/model.pkl", "wb") as output_file:
    pickle.dump(model, output_file)

"The staff was super friendly, the food was superb ..."

review -----> tokenize

`["staff", "be", "friendly", "food" "be", "superb"]`

$\log P(\text{staff}|+) + \log P(\text{be}|+) + \log P(\text{friendly}|+)+ \log P(\text{food}|+) + \log P(\text{food}|+) + \log P(\text{superb}|+)+ \log P(+) = -203.42$
$\log P(\text{staff}|-) + \log P(\text{be}|-) + \log P(\text{friendly}|-)+ \log P(\text{food}|-) + \log P(\text{food}|-) + \log P(\text{superb}|-)+ \log P(-) = -400.47$