In [10]:
from typing import Union, List
import numpy as np
import matplotlib.pyplot as plt
import string
import random
import re
import requests
import os
import textwrap
import nltk
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup

## Building Trigram Model

output should be like:

{
    ("I", "eggs"): {
        "like": 0.67,
        "love": 0.21,
        ...
    }
}

In [2]:
# basic corpus to test
basic_corpus = """
    I like cats
    I like pizzas
    I want water
    He wants water
"""



In [3]:
!ls ../sentiment-analysis/electronics/

negative.review  positive.review  unlabeled.review


In [4]:
positive_reviews = BeautifulSoup(open("../sentiment-analysis/electronics/positive.review").read(), features="html5lib").findAll("review_text")

In [29]:
positive_reviews[0]

<review_text>
I purchased this unit due to frequent blackouts in my area and 2 power supplies going bad.  It will run my cable modem, router, PC, and LCD monitor for 5 minutes.  This is more than enough time to save work and shut down.   Equally important, I know that my electronics are receiving clean power.

I feel that this investment is minor compared to the loss of valuable data or the failure of equipment due to a power spike or an irregular power supply.

As always, Amazon had it to me in &lt;2 business days
</review_text>

## Language Model

In [6]:
# leveraging ord function to get integers from a character to use as index
ord("a"), ord("b"), ord("c")

(97, 98, 99)

markov matrix to store trigram probabilities

we initialize with ones to consider "add-one smoothing"

On the cipher exercise, we were using unigram and bigram calculations. For unigrams, it is easy: we just count words. For bigrams, we needed the unigram prob, so for $n-gram$ models we need all previous probabilities.

In general, a trigram and $n-gram$ model have the following probabilities:

$$p(w_t| w_{t-1}, w_{t-2}) = \frac{p(w_{t-2}\rightarrow w_{t-1} \rightarrow w_{t})}{p(w_{t-2}\rightarrow w_{t-1})} $$

$$p(w_t| w_{t-1}, w_{t-2}, ...,w_{t-N+1}) = \frac{p(w_{t-N+1}\rightarrow ... \rightarrow w_{t-1} \rightarrow w_{t})}{p(w_{t-N+1}\rightarrow ... \rightarrow w_{t-1} )} $$

Hence, the general formula is recursive.

In [5]:
# define which n-gram we want
ngram = 3 #trigram

M_2 = np.ones((26,26))

# initial state distribution (unigrams probabilities)
pi = np.zeros(26)

def update_trigrams(w1, w2, w3):
    pass

def update_bigrams(w1, w2):
    i = ord(w1) - 97
    j = ord(w2) - 97
    M[i,j] += 1
    
def update_unigrams(w):
    i = ord(w) - 97
    pi[i] += 1
    
# get log-probability of a word/token
def get_word_prob(word):
    
    probs = []
    # first word index
    i = ord(word[0]) - 97
    probs.append(np.log(pi[i]))
    
    # rest of sentence
    for w_previous, w in zip(word, word[1:]):
        i = ord(w_previous) - 97
        j = ord(w) - 97
        probs.append(np.log(M[i,j]))
        
    # find log-probability
    return sum(probs)

# get log-probability of a document, which is a sequence of words
def get_sequence_prob(doc:Union[str, List]):
    
    if type(doc) == str:
        doc = doc.split()
        
    prob = 0
    for word in doc:
        prob += get_word_prob(word)
        
    return prob

In [8]:
l = [1,2,4,5]
l[:-2]

[1, 2]

In [19]:
regex = re.compile('[^a-zA-Z]')

trigrams = {}
lemmatizer = WordNetLemmatizer()

with open("../sentiment-analysis/stopwords.txt") as f:
    stopwords = set(w.rstrip for w in f)

def my_tokenizer(sentence:str):
    s = sentence.lower()
    s = regex.sub(' ', s) 
    tokens = nltk.tokenize.word_tokenize(s)
    tokens = [t for t in tokens if len(t) > 2]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    tokens = [t for t in tokens if t not in stopwords]

    return tokens

for review in positive_reviews:
    tokens = my_tokenizer(review.text)
    if tokens:
        # update our language model 
        for i, token in enumerate(tokens[:-2]):
            # update first unigram probs
            k = (tokens[i], tokens[i+2])
            if k not in trigrams:
                trigrams[k] = []
            # update the middle words seen on the example
            trigrams[k].append(tokens[i+1])
            

trigrams_probabilities = {}
for k, words in trigrams.items():
    if len(set(words)) > 1:
        d = {}
        n = 0
        for word in words:
            if word not in d:
                d[word] = 0
            d[word] += 1
            n += 1
        for w, c in d.items():
            d[w] = float(c) / n
            
        trigrams_probabilities[k] = d
            
# normalize probabilities
# pi /= pi.sum()
# M /= M.sum(axis=1, keepdims=True)

In [21]:
trigrams_probabilities

{('purchased', 'unit'): {'this': 0.5, 'the': 0.5},
 ('modem', 'and'): {'router': 0.6666666666666666, 'phone': 0.3333333333333333},
 ('and', 'monitor'): {'lcd': 0.5, 'the': 0.5},
 ('for', 'this'): {'minute': 0.0625,
  'le': 0.0625,
  'hour': 0.0625,
  'feedback': 0.0625,
  'compatibility': 0.0625,
  'security': 0.0625,
  'sure': 0.125,
  'spec': 0.0625,
  'organization': 0.0625,
  'more': 0.0625,
  'the': 0.0625,
  'traveling': 0.0625,
  'use': 0.0625,
  'amazon': 0.0625,
  'vonage': 0.0625},
 ('this', 'than'): {'more': 0.6666666666666666, 'device': 0.3333333333333333},
 ('than', 'time'): {'enough': 0.3333333333333333,
  'four': 0.3333333333333333,
  'expected': 0.3333333333333333},
 ('enough', 'save'): {'time': 0.5, 'space': 0.5},
 ('time', 'work'): {'save': 0.5, 'for': 0.5},
 ('save', 'and'): {'work': 0.5, 'some': 0.5},
 ('and', 'down'): {'shut': 0.3333333333333333,
  'transient': 0.3333333333333333,
  'cool': 0.3333333333333333},
 ('important', 'that'): {'know': 0.5, 'remember': 0.5}

In [9]:
M[:2], pi[:1]

(array([[7.04046861e-05, 2.76127179e-02, 3.36111972e-02, 4.38621195e-02,
         4.22428117e-04, 8.73018108e-03, 2.05018446e-02, 1.08704835e-02,
         4.75090822e-02, 2.95699682e-04, 1.45456082e-02, 1.10633924e-01,
         2.53034442e-02, 2.09721479e-01, 2.11214058e-04, 2.34306795e-02,
         7.04046861e-05, 1.08395055e-01, 9.77076234e-02, 1.42217466e-01,
         7.92756766e-03, 2.33602749e-02, 1.04198935e-02, 4.50589991e-04,
         2.94854826e-02, 2.63313526e-03],
        [6.00292826e-02, 2.66089503e-02, 6.36577758e-05, 5.72919982e-04,
         2.54949392e-01, 6.36577758e-05, 1.27315552e-04, 4.45604431e-04,
         3.79400344e-02, 5.47456872e-03, 6.36577758e-05, 1.20567827e-01,
         8.91208861e-04, 1.27315552e-04, 1.52333057e-01, 6.36577758e-05,
         6.36577758e-05, 5.88197848e-02, 1.89063594e-02, 7.25698644e-03,
         1.67483608e-01, 8.27551085e-04, 1.27315552e-04, 6.36577758e-05,
         8.60653129e-02, 6.36577758e-05]]), array([0.10945403]))

In [85]:
# Random sample from probs
def random_sample(probs_dict, size):
    return np.random.choice(list(probs_dict.keys()), size, list(probs_dict.values()))[0]

## test spinner

In [105]:
def my_tokenizer(sentence:str):
#     s = sentence.lower()
#     s = regex.sub(' ', s) 
    tokens = nltk.tokenize.word_tokenize(sentence)
#     tokens = [t for t in tokens if len(t) > 2]
#     tokens = [lemmatizer.lemmatize(t) for t in tokens]
#     tokens = [t for t in tokens if t not in stopwords]

    return tokens


def test_spinner(reviews):
    review = np.random.choice(reviews)
    print("Original:\n ", review)
    tokens = my_tokenizer(review)
    for i, token in enumerate(tokens[:-2]):
        if random.random() < 0.2:
            k = (tokens[i], tokens[i+2])
            if k in trigrams_probabilities:
                w = random_sample(trigrams_probabilities[k], 1)
                print(f"we will substitute '{tokens[i+1]}' --> {w}!")
                tokens[i+1] = "***" + w + "***"
                

    print("spun: ")
    try: 
        print(" ".join(tokens))
    except:
        print(tokens)

In [110]:
t = "Works great, good quality from Belkin as usual. Hard to beat for the money"
rs = [r.text for r in positive_reviews]
test_spinner(rs)
# np.random.choice([r.text for r in positive_reviews])

Original:
  
I researched cases upon receipt of my 20 GB iPod and deemed this the best.  This case consists of a thin plastic sleeve inside a separate clear, hard shell.  The hard case protects the iPod from scratches and minor falls while still allowing easy access to the controls and inputs.  It also does not obscure the screen.  An enclosed plastic belt clip attaches to the back of the unit.  
This case is not compatible with a Bose Sounddock as the user must remove the iPod from the case.  Also, Contour Design only enclosed one of the thin plastic sleeves.  I spent $5 at their website for 5 more sleeves and gave some to friends with the same case.  
The price seems high for a couple pieces of plastic but a case is essential and this is the best one for the 20 GB iPod. 

we will substitute 'is' --> hit!
we will substitute 'enclosed' --> required!
we will substitute 'website' --> office!
we will substitute '5' --> about!
we will substitute 'gave' --> listened!
we will substitute 'sam

In [81]:
tokens

['the',
 'gps',
 'work',
 'both',
 'dell',
 'axim',
 'and',
 'compaq',
 'with',
 'compact',
 'flash',
 'adapter',
 'card',
 'have',
 'used',
 'with',
 'street',
 'and',
 'trip',
 'and',
 'arcpad']