In [22]:
# Practice exercise: a simple text generator (article spinner) built on n-gram context windows
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range
import nltk
import random
import numpy as np
nltk.download('punkt')

from bs4 import BeautifulSoup

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [23]:
# Load the positive reviews: parse the review file with the lxml parser and
# keep every <review_text> element.
# The file is opened in a `with` block so the handle is closed as soon as its
# contents have been read (the bare open(...).read() left it open).
with open('./positive.review', encoding='ISO-8859-1') as review_file:
    positive_soup = BeautifulSoup(review_file.read(), "lxml")
# find_all is the current Beautiful Soup name for the legacy findAll alias.
positive_reviews = positive_soup.find_all('review_text')

In [29]:
# Slide a five-token window over every lower-cased, tokenized review.
# Key:   the four outer tokens (w1, w2, w4, w5) of the window.
# Value: list of every middle token (w3) observed between them; duplicates
#        are kept so that frequencies can be derived from the list later.
pentagrams = {}
for review in positive_reviews:
    tokens = nltk.tokenize.word_tokenize(review.text.lower())
    for start in range(len(tokens) - 4):
        left1, left2, middle, right1, right2 = tokens[start:start + 5]
        context = (left1, left2, right1, right2)
        pentagrams.setdefault(context, []).append(middle)

In [30]:
pentagrams

{('i', 'purchased', 'unit', 'due'): ['this'],
 ('purchased', 'this', 'due', 'to'): ['unit'],
 ('this', 'unit', 'to', 'frequent'): ['due'],
 ('unit', 'due', 'frequent', 'blackouts'): ['to'],
 ('due', 'to', 'blackouts', 'in'): ['frequent'],
 ('to', 'frequent', 'in', 'my'): ['blackouts'],
 ('frequent', 'blackouts', 'my', 'area'): ['in'],
 ('blackouts', 'in', 'area', 'and'): ['my'],
 ('in', 'my', 'and', '2'): ['area'],
 ('my', 'area', '2', 'power'): ['and'],
 ('area', 'and', 'power', 'supplies'): ['2'],
 ('and', '2', 'supplies', 'going'): ['power'],
 ('2', 'power', 'going', 'bad'): ['supplies'],
 ('power', 'supplies', 'bad', '.'): ['going'],
 ('supplies', 'going', '.', 'it'): ['bad'],
 ('going', 'bad', 'it', 'will'): ['.'],
 ('bad', '.', 'will', 'run'): ['it'],
 ('.', 'it', 'run', 'my'): ['will'],
 ('it', 'will', 'my', 'cable'): ['run'],
 ('will', 'run', 'cable', 'modem'): ['my'],
 ('run', 'my', 'modem', ','): ['cable'],
 ('my', 'cable', ',', 'router'): ['modem'],
 ('cable', 'modem', 'rout

In [31]:
# Turn each list of observed middle words into a probability table.
# Result: context tuple -> {middle word: relative frequency}.
pentagram_probabilities = {}
for context, middles in iteritems(pentagrams):
  # Contexts with a single distinct middle word offer no alternative to
  # substitute, so they are left out of the table.
  if len(set(middles)) <= 1:
    continue
  # Count the occurrences of each middle word.
  counts = {}
  for word in middles:
    counts[word] = counts.get(word, 0) + 1
  # Normalize the counts by the number of observations for this context.
  total = len(middles)
  pentagram_probabilities[context] = {
    word: float(count) / total for word, count in iteritems(counts)
  }

In [33]:
pentagram_probabilities

{('i', "'ve", 'them', 'for'): {'had': 0.5, 'used': 0.5},
 ("'ve", 'had', 'for', 'about'): {'it': 0.6666666666666666,
  'them': 0.3333333333333333},
 ('is', "n't", 'easy', 'to'): {'always': 0.5, 'that': 0.5},
 ("'s", 'a', 'case', 'that'): {'great': 0.3333333333333333,
  'rubber': 0.3333333333333333,
  'sturdy': 0.3333333333333333},
 ('.', 'this', 'has', 'a'): {'one': 0.5, 'tv': 0.5},
 ('.', 'and', 'the', 'price'): {'...': 0.5, 'at': 0.5},
 ('value', 'for', 'money', '.'): {'the': 0.3333333333333333,
  'your': 0.6666666666666666},
 ('i', 'like', 'very', 'much'): {'it': 0.6666666666666666,
  'them': 0.3333333333333333},
 ('like', 'it', 'much', ','): {'so': 0.3333333333333333,
  'very': 0.6666666666666666},
 (',', 'i', 'it', 'in'): {'put': 0.3333333333333333,
  'use': 0.6666666666666666},
 ('with', 'this', 'and', 'the'): {'brand': 0.5, 'card': 0.5},
 ('through', 'the', ',', 'i'): {'computer': 0.3333333333333333,
  'reviews': 0.3333333333333333,
  'stereo': 0.3333333333333333},
 ('do', "n't"

In [32]:
def random_sample(d):
  """Draw one word from a ``word -> probability`` dictionary.

  A single uniform number from ``random.random()`` is compared against the
  running sum of the probabilities, taken in the dictionary's iteration
  order; the first word whose running sum exceeds that number is returned.

  Args:
    d: dict mapping each candidate word to its probability.

  Returns:
    The sampled word. If the running sum never exceeds the drawn number
    (for example when floating-point rounding makes the probabilities add
    up to slightly less than 1), the last word visited is returned instead
    of falling through to ``None``. ``None`` is returned only when ``d`` is
    empty.
  """
  r = random.random()
  cumulative = 0
  w = None  # stays None only when d is empty
  for w, p in d.items():
    cumulative += p
    if r < cumulative:
      return w
  # The loop ended without a hit: the total probability mass was <= r.
  # Fall back to the last candidate so callers always get a usable word.
  return w

In [38]:
def test_spinner(replacement_prob=0.2):
  """Print a randomly chosen positive review followed by a "spun" variant.

  The review text is lower-cased and tokenized. For each five-token window,
  with probability ``replacement_prob``, the middle token is replaced by a
  word sampled from ``pentagram_probabilities`` for the window's four outer
  tokens, provided that context is present in the table. Replacements are
  written into the token list in place, so later windows see earlier
  substitutions.

  Args:
    replacement_prob: chance of attempting a replacement at each window
      position. Defaults to 0.2, the value that was previously hard-coded.

  Returns:
    None. The original and spun texts are written to standard output.
  """
  review = random.choice(positive_reviews)
  s = review.text.lower()
  print("Original:", s)
  tokens = nltk.tokenize.word_tokenize(s)
  for i in range(len(tokens) - 4):
    if random.random() < replacement_prob:
      # Context key: the two tokens before and the two after the middle one.
      k = (tokens[i], tokens[i + 1], tokens[i + 3], tokens[i + 4])
      if k in pentagram_probabilities:
        w = random_sample(pentagram_probabilities[k])
        # Guard: a sampler that yields None must not put None into the
        # token list, because the join below would raise TypeError.
        if w is not None:
          tokens[i + 2] = w
  print("Spun:")
  # Re-join the tokens and undo the spaces tokenization put around punctuation.
  print(" ".join(tokens).replace(" .", ".").replace(" '", "'").replace(" ,", ",").replace("$ ", "$").replace(" !", "!"))

if __name__ == '__main__':
  test_spinner()
  

Original: 
i have owned many sony cameras in the past and am currently using two right now. the only problem i have encountered with any digital camera is that when you think you have enough memory, you run out when you need it most. that is why i upgraded all of my cameras from 256 and 512 meg memory sticks to the sony 1 gig. actually i carry a spare 1 gig too where ever i go. 

look, when you take video with your digital camera or you use a high file size (3-6 megapixels) per picture, which you should always do to get the best possible prints made, your camera will need a ton of storage space.  especially if you are on vacation or encounter a great photo moment or maybe even something you hadn't expected like a wildfire, a plane crash, a bank robbery or maybe even a cat in the park having kittens. you can always "resize" you photos to a smaller size later when you save then to your computer if you want to save space. trust me, take large megapixel pics now and you will be amazed at h