In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re
import nltk
import numpy as np
import pandas as pd
import random

In [40]:
pet_analysis = pd.read_csv('pet_analysis.csv')
pet_analysis.dropna(subset=['Description'], inplace=True) #removing blank entries
wpt=nltk.WordPunctTokenizer()
stop_words=nltk.corpus.stopwords.words('english')
def normalize_document (doc):
    #lowercase and remove special characters\whitespace
    doc=re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A) #re.I ignore case sensitive, ASCII-only matching
    doc=doc.lower()
    doc=doc.strip()
    #tokenize document
    tokens=wpt.tokenize(doc)
    #filter stopwords out of document
    filtered_tokens=[token for token in tokens if token not in stop_words]
    #re-create documenr from filtered tokens
    doc=' '.join(filtered_tokens)
    return doc
normalize_corpus=np.vectorize(normalize_document)
norm_corpus=normalize_corpus(pet_analysis['Description'])
norm_corpus

array(['nibble month old ball cuteness energetic playful rescued couple cats months ago could get neutered time clinic fully scheduled result little kitty enough space funds care cats household looking responsible people take nibbles care',
       'found alone yesterday near apartment shaking bring home provide temporary care',
       'pregnant mother dumped irresponsible owner roadside near shops subang jaya gave birth roadside healthy adorable puppies already dewormed vaccinated ready go home tying caging long hours guard dogs however acceptable cage tie precautionary purposes interested adopt pls call',
       ..., 'mix breed good temperament kittens love humans friendly',
       'shyadventures independentshe hates cagesbut loves climbing trees rooftopshowever loving',
       'fili loves laying around also loves sun laidback quiet'],
      dtype='<U4018')

In [48]:
text = str(pet_analysis['Description'])
n = 3
ngrams = {} #declare the n-grams, which is a dictionary
# Creating the model, first we have to loop through the whole text, e.g. Glo - lob - oba - ...
for i in range(len(text)-n):#-n because there won't be nothing for the last 3 characters
    gram = text[i:i+n] #our n-gram, first iteration text [0:3] = Glo
    if gram not in ngrams.keys(): #if not in the dictionary, we add it
        ngrams[gram] = [] #initialize the list
    ngrams[gram].append(text[i+n]) #we append the chracter following the n-gram, text[0+3]=b
ngrams

{'0  ': [' ', ' ', ' ', ' ', ' ', ' '],
 '   ': [' ',
  ' ',
  ' ',
  ' ',
  ' ',
  'N',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  'I',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  'T',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  'G',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  'T',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  'T',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  'a',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  'S',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  'h',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  'V',
  ' ',
  ' ',
  ' ',
  ' ',
  'F',
  ' ',
  ' ',
  ' ',
  ' ',
  'K',
  ' ',
  ' ',
  ' ',
  ' ',
  'P',
  ' ',
  ' ',
  ' ',
  ' ',
  'H',
  ' ',
  ' ',
  ' ',
  ' ',
  'L',
  ' ',
  ' ',
  ' ',
  ' ',
  'W',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  't',
  ' ',
  ' ',
  ' ',
  ' ',
  's',
  ' ',
  ' ',
  ' ',
  ' ',
  'T',
  ' ',
  ' ',
  ' ',
  ' ',
  'B',
  ' ',
  ' ',
  ' ',
  ' ',
  'G',
  ' ',
  ' ',
  ' ',
  ' ',
  'T',
  ' ',
  ' ',
  ' ',
  ' ',
  'M',
  ' ',
 

In [49]:
#after successfully building the n-gram model we are going to build the small autocomplete application
# Testing our N-Gram Model    
currentGram = text[0:n] #start at a specific tri-gram, then we find the next character, here first three characters
result = currentGram
for i in range(100): #we generate a string with the length of 100, not the whole text.
    if currentGram not in ngrams.keys(): #if not in the dictionary list, we stop predicting
        break
    possibilities = ngrams[currentGram] #list of all characters that follows the specific n-gram
    nextItem = possibilities[random.randrange(len(possibilities))]#return a random element from the list
    result += nextItem
    currentGram = result[len(result)-n:len(result)] #new currentgram is the last three characters form the result

In [50]:
print(result)

0      Fili just was rescue..am hopings. n...
14965                Donut ...
14986         ...
14974   


In [51]:
# Order of the grams
n = 2
# Our N-Grams
ngrams = {}
words = nltk.word_tokenize(text)
for i in range(len(words)-n):
    gram = ' '.join(words[i:i+n])
    if gram not in ngrams.keys():
        ngrams[gram] = []
    ngrams[gram].append(words[i+n])# we construct a string out of words we get

In [52]:
ngrams

{'0 Nibble': ['is'],
 'Nibble is': ['a'],
 'is a': ['3+', 'stray', 'super', 'friendly', 'very', 'very', 'well'],
 'a 3+': ['month'],
 '3+ month': ['old'],
 'month old': ['ball', 'white'],
 'old ball': ['of'],
 'ball of': ['cuteness'],
 'of cuteness': ['.'],
 'cuteness .': ['He'],
 '. He': ['...'],
 'He ...': ['1'],
 '... 1': ['I'],
 '1 I': ['just'],
 'I just': ['found'],
 'just found': ['it'],
 'found it': ['alone'],
 'it alone': ['yesterday'],
 'alone yesterday': ['near'],
 'yesterday near': ['my'],
 'near my': ['apartm', 'office'],
 'my apartm': ['...'],
 'apartm ...': ['2'],
 '... 2': ['Their'],
 '2 Their': ['pregnant'],
 'Their pregnant': ['mother'],
 'pregnant mother': ['was'],
 'mother was': ['dumped'],
 'was dumped': ['by'],
 'dumped by': ['her', 'it'],
 'by her': ['irresp'],
 'her irresp': ['...'],
 'irresp ...': ['3'],
 '... 3': ['Good'],
 '3 Good': ['guard'],
 'Good guard': ['dog'],
 'guard dog': [','],
 'dog ,': ['very'],
 ', very': ['alert'],
 'very alert': [','],
 'alert ,

In [53]:
currentGram = ' '.join(words[0:n])
result = currentGram
for i in range(30):
    if currentGram not in ngrams.keys():
        break
    possibilities = ngrams[currentGram]
    nextItem = possibilities[random.randrange(len(possibilities))]
    result += ' '+nextItem
    rWords = nltk.word_tokenize(result)#last n words from the result
    currentGram = ' '.join(rWords[len(rWords)-n:len(rWords)])

In [55]:
print(result)

0 Nibble is a super playful kitten who is on the g ... 12 Peanut was an abused puppy until he was rescue ... 13 Hi Pet Lovers ! This is a
