<a href="https://colab.research.google.com/github/kleczekr/tolkenizer/blob/master/vectorization_again.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
nltk.download('punkt')
import string

In [None]:
# def describe(self, fileids = None, categories = None):
#   counts = nltk.FreqDist()
#   tokens = nltk.FreqDist()

#   for para in self.paras(fileids, categories):
#     counts['paras'] += 1

#     for sent in para:
#       counts['sents'] += 1

#       for word, tag in sent:
#         counts['words'] += 1
#         tokens[word] += 1

#   return {
#       'paras':  counts['paras'],
#       'sents':  counts['sents'],
#       'words':  counts['words'],
#       'vocab':  len(tokens),
#       'lexdiv': float(counts['words']) / float(len(tokens)),
#       'sspar':  float(counts['sents']) / float(counts['paras'])
#   }

In [None]:
def tokenize(text):
  """Yield the lowercased, Snowball-stemmed word tokens of ``text``.

  Tokens that consist solely of punctuation characters are dropped.

  Args:
    text: The raw document as a string.

  Yields:
    The stem of each remaining token, in document order.
  """
  stem = nltk.stem.SnowballStemmer('english')
  punctuation = set(string.punctuation)

  for token in nltk.word_tokenize(text.lower()):
    # `token in string.punctuation` is a substring test against one long
    # string, so multi-character punctuation tokens such as '...' or '--'
    # were not filtered out. Checking every character catches them (and,
    # like the original test, also skips an empty token).
    if all(char in punctuation for char in token):
      continue
    yield stem.stem(token)

In [None]:
# Three toy documents (raw strings) used as input by the cells below.
corpus = [
  'The elephant potatoes saw sneeze sneeze sample sentence full stop boom.',
  'Bats echolocation see see bat sneeze see!',
  'This is another sample sentence but might be different from the sentence that is in the book.',
]

### Frequency vectorization

In [None]:
# NLTK solution:
from collections import defaultdict

def vectorize(doc):
  """Count how often each token produced by tokenize() occurs in ``doc``.

  Args:
    doc: The document handed to tokenize().

  Returns:
    A ``defaultdict(int)`` mapping each token to its number of occurrences.
  """
  # An int-backed defaultdict starts every unseen token at 0.
  counts = defaultdict(int)
  for stemmed in tokenize(doc):
    counts[stemmed] = counts[stemmed] + 1
  return counts

# map() is lazy and single-pass in Python 3: nothing would be computed here,
# and the result could not be inspected or iterated twice. Build the list now,
# while `vectorize` and `corpus` still refer to this cell's definitions.
vectors = list(map(vectorize, corpus))

In [None]:
# Scikit-Learn solution:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer is handed `corpus` directly; the tokenize() helper defined
# earlier is not used in this cell.
# NOTE(review): assumes `corpus` still holds the raw document strings here,
# i.e. that no later cell has rebound it - confirm the cell execution order.
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(corpus)

In [None]:
# gensim solution:
import gensim

# tokenize() returns a one-shot generator. Dictionary() iterates every
# document to build its vocabulary, which would leave the generators exhausted
# and make doc2bow() below see only empty documents. Keep the tokens in lists
# so each document can be read more than once.
# Note: this rebinds `corpus` from raw strings to lists of tokens.
corpus = [list(tokenize(doc)) for doc in corpus]
id2word = gensim.corpora.Dictionary(corpus)
vectors = [
  # gensim freq encoder is doc2bow; takes single doc instance
  id2word.doc2bow(doc) for doc in corpus
]

### One-hot encoding

In [None]:
# NLTK solution:
def vectorize(doc):
  """Build a presence map for the tokens of one document.

  Args:
    doc: An iterable of hashable tokens.

  Returns:
    A dict with one key per distinct token, each mapped to ``True``, in order
    of first appearance.
  """
  return dict.fromkeys(doc, True)

# map() is lazy and single-pass in Python 3: nothing would be computed here,
# and the result could not be inspected or iterated twice. Build the list now,
# while `vectorize` and `corpus` still refer to the values this cell expects.
vectors = list(map(vectorize, corpus))

In [None]:
# Scikit-Learn solution:
from sklearn.preprocessing import Binarizer

# CountVectorizer is not imported in this cell; it relies on the import made
# by the earlier Scikit-Learn frequency cell.
# Note: `corpus` is rebound here, first to the CountVectorizer output and then
# to the Binarizer output, so the original documents are no longer reachable
# through this name afterwards.
# NOTE(review): assumes `corpus` holds raw document strings on entry - confirm
# the cell execution order.
freq = CountVectorizer()
corpus = freq.fit_transform(corpus)

# Binarizer converts frequency values to 1
onehot = Binarizer()
corpus = onehot.fit_transform(corpus.toarray())

In [None]:
# gensim solution:
# tokenize() returns a one-shot generator. Dictionary() iterates every
# document to build its vocabulary, which would leave the generators exhausted
# and make doc2bow() below see only empty documents. Keep the tokens in lists
# so each document can be read more than once.
# NOTE(review): tokenize() lowercases its argument, so `corpus` must hold raw
# strings when this cell runs - confirm the cell execution order.
corpus = [list(tokenize(doc)) for doc in corpus]
id2word = gensim.corpora.Dictionary(corpus)
vectors = [
  # Keep each token id from the bag-of-words pairs and replace its count by 1.
  [(token_id, 1) for token_id, _count in id2word.doc2bow(doc)]
  for doc in corpus
]

### TF-IDF

In [None]:
# NLTK solution:
from nltk.text import TextCollection

def vectorize(corpus):
  """Yield one ``{term: tf-idf score}`` dict per document in ``corpus``.

  Args:
    corpus: An iterable of documents, each accepted by tokenize().

  Yields:
    For every document, a dict mapping each of its tokens to the score
    returned by ``TextCollection.tf_idf`` for that token and document.
  """
  # tokenize() returns a one-shot generator, but every document is read
  # several times below: by TextCollection, by tf_idf() and by the loop
  # itself. With generators the documents would already be exhausted when the
  # loop runs, so store the tokens as lists.
  corpus = [list(tokenize(doc)) for doc in corpus]
  texts = TextCollection(corpus)

  for doc in corpus:
    yield {
        term: texts.tf_idf(term, doc)
        for term in doc
    }

In [None]:
# Scikit-Learn solution:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer is handed `corpus` directly; the tokenize() helper is not
# used in this cell.
# Note: `corpus` is rebound to the fit_transform() result, so the original
# documents are no longer reachable through this name afterwards.
# NOTE(review): assumes `corpus` holds raw document strings on entry - confirm
# the cell execution order.
tfidf = TfidfVectorizer()
corpus = tfidf.fit_transform(corpus)

In [None]:
# gensim solution
# tokenize() returns a one-shot generator. Dictionary() iterates every
# document to build the lexicon, which would leave the generators exhausted
# and make doc2bow() below see only empty documents. Keep the tokens in lists
# so each document can be read more than once.
# NOTE(review): tokenize() lowercases its argument, so `corpus` must hold raw
# strings when this cell runs - confirm the cell execution order.
corpus = [list(tokenize(doc)) for doc in corpus]
lexicon = gensim.corpora.Dictionary(corpus)
tfidf = gensim.models.TfidfModel(dictionary=lexicon, normalize=True)
# Indexing the model with a bag-of-words document returns its weighted form.
vectors = [tfidf[lexicon.doc2bow(doc)] for doc in corpus]

### Distributed representation

In [None]:
from gensim.models.doc2vec import TaggedDocument, Doc2Vec

# Materialize the token generators so each document is a concrete word list.
# NOTE(review): tokenize() lowercases its argument, so `corpus` must hold raw
# strings when this cell runs - confirm the cell execution order.
corpus = [list(tokenize(doc)) for doc in corpus]
# Tag every document with a unique label: 'd0', 'd1', ...
corpus = [
  TaggedDocument(words, ['d{}'.format(idx)])
  for idx, words in enumerate(corpus)
]

# The `size` keyword was renamed to `vector_size`: gensim 3.x only accepts
# `size` with a deprecation warning and gensim 4 rejects it, while
# `vector_size` is accepted by both.
model = Doc2Vec(corpus, vector_size=5, min_count=0)
# NOTE(review): gensim 4 renames `docvecs` to `dv` and keeps `docvecs` only as
# a deprecated alias - switch once gensim 3.x support is no longer needed.
print(model.docvecs[0])

### Pipeline?