In [1]:
import os
import shutil
import re
import time
import logging

import nltk
from nltk.stem import WordNetLemmatizer

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

import scipy.sparse as sparse

In [2]:
class LemmaTokenizer:
    """Callable tokenizer: whitespace-split a document and lemmatize kept tokens.

    A whitespace-separated token is kept only when all of the following hold:

    * it is at least two characters long,
    * it starts with a lowercase ASCII letter (``a``-``z``),
    * ``token_pattern`` matches at the start of the token.

    Kept tokens are passed through ``WordNetLemmatizer.lemmatize``.

    Note that ``token_pattern`` is only used as a filter (``match`` at the
    start of the token); the token itself is not trimmed to the matched text.
    """

    # Default filter pattern: two or more word characters.
    _token_pattern = re.compile(r"(?u)\b\w\w+\b")
    # Compiled once instead of handing the pattern string to ``re.match`` for
    # every token. Only the first character matters for a yes/no match, so the
    # trailing ``.*`` of the original expression is not needed.
    _leading_lowercase = re.compile(r"[a-z]")

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    @property
    def token_pattern(self):
        """Compiled regular expression a token must match (at its start) to be kept."""
        return self._token_pattern

    @token_pattern.setter
    def token_pattern(self, s):
        """Set the filter pattern from a pattern string or a compiled pattern.

        Raises:
            AttributeError: if ``s`` is neither a ``str`` nor a compiled
                regular expression.
        """
        if isinstance(s, str):
            self._token_pattern = re.compile(s)
        elif isinstance(s, type(re.compile(""))):
            self._token_pattern = s
        else:
            raise AttributeError(
                "token_pattern must be a str or a compiled regular "
                "expression, got {}".format(type(s).__name__))

    def __call__(self, doc):
        """Return the list of lemmatized tokens kept from ``doc`` (a ``str``)."""
        pattern = self.token_pattern
        starts_lowercase = self._leading_lowercase.match
        return [
            self.wnl.lemmatize(t) for t in doc.split()
            if len(t) >= 2 and starts_lowercase(t) and pattern.match(t)
        ]

In [3]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jinma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
website = "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip"

In [5]:
from datasets import load_dataset

In [6]:
data = load_dataset(
    path="./wlda/wikitext.py", name="wikitext-103-v1", cache_dir="data")

Reusing dataset wikitext (data\wikitext\wikitext-103-v1\1.0.0\8ae2a41908b3b12285d41e5b92b82eb1837e7053db277a34d471f19c5e0888af)


In [7]:
docs = data["train"]["text"]

In [8]:
vectorizer=CountVectorizer(
    input='content', analyzer='word', stop_words='english',
    tokenizer=LemmaTokenizer(),
    max_df=0.8, min_df=3, max_features=20000)

In [9]:
%%time
vectorizer.fit(docs)

  'stop_words.' % sorted(inconsistent))


Wall time: 6min 57s


CountVectorizer(max_df=0.8, max_features=20000, min_df=3, stop_words='english',
                tokenizer=<__main__.LemmaTokenizer object at 0x000001D9933F5668>)

In [10]:
vectorizer.transform([docs[0]])

<1x20000 sparse matrix of type '<class 'numpy.int64'>'
	with 743 stored elements in Compressed Sparse Row format>

In [11]:
vectorizer2=CountVectorizer(
    input='content', analyzer='word', stop_words='english',
    tokenizer=LemmaTokenizer(),
    max_df=0.8, min_df=3, max_features=20000)

In [12]:
vectorizer2.transform([docs[0]])

NotFittedError: Vocabulary not fitted or provided

In [8]:
import array

In [9]:
from scipy.sparse import csr_matrix

In [10]:
from collections import defaultdict

In [11]:
import numpy as np

In [22]:
voca, indptr, indices = [], [], []
values = array.array(str("i")) # signed integer
values

array('i')

In [12]:
import pickle

In [87]:
# Persist the fitted vectorizer. The context manager closes the file handle
# even if pickling fails; the original left the handle from open() unclosed.
with open("vectorizer.pkl", "wb") as fh:
    pickle.dump(vectorizer, fh, protocol=pickle.HIGHEST_PROTOCOL)

In [29]:
tokenizer = LemmaTokenizer()

In [28]:
len(tokenizer(docs[0]))

2535

In [13]:
max_features = 20000

In [14]:
from collections import Counter

In [15]:
vectorizer2=CountVectorizer(
    input='content', analyzer='word', stop_words='english',
    tokenizer=LemmaTokenizer(),
    max_df=0.8, min_df=3, max_features=20000)

1. `validate_params`

In [16]:
vectorizer2.ngram_range # (min_n, max_n)

(1, 1)

2. `validate_vocabulary`

3. Setting up the vocabulary

In [17]:
vectorizer2.max_df, vectorizer2.min_df

(0.8, 3)

In [18]:
# Auto-indexing vocabulary: looking up a term that is not yet present inserts
# it with the current size of the dict as its value, so every new term gets
# the next integer index.
vocabulary = defaultdict()
vocabulary.default_factory = vocabulary.__len__
vocabulary

defaultdict(<method-wrapper '__len__' of collections.defaultdict object at 0x000001FB74AFB7C8>,
            {})

In [19]:
vectorizer2.stop_words

'english'

In [21]:
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS

In [22]:
# NOTE(review): `analzer` looks like a typo for `analyzer`; the name is not
# referenced in any other visible cell - confirm before renaming.
analzer = "word"
# http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words
stop_words = ENGLISH_STOP_WORDS
# Same tokenizer class that is passed to the CountVectorizer instances above.
tokenize = LemmaTokenizer()

In [23]:
strip_accents = None

In [24]:
from functools import partial


def _preprocess(doc, accent_function, lower):
    if lower:
        doc = doc.lower()
    if accent_function is not None:
        doc = accent_function(doc)
    return doc


preprocess = partial(_preprocess, accent_function=strip_accents, lower=True)

In [25]:
# Run every stop word through the same preprocess + tokenize pipeline and
# collect the resulting tokens that are not themselves stop words.
inconsistent = set()
for w in stop_words:
    inconsistent.update(
        token for token in tokenize(preprocess(w)) if token not in stop_words)
_stop_words_id = id(stop_words)

In [26]:
not inconsistent

False

In [27]:
def _analyze(doc, analyzer=None, tokenizer=None, ngrams=None,
             preprocessor=None, decoder=None, stop_words=None):
    if decoder is not None:
        doc = decoder(doc)
    if analyzer is not None:
        doc = analyzer(doc)
    else:
        if preprocessor is not None:
            doc = preprocessor(doc)
        if tokenizer is not None:
            doc = tokenizer(doc)
        if ngrams is not None:
            if stop_words is not None:
                doc = ngrams(doc, stop_words)
            else:
                doc = ngrams(doc)
    return doc

In [30]:
tokenizer("i like you")

['like', 'you']

In [32]:
def _word_ngrams(tokens, stop_words=None, ngram_range=(1,1)):
    if stop_words is not None:
        tokens = [w for w in tokens if w not in stop_words]
    # handle token n-grams
    min_n, max_n = ngram_range
    if max_n != 1:
        original_tokens = tokens
        if min_n == 1:
            tokens = list(original_tokens)
            min_n += 1
        else:
            tokens = []
        n_original_tokens = len(original_tokens)
        
        # bind method outside of loop to reduce overhead
        tokens_append = tokens.append
        space_join = " ".join
        
        for n in range(min_n, min(max_n+1, n_original_tokens+1)):
            for i in range(n_original_tokens-n+1):
                tokens_append(space_join(original_tokens[i:i+n]))
    return tokens

In [33]:
tokens = tokenizer("my favorite food is sea food")
print(tokens)
_word_ngrams(tokens, stop_words=["is"], ngram_range=(1,5))

['my', 'favorite', 'food', 'is', 'sea', 'food']


['my',
 'favorite',
 'food',
 'sea',
 'food',
 'my favorite',
 'favorite food',
 'food sea',
 'sea food',
 'my favorite food',
 'favorite food sea',
 'food sea food',
 'my favorite food sea',
 'favorite food sea food',
 'my favorite food sea food']

In [34]:
vectorizer2.decode_error

'strict'

In [35]:
def decode(doc, input_type="content", encoding="utf-8", decode_error="strict"):
    """Load and decode a document into a ``str``.

    Args:
        doc: the document itself, a path to it, or an open file-like object,
            depending on ``input_type``.
        input_type: ``"filename"`` reads the bytes of the file at path
            ``doc``; ``"file"`` calls ``doc.read()``; any other value
            (default ``"content"``) uses ``doc`` as given.
        encoding: codec used when the document is ``bytes``. Defaults to
            ``"utf-8"``, the value that was previously hard-coded.
        decode_error: error policy passed to ``bytes.decode`` (for example
            ``"strict"``, ``"ignore"``, ``"replace"``). Defaults to
            ``"strict"``, the value that was previously hard-coded.

    Returns:
        The document; ``bytes`` input is decoded, anything else is returned
        unchanged.

    Raises:
        ValueError: if the document is ``np.nan``.
        UnicodeDecodeError: if decoding fails under the ``"strict"`` policy.
    """
    if input_type == "filename":
        with open(doc, "rb") as fh:
            doc = fh.read()
    elif input_type == "file":
        doc = doc.read()

    if isinstance(doc, bytes):
        doc = doc.decode(encoding, decode_error)

    # Identity check: only the np.nan singleton itself is rejected here.
    if doc is np.nan:
        raise ValueError(
            "np.nan is an invalid document, expected byte or unicode string.")

    return doc

In [36]:
tokenize

<__main__.LemmaTokenizer at 0x1fb9856a550>

In [37]:
analyze = partial(_analyze, ngrams=_word_ngrams,
                  tokenizer=tokenize, preprocessor=preprocess,
                  decoder=decode, stop_words=stop_words)

In [38]:
vectorizer2._validate_vocabulary()

In [39]:
vectorizer2.fixed_vocabulary_

False

In [40]:
def _make_int_array():
    return array.array(str("i"))

In [41]:
_v = defaultdict()

In [42]:
_v.default_factory = _v.__len__

In [43]:
_v["i"]

0

In [44]:
%%time
# Build the three CSR components one document at a time:
#   j_indices - feature (column) indices of the counted terms,
#   values    - the matching per-document counts,
#   indptr    - offsets into j_indices/values marking where each document ends.
j_indices = []
indptr = [0]
values = _make_int_array()
for doc in docs:
    feature_counter = {}
    for feature in analyze(doc):
        try:
            # With the auto-indexing defaultdict, an unseen feature is
            # assigned a new index here rather than raising; the KeyError
            # branch is only reachable if `vocabulary` is a plain dict.
            feature_idx = vocabulary[feature]
        except KeyError:
            print("ERROR")
            continue
        feature_counter[feature_idx] = feature_counter.get(feature_idx, 0) + 1
    j_indices.extend(feature_counter.keys())
    values.extend(feature_counter.values())
    indptr.append(len(j_indices))

Wall time: 6min 50s


In [45]:
vocabulary = dict(vocabulary)

In [50]:
indptr[-1] > np.iinfo(np.int32).max

False

In [51]:
indices_dtype = np.int32

In [52]:
len(vocabulary)

203446

In [53]:
j_indices = np.asarray(j_indices, dtype=indices_dtype)
indptr = np.asarray(indptr, dtype=indices_dtype)
values = np.frombuffer(values, dtype=np.intc)

In [54]:
values.shape, j_indices.shape, indptr.shape

((19320318,), (19320318,), (29445,))

In [55]:
import scipy.sparse as sp

In [56]:
X = sp.csr_matrix((values, j_indices, indptr),
              shape=(len(indptr)-1, len(vocabulary)),
              dtype=np.int64)

In [58]:
import copy
_j_indices = copy.deepcopy(X.indices)

In [59]:
X.sort_indices()

In [60]:
X.indices.shape

(19320318,)

In [61]:
np.equal(_j_indices, X.indices).all()

False

In [62]:
len(vocabulary)

203446

In [63]:
X

<29444x203446 sparse matrix of type '<class 'numpy.int64'>'
	with 19320318 stored elements in Compressed Sparse Row format>

In [64]:
vectorizer2.binary

False

In [65]:
n_doc = X.shape[0]
n_doc

29444

In [66]:
max_df = vectorizer2.max_df
min_df = vectorizer2.min_df
max_df, min_df

(0.8, 3)

In [67]:
import numbers

In [68]:
isinstance(min_df, numbers.Integral), isinstance(max_df, numbers.Integral)

(True, False)

In [69]:
max_doc_count = max_df * n_doc
min_doc_count = min_df
max_doc_count, min_doc_count

(23555.2, 3)

In [70]:
X.indices

array([     0,      1,      2, ..., 113659, 130858, 195014])

In [71]:
# _sort_features
# Renumber the vocabulary so that feature indices follow the sorted order of
# the terms, then remap the column indices stored in X accordingly.
# NOTE: this rewrites both `vocabulary` and `X.indices` in place, so the cell
# is not idempotent from the reader's point of view: outputs shown by earlier
# cells (e.g. X.indices) are stale once it has run.
sorted_features = sorted(vocabulary.items())
# map_index[old_index] == new_index
map_index = np.empty(len(sorted_features), dtype=X.indices.dtype)
for new_val, (term, old_val) in enumerate(sorted_features):
    vocabulary[term] = new_val
    map_index[old_val] = new_val
X.indices = map_index.take(X.indices, mode="clip")

In [72]:
X.indices

array([190852,  32715,  83216, ...,  25047, 128116,  10166])

In [73]:
def _limit_features(self, X, vocabulary, high=None, low=None,
                    limit=None):
    """Remove too rare or too common features.
    Prune features that are non zero in more samples than high or less
    documents than low, modifying the vocabulary, and restricting it to
    at most the limit most frequent.
    This does not prune samples with zero features.
    """
    if high is None and low is None and limit is None:
        return X, set()

    # Calculate a mask based on document frequencies
    dfs = _document_frequency(X)
    mask = np.ones(len(dfs), dtype=bool)
    if high is not None:
        mask &= dfs <= high
    if low is not None:
        mask &= dfs >= low
    if limit is not None and mask.sum() > limit:
        tfs = np.asarray(X.sum(axis=0)).ravel()
        mask_inds = (-tfs[mask]).argsort()[:limit]
        new_mask = np.zeros(len(dfs), dtype=bool)
        new_mask[np.where(mask)[0][mask_inds]] = True
        mask = new_mask

    new_indices = np.cumsum(mask) - 1  # maps old indices to new
    removed_terms = set()
    for term, old_index in list(vocabulary.items()):
        if mask[old_index]:
            vocabulary[term] = new_indices[old_index]
        else:
            del vocabulary[term]
            removed_terms.add(term)
    kept_indices = np.where(mask)[0]
    if len(kept_indices) == 0:
        raise ValueError("After pruning, no terms remain. Try a lower"
                         " min_df or a higher max_df.")
    return X[:, kept_indices], removed_terms

In [74]:
# Calculate a mask based on document frequencies
if sp.isspmatrix_csr(X):
    dfs = np.bincount(X.indices, minlength=X.shape[1])
else:
    dfs = np.diff(X.indptr)
dfs

array([  9,   4, 122, ...,   3,   2,   9], dtype=int64)

In [75]:
mask = np.ones(len(dfs), dtype=bool)
mask

array([ True,  True,  True, ...,  True,  True,  True])

In [76]:
mask &= dfs <= max_doc_count
mask &= dfs >= min_doc_count
mask

array([ True,  True,  True, ...,  True, False,  True])

In [77]:
mask.sum() > max_features

True

In [79]:
tfs = np.asarray(X.sum(axis=0)).ravel()
tfs

array([  9,   4, 290, ...,   3,   3,  14], dtype=int64)

In [80]:
tfs.shape

(203446,)

In [81]:
(-tfs).argsort()

array([194715, 181522, 200865, ..., 163407,  66830,  24708], dtype=int64)

In [82]:
mask_inds = (-tfs[mask]).argsort()[:max_features]
mask_inds

array([147178,  54761,  90653, ...,  60786,  62603, 119810], dtype=int64)

In [83]:
new_mask = np.zeros(len(dfs), dtype=bool)
new_mask[np.where(mask)[0][mask_inds]] = True
mask = new_mask
mask

array([False, False,  True, ..., False, False, False])

In [84]:
mask.sum()

20000

In [85]:
# Re-index the surviving terms and drop the rest, using the boolean `mask`
# computed in the preceding cells (True = feature is kept).
new_indices = np.cumsum(mask) - 1  # maps old indices to new
removed_terms = set()
# Iterate over a snapshot (list(...)) because entries are deleted from
# `vocabulary` inside the loop.
for term, old_index in list(vocabulary.items()):
    if mask[old_index]:
        vocabulary[term] = new_indices[old_index]
    else:
        del vocabulary[term]
        removed_terms.add(term)
len(removed_terms)

183446

In [86]:
kept_indices = np.where(mask)[0]
kept_indices

array([     2,     26,     51, ..., 203376, 203378, 203440], dtype=int64)

In [87]:
if len(kept_indices) == 0:
    raise ValueError("After pruning, no terms remain. Try a lower"
                     " min_df or a higher max_df.")

In [282]:
X = X[:, kept_indices]
stop_words_ = removed_terms