# Import Libraries

In [10]:
import array
from scipy.sparse import csr_matrix
from collections import defaultdict
import numpy as np

import os
import shutil
import re
import time
import logging

import nltk
from nltk.stem import WordNetLemmatizer

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

from functools import partial

# Load Datasets

In [1]:
from datasets import load_dataset


data = load_dataset(
    path="./wlda/wikitext.py", name="wikitext-103-v1", cache_dir="data")

Reusing dataset wikitext (data\wikitext\wikitext-103-v1\1.0.0\8ae2a41908b3b12285d41e5b92b82eb1837e7053db277a34d471f19c5e0888af)


In [25]:
docs = data["train"]["text"][:1000]

# Common Analyzer

In [9]:
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS

In [8]:
class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
        self.token_pattern = re.compile(r"(?u)\b\w\w+\b")

    def __call__(self, doc):
        return [
            self.wnl.lemmatize(t) for t in doc.split() 
            if (len(t) >= 2 and 
                re.match("[a-z].*", t) and 
                re.match(self.token_pattern, t))
        ]

In [11]:
analzer = "word"
# http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words
stop_words = ENGLISH_STOP_WORDS
tokenize = LemmaTokenizer()

strip_accents = None

In [12]:
def _preprocess(doc, accent_function, lower):
    if lower:
        doc = doc.lower()
    if accent_function is not None:
        doc = accent_function(doc)
    return doc


preprocess = partial(_preprocess, accent_function=strip_accents, lower=True)

In [13]:
inconsistent = set()
for w in stop_words:
    tokens = list(tokenize(preprocess(w)))
    for token in tokens:
        if token not in stop_words:
            inconsistent.add(token)
_stop_words_id = id(stop_words)

def _analyze(doc, analyzer=None, tokenizer=None, ngrams=None,
             preprocessor=None, decoder=None, stop_words=None):
    if decoder is not None:
        doc = decoder(doc)
    if analyzer is not None:
        doc = analyzer(doc)
    else:
        if preprocessor is not None:
            doc = preprocessor(doc)
        if tokenizer is not None:
            doc = tokenizer(doc)
        if ngrams is not None:
            if stop_words is not None:
                doc = ngrams(doc, stop_words)
            else:
                doc = ngrams(doc)
    return doc

In [14]:
def _word_ngrams(tokens, stop_words=None, ngram_range=(1,1)):
    if stop_words is not None:
        tokens = [w for w in tokens if w not in stop_words]
    # handle token n-grams
    min_n, max_n = ngram_range
    if max_n != 1:
        original_tokens = tokens
        if min_n == 1:
            tokens = list(original_tokens)
            min_n += 1
        else:
            tokens = []
        n_original_tokens = len(original_tokens)
        
        # bind method outside of loop to reduce overhead
        tokens_append = tokens.append
        space_join = " ".join
        
        for n in range(min_n, min(max_n+1, n_original_tokens+1)):
            for i in range(n_original_tokens-n+1):
                tokens_append(space_join(original_tokens[i:i+n]))
    return tokens

In [15]:
def decode(doc, input_type="content"):
    if input_type == "filename":
        with open(doc, "rb") as fh:
            doc = fh.read()
    elif input_type == "file":
        doc = doc.read()
    
    if isinstance(doc, bytes):
        doc = doc.decode("utf-8", "strict")
    
    if doc is np.nan:
        raise ValueError
        
    return doc

In [16]:
analyze = partial(_analyze, ngrams=_word_ngrams,
                  tokenizer=tokenize, preprocessor=preprocess,
                  decoder=decode, stop_words=stop_words)

# Scikit-Learn's CounterVectorizer

In [17]:
vocabulary = defaultdict()
vocabulary.default_factory = vocabulary.__len__

In [19]:
def _make_int_array():
    return array.array(str("i"))

In [20]:
j_indices = []
indptr = []
values = _make_int_array()
indptr.append(0)

In [43]:
%%time
vocabs = [w for doc in docs for w in analyze(doc)]

Wall time: 12.1 s


In [44]:
len(vocabs)

1397765

In [27]:
%%time
for doc in docs:
    feature_counter = {}
    for feature in analyze(doc):
        try:
            feature_idx = vocabulary[feature]
            if feature_idx not in feature_counter:
                feature_counter[feature_idx] = 1
            else:
                feature_counter[feature_idx] += 1
        except KeyError:
            print("ERROR")
            continue
    j_indices.extend(feature_counter.keys())
    values.extend(feature_counter.values())
    indptr.append(len(j_indices))

Wall time: 12.7 s


In [28]:
vocabulary = dict(vocabulary)
len(vocabulary)

59667

In [31]:
j_indices = np.asarray(j_indices, dtype=np.int32)
indptr = np.asarray(indptr, dtype=np.int32)
values = np.frombuffer(values, dtype=np.intc)

values.shape, j_indices.shape, indptr.shape

((606377,), (606377,), (1001,))

In [32]:
X = csr_matrix((values, j_indices, indptr),
               shape=(len(indptr)-1, len(vocabulary)),
               dtype=np.int64)

In [34]:
X.todense()

matrix([[54, 39, 15, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        ...,
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  1, ...,  0,  0,  0],
        [ 0,  2,  4, ...,  2,  1,  1]], dtype=int64)

# Torch's CountVectorizer

In [36]:
import torch

In [35]:
values = array.array(str("i"))
j_indices = []
indptr = []

In [40]:
analyzed_docs = [analyze(doc) for doc in docs]

In [41]:
for doc in analyzed_docs

['valkyria',
 'chronicle',
 'iii',
 'senjō',
 'valkyria',
 'chronicle',
 'japanese',
 'lit',
 'valkyria',
 'battlefield',
 'commonly',
 'referred',
 'valkyria',
 'chronicle',
 'iii',
 'outside',
 'japan',
 'tactical',
 'role',
 'playing',
 'video',
 'game',
 'developed',
 'sega',
 'media.vision',
 'playstation',
 'portable',
 'released',
 'january',
 'japan',
 'game',
 'valkyria',
 'series',
 'employing',
 'fusion',
 'tactical',
 'real',
 'time',
 'gameplay',
 'predecessor',
 'story',
 'run',
 'parallel',
 'game',
 'follows',
 'nameless',
 'penal',
 'military',
 'unit',
 'serving',
 'nation',
 'gallia',
 'second',
 'europan',
 'war',
 'perform',
 'secret',
 'black',
 'operation',
 'pitted',
 'imperial',
 'unit',
 'raven',
 'game',
 'began',
 'development',
 'carrying',
 'large',
 'portion',
 'work',
 'valkyria',
 'chronicle',
 'ii',
 'retained',
 'standard',
 'feature',
 'series',
 'underwent',
 'multiple',
 'adjustment',
 'making',
 'game',
 'forgiving',
 'series',
 'newcomer',
 'char

In [38]:
vocabulary

{'valkyria': 0,
 'chronicle': 1,
 'iii': 2,
 'senjō': 3,
 'japanese': 4,
 'lit': 5,
 'battlefield': 6,
 'commonly': 7,
 'referred': 8,
 'outside': 9,
 'japan': 10,
 'tactical': 11,
 'role': 12,
 'playing': 13,
 'video': 14,
 'game': 15,
 'developed': 16,
 'sega': 17,
 'media.vision': 18,
 'playstation': 19,
 'portable': 20,
 'released': 21,
 'january': 22,
 'series': 23,
 'employing': 24,
 'fusion': 25,
 'real': 26,
 'time': 27,
 'gameplay': 28,
 'predecessor': 29,
 'story': 30,
 'run': 31,
 'parallel': 32,
 'follows': 33,
 'nameless': 34,
 'penal': 35,
 'military': 36,
 'unit': 37,
 'serving': 38,
 'nation': 39,
 'gallia': 40,
 'second': 41,
 'europan': 42,
 'war': 43,
 'perform': 44,
 'secret': 45,
 'black': 46,
 'operation': 47,
 'pitted': 48,
 'imperial': 49,
 'raven': 50,
 'began': 51,
 'development': 52,
 'carrying': 53,
 'large': 54,
 'portion': 55,
 'work': 56,
 'ii': 57,
 'retained': 58,
 'standard': 59,
 'feature': 60,
 'underwent': 61,
 'multiple': 62,
 'adjustment': 63,
 'm

In [13]:
t = torch.LongTensor(3, 12).random_(2, 20)
t

tensor([[14, 10, 11, 14,  9, 15,  5, 13, 13, 13, 16, 11],
        [ 9,  7, 12,  2,  3,  5,  6,  3,  2,  2, 12, 13],
        [ 3, 17,  4,  4,  7,  8, 17, 10, 18, 16, 11, 10]])

In [15]:
torch.bincount(t[0])

tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 2, 0, 3, 2, 1, 1])

In [8]:
values.extend(
    [1,5,1,4,3,2,5,6,3,2,7,8,1])
j_indices.extend(
    [2,5,0,1,2,4,5,1,3,0,3,5,0])
indptr.extend([0,2,7,9,10,12,13])

In [9]:
X = csr_matrix(
    (values, j_indices, indptr),
    shape=(len(indptr)-1, len(j_indices)),
    dtype=np.int64)

ValueError: Last value of index pointer should be less than the size of index and data arrays

In [None]:
X.todense()