In [1]:
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
from scipy.sparse import csr_matrix
from collections import Counter

In [2]:
def fit(dataset):
    """Build a word -> column-index vocabulary from a list of documents.

    Each document is split on runs of whitespace (spaces, tabs, newlines),
    which is the same tokenization ``transform`` applies, so every word that
    ends up in the vocabulary can actually be matched later. Tokens shorter
    than two characters are ignored. Tokens are used as-is: no lower-casing
    and no punctuation stripping is performed.

    Parameters
    ----------
    dataset : list of str
        The documents to learn the vocabulary from.

    Returns
    -------
    dict or str
        A dict mapping each distinct word to its index in the alphabetically
        sorted list of words. If ``dataset`` is not a ``list``, the message
        string ``'Dataset must be an instance of list.'`` is returned instead.
    """
    if not isinstance(dataset, list):
        # Kept as a returned message rather than an exception so existing
        # callers that inspect the return value keep working.
        return 'Dataset must be an instance of list.'

    # split() with no argument splits on any whitespace and never yields
    # empty tokens; split(' ') would glue tab/newline-separated words together.
    unique_words = {
        word
        for row in dataset
        for word in row.split()
        if len(word) >= 2
    }
    # Sorting makes the column order deterministic.
    return {word: index for index, word in enumerate(sorted(unique_words))}

In [3]:
def transform(dataset, vocab):
    """Turn a list of documents into a sparse document-term count matrix.

    Parameters
    ----------
    dataset : list of str
        Documents to encode; each one is split on whitespace.
    vocab : dict
        Mapping from word to column index (looked up with ``vocab.get``).

    Returns
    -------
    scipy.sparse.csr_matrix or str
        A matrix of shape ``(len(dataset), len(vocab))`` whose entry
        ``[i, j]`` is the number of times the word with column index ``j``
        occurs in document ``i``. Words shorter than two characters and words
        missing from ``vocab`` are skipped. If ``dataset`` is not a ``list``,
        the message string ``'Dataset must be an instance of list.'`` is
        returned instead.
    """
    if not isinstance(dataset, list):
        return 'Dataset must be an instance of list.'

    # Parallel coordinate lists: (row, column) -> count.
    row_indices, col_indices, counts = [], [], []

    # tqdm wraps the iterable only to display a progress bar.
    for doc_no, document in enumerate(tqdm(dataset)):
        for token, occurrences in Counter(document.split()).items():
            if len(token) < 2:
                continue
            # -1 is the "not in vocabulary" sentinel.
            column = vocab.get(token, -1)
            if column == -1:
                continue
            row_indices.append(doc_no)
            col_indices.append(column)
            counts.append(occurrences)

    matrix_shape = (len(dataset), len(vocab))
    return csr_matrix((counts, (row_indices, col_indices)), shape=matrix_shape)

In [4]:
# Two sample documents used to exercise the hand-written vectorizer.
strings = [
    "the method of lagrange multipliers is the economists workhorse for solving optimization problems",
    "the technique is a centerpiece of economic theory but unfortunately its usually taught poorly",
]

# Learn the vocabulary, then show its words (dict keys, in insertion order).
vocab = fit(strings)
print(list(vocab))

# Encode the same documents and print the dense count matrix.
print(transform(strings, vocab).toarray())

['but', 'centerpiece', 'economic', 'economists', 'for', 'is', 'its', 'lagrange', 'method', 'multipliers', 'of', 'optimization', 'poorly', 'problems', 'solving', 'taught', 'technique', 'the', 'theory', 'unfortunately', 'usually', 'workhorse']


100%|██████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 2010.69it/s]


[[0 0 0 1 1 1 0 1 1 1 1 1 0 1 1 0 0 2 0 0 0 1]
 [1 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 1 1 1 1 1 0]]


In [5]:
from sklearn.feature_extraction.text import CountVectorizer
# Reference run: fit scikit-learn's CountVectorizer on the same `strings`
# and print its dense count matrix. fit() returns the fitted estimator,
# so construction and fitting can be chained.
vec = CountVectorizer(analyzer='word').fit(strings)
feature_matrix_2 = vec.transform(strings)
print(feature_matrix_2.toarray())

[[0 0 0 1 1 1 0 1 1 1 1 1 0 1 1 0 0 2 0 0 0 1]
 [1 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 1 1 1 1 1 0]]
