In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF, MiniBatchNMF
from sklearn.pipeline import Pipeline
from bs4 import BeautifulSoup
from tqdm import tqdm

import sqlite3 as sql
import pandas as pd
import numpy as np
import joblib
import time

# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()
# SQLite database file; used as the default `db` by the helper functions below.
DB = 'StackSample.db'
# Shared random seed, passed as `random_state` to the NMF model for reproducibility.
SEED = 101

In [25]:
def get_query(query, db=DB, params=None):
    """Run a SQL query against a SQLite database and return a DataFrame.

    Parameters
    ----------
    query : str
        SQL statement to execute (use ``?`` placeholders with ``params``).
    db : str
        Path to the SQLite database file. Defaults to ``DB``.
    params : sequence or dict, optional
        Values bound to the placeholders in ``query``. Prefer this over
        formatting values into the SQL string.

    Returns
    -------
    pandas.DataFrame
        The full result set of the query.
    """
    conn = sql.connect(db)
    try:
        return pd.read_sql_query(query, conn, params=params)
    finally:
        # `with sqlite3.connect(...)` only commits/rolls back the transaction;
        # it does not close the connection, so close it explicitly.
        conn.close()

def get_text_generator(rowids, db=DB):
    """Lazily yield the ``Body`` text of each requested row of table ``df``.

    This is a generator function: the database is not opened until the first
    item is requested, and the resulting generator can be consumed only once.

    Parameters
    ----------
    rowids : iterable of int
        SQLite rowids to fetch, in the order they should be yielded.
    db : str
        Path to the SQLite database file. Defaults to ``DB``.

    Yields
    ------
    The ``Body`` value of each row.

    Raises
    ------
    IndexError
        If a rowid does not exist in table ``df``.
    """
    conn = sql.connect(db)
    try:
        for rowid in rowids:
            # Bind the rowid as a parameter instead of formatting it into the
            # SQL text; int() also lets numpy integers bind correctly.
            row = conn.execute(
                'select Body from df where rowid=?', (int(rowid),)
            ).fetchone()
            if row is None:
                raise IndexError(f'no row with rowid={rowid} in table df')
            yield row[0]
    finally:
        # The sqlite3 context manager does not close connections; do it here so
        # the handle is released even if the consumer stops iterating early.
        conn.close()
            
class Corpus():
    """Re-iterable, sized collection of document bodies stored in SQLite.

    Each iteration opens its own connection and reads the ``Body`` column of
    table ``df`` one row at a time, so the full text never has to be held in
    memory and the object can be iterated more than once.

    Parameters
    ----------
    rowids : sequence of int
        SQLite rowids of the documents, in iteration order.
    db : str
        Path to the SQLite database file.
    """

    def __init__(self, rowids, db):
        self.rowids = rowids
        self.db = db

    def __iter__(self):
        """Yield the ``Body`` of every rowid; raise IndexError if one is missing."""
        conn = sql.connect(self.db)
        try:
            for rowid in self.rowids:
                # Bind the rowid as a parameter instead of formatting it into
                # the SQL text; int() also lets numpy integers bind correctly.
                row = conn.execute(
                    'select Body from df where rowid=?', (int(rowid),)
                ).fetchone()
                if row is None:
                    raise IndexError(f'no row with rowid={rowid} in table df')
                yield row[0]
        finally:
            # The sqlite3 context manager only commits/rolls back; close the
            # connection explicitly, even when iteration is abandoned early.
            conn.close()

    def __len__(self):
        """Return the number of documents (rowids) in the corpus."""
        return len(self.rowids)

In [14]:
# Collect the rowids of every record with Split=0.
conn = sql.connect(DB)
try:
    rowids = [row[0] for row in conn.execute('select rowid from df where Split=0')]
finally:
    # Release the database handle; the helpers below open their own connections.
    conn.close()
len(rowids)

885496

In [23]:
# Build the lazy text stream; no rows are read until it is iterated.
corpus = get_text_generator(rowids=rowids, db=DB)
type(corpus)

generator

In [26]:
# Re-iterable view over the same rows; each Body is read from the database on demand.
corpus = Corpus(rowids=rowids, db=DB)

Now let's train an NMF model and benchmark how long the training takes. Remember, NMF itself is trained on the vectorized dataset held in memory; only the raw text is streamed from the database. For reference, I am running an Intel i5-1135G7 (×8) with 32 GB of RAM.

In [27]:
# TF-IDF over purely alphabetic tokens, with English stop words removed.
# Terms appearing in fewer than 2 documents or in more than 95% of them are dropped.
vect = TfidfVectorizer(
    stop_words='english',
    token_pattern=r'\b[a-z]+\b',
    max_df=0.95,
    min_df=2,
)

# 100-component NMF with seeded random initialisation for reproducible runs.
nmf = NMF(random_state=SEED, init='random', n_components=100)

In [28]:
# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments.
start = time.perf_counter()
X_train_vect = vect.fit_transform(corpus)
end = time.perf_counter()
print(f'Time to train vectorizer: {end-start:0.2f}s')

Time to train vectorizer: 81.17s


In [None]:
# perf_counter() is a monotonic, high-resolution clock suited to benchmarking.
start = time.perf_counter()
X_train_nmf = nmf.fit_transform(X_train_vect)
end = time.perf_counter()
# ':0.2f' prints two decimals, matching the vectorizer timing cell
# (the previous ':02f' meant "width 2" and printed six decimals).
print(f'Time to train NMF w/ {nmf.n_components} components: {end-start:0.2f}s')

In [None]:
# Derive the file name from the fitted model so it always matches n_components
# (yields 'nmf_100.joblib' for the current 100-component model).
joblib.dump(nmf, f'nmf_{nmf.n_components}.joblib')

Maybe it would be best to cycle through some different topic numbers and compare NMF to MiniBatchNMF.

In [None]:
for n_topics in [10, 20, 50, 100, 200]:
    