In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF, MiniBatchNMF
from sklearn.pipeline import Pipeline
from bs4 import BeautifulSoup
from tqdm import tqdm

import sqlite3 as sql
import pandas as pd
import numpy as np
import joblib
import time

# Single seed shared by the random split and the NMF models below.
SEED = 101

# Enable the tqdm-backed progress_* methods on pandas objects.
tqdm.pandas()

In [14]:
def strip_html(html):
    """Return the text content of an HTML fragment with the markup removed.

    The fragment is parsed by BeautifulSoup using the lxml parser, and the
    text of the parsed document is returned as one string.
    """
    soup = BeautifulSoup(html, features="lxml")
    return soup.get_text()

Load the questions dataset and create a corpus.

In [5]:
# Read the StackSample questions file, decoding it as ISO-8859-1, and preview
# the first rows.
df = pd.read_csv(
    'StackSample/Questions.csv',
    encoding="ISO-8859-1",
)
df.head(n=5)

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...


I probably need to strip the HTML from the Body field to create a usable corpus. I'll use BeautifulSoup with the lxml parser for this.

In [11]:
# Take the first question body as a sample so the raw HTML can be inspected.
test = df['Body'].iloc[0]
print(test)

<p>I've written a database generation script in <a href="http://en.wikipedia.org/wiki/SQL">SQL</a> and want to execute it in my <a href="http://en.wikipedia.org/wiki/Adobe_Integrated_Runtime">Adobe AIR</a> application:</p>

<pre><code>Create Table tRole (
      roleID integer Primary Key
      ,roleName varchar(40)
);
Create Table tFile (
    fileID integer Primary Key
    ,fileName varchar(50)
    ,fileDescription varchar(500)
    ,thumbnailID integer
    ,fileFormatID integer
    ,categoryID integer
    ,isFavorite boolean
    ,dateAdded date
    ,globalAccessCount integer
    ,lastAccessTime date
    ,downloadComplete boolean
    ,isNew boolean
    ,isSpotlight boolean
    ,duration varchar(30)
);
Create Table tCategory (
    categoryID integer Primary Key
    ,categoryName varchar(50)
    ,parent_categoryID integer
);
...
</code></pre>

<p>I execute this in Adobe AIR using the following methods:</p>

<pre><code>public static function RunSqlFromFile(fileName:String):void {
    var f

In [15]:
# Run the HTML stripper on the sample body and show the resulting plain text.
print(strip_html(test))

I've written a database generation script in SQL and want to execute it in my Adobe AIR application:
Create Table tRole (
      roleID integer Primary Key
      ,roleName varchar(40)
);
Create Table tFile (
    fileID integer Primary Key
    ,fileName varchar(50)
    ,fileDescription varchar(500)
    ,thumbnailID integer
    ,fileFormatID integer
    ,categoryID integer
    ,isFavorite boolean
    ,dateAdded date
    ,globalAccessCount integer
    ,lastAccessTime date
    ,downloadComplete boolean
    ,isNew boolean
    ,isSpotlight boolean
    ,duration varchar(30)
);
Create Table tCategory (
    categoryID integer Primary Key
    ,categoryName varchar(50)
    ,parent_categoryID integer
);
...

I execute this in Adobe AIR using the following methods:
public static function RunSqlFromFile(fileName:String):void {
    var file:File = File.applicationDirectory.resolvePath(fileName);
    var stream:FileStream = new FileStream();
    stream.open(file, FileMode.READ)
    var strSql:String = 

Now that we have that working, let's preprocess the text (in-place to save memory). Note that I'd like to show how we can do this as a generator, both with all text in memory and all text on disk.

In [20]:
# Replace every HTML body with its plain-text version, overwriting the column
# instead of adding a new one; progress_map reports progress through tqdm.
# NOTE(review): the raw HTML is lost after this cell, so re-running it feeds
# already-stripped text back through strip_html — confirm that is acceptable.
df['Body'] = df['Body'].progress_map(strip_html)
df.head()

100%|███████████████████████████████| 1264216/1264216 [06:25<00:00, 3282.80it/s]


Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,I've written a database generation script in S...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,Are there any really good tutorials explaining...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,Has anyone got experience creating SQL-based A...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,This is something I've pseudo-solved many time...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,I have a little game written in C#. It uses a ...


We have enough data to do a Train/Val/Test split. I am going to go ahead and create that split, then persist the preprocessed dataframe to SQLite so we don't need to run this all again.

In [44]:
# Draw an independent split label for every row with probabilities 70/10/20.
# Label 0 is used as the training set later in the notebook; 1 and 2 are
# presumably validation and test (TODO confirm).
rng = np.random.RandomState(SEED)
df['Split'] = rng.choice(
    [0, 1, 2],
    p=[0.7, 0.1, 0.2],
    size=len(df),
    replace=True,
)
df.head(n=5)

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body,Split
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,I've written a database generation script in S...,0
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,Are there any really good tutorials explaining...,0
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,Has anyone got experience creating SQL-based A...,0
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,This is something I've pseudo-solved many time...,0
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,I have a little game written in C#. It uses a ...,0


In [45]:
# Compare the Score distribution across the three splits.
(
    df.groupby('Split')['Score']
      .agg(['count', 'mean', 'median', 'min', 'max'])
)

Unnamed: 0_level_0,count,mean,median,min,max
Split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,885496,1.77956,0.0,-73,5190
1,126826,1.782623,0.0,-23,1513
2,251894,1.787943,0.0,-18,2537


In [3]:
# Reload the preprocessed frame from SQLite so the HTML stripping and the
# split assignment do not have to be re-run.
# sqlite3's `with conn:` block only commits or rolls back the transaction; it
# does not close the connection, so the handle is closed explicitly here.
conn = sql.connect('StackSample.db')
try:
    with conn:
        # Persist step, currently disabled (presumably run once to create
        # the `df` table — TODO confirm):
        # df.to_sql('df', conn, index=False)
        df = pd.read_sql_query('select * from df', conn)
finally:
    conn.close()

Now let's train an NMF model. I'll also benchmark the time it takes to run this training. Remember, we are training from a dataset in memory. I am running an Intel i5-1135G7 X8 with 32GB of RAM.

In [4]:
# NMF topic model: 100 components, random initialisation, fixed seed.
nmf = NMF(
    random_state=SEED,
    init='random',
    n_components=100,
)

# TF-IDF features over tokens made only of the letters a-z. English stop
# words are removed, as are terms that occur in fewer than 2 documents or in
# more than 95% of them.
vect = TfidfVectorizer(
    stop_words='english',
    token_pattern=r'\b[a-z]+\b',
    max_df=0.95,
    min_df=2,
)

In [5]:
# Boolean mask for the rows labelled as split 0, which serve as the training set.
idx = df['Split'].eq(0)
# Body text of the training rows, as a plain array for the vectorizer.
X_train = df['Body'][idx].values

In [6]:
# Fit the TF-IDF vectorizer on the training bodies and transform them in one
# call, timing how long it takes.
# perf_counter() is a monotonic clock meant for measuring durations, whereas
# time.time() can jump if the system clock is adjusted during a long run.
start = time.perf_counter()
X_train_vect = vect.fit_transform(X_train)
end = time.perf_counter()
print(f'Time to train vectorizer: {end-start:0.2f}s')

Time to train vectorizer: 60.09s


In [57]:
# Fit the NMF model on the TF-IDF matrix and keep the per-document topic
# weights, timing the fit with a monotonic clock.
start = time.perf_counter()
X_train_nmf = nmf.fit_transform(X_train_vect)
end = time.perf_counter()
# `0.2f` gives two decimals, matching the vectorizer timing message; `02f`
# would mean "zero-pad to width 2" with the default six decimals.
print(f'Time to train NMF w/ {nmf.n_components} components: {end-start:0.2f}s')

Time to train NMF w/ 100 components: 3320.237246s


In [60]:
# Persist the fitted model. The component count in the file name is read from
# the model itself so the name cannot drift from what was actually trained
# (it is 'nmf_100.joblib' for the 100-component model).
joblib.dump(nmf, f'nmf_{nmf.n_components}.joblib')

['nmf_100.joblib']

Maybe it would be best to cycle through some different topic numbers and compare NMF to MiniBatchNMF.

In [None]:
# Benchmark batch NMF over a range of topic counts, saving each fitted model.
for n_components in [10, 20, 50, 100, 200, 500]:
    nmf = NMF(
        n_components=n_components,
        init='random',
        random_state=SEED
    )
    # Monotonic clock: these fits run long enough for a wall-clock
    # adjustment to distort a time.time() difference.
    start = time.perf_counter()
    nmf.fit(X_train_vect)
    end = time.perf_counter()
    # `0.2f` = two decimals; `02f` would print six decimals.
    print(f'Time to train NMF w/ {n_components} components: {end-start:0.2f}s')
    joblib.dump(nmf, f'nmf_{n_components}_random.joblib')



Time to train NMF w/ 10 components: 209.307184s




Time to train NMF w/ 20 components: 438.920101s
Time to train NMF w/ 50 components: 1401.294100s
Time to train NMF w/ 100 components: 3220.459530s




Time to train NMF w/ 200 components: 14288.799218s


In [8]:
# Benchmark MiniBatchNMF over the same topic counts (excluding 500), saving
# each fitted model.
# NOTE(review): the recorded timings drop sharply at 100 and 200 components;
# the cause is not visible here — verify those fits actually converged.
for n_components in [10, 20, 50, 100, 200]:
    nmf = MiniBatchNMF(
        n_components=n_components,
        init='random',
        verbose=False,
        random_state=SEED
    )
    # Monotonic clock for duration measurement.
    start = time.perf_counter()
    nmf.fit(X_train_vect)
    end = time.perf_counter()
    # `0.2f` = two decimals; `02f` would print six decimals.
    print(f'Time to train online NMF w/ {n_components} components: {end-start:0.2f}s')
    joblib.dump(nmf, f'online_nmf_{n_components}_random.joblib')

Time to train online NMF w/ 10 components: 232.140755s
Time to train online NMF w/ 20 components: 660.861073s
Time to train online NMF w/ 50 components: 2660.552445s
Time to train online NMF w/ 100 components: 10.644401s
Time to train online NMF w/ 200 components: 20.515010s


In [10]:
# Project the training TF-IDF matrix onto the topics of the current model.
# NOTE(review): `nmf` is whatever model was bound last; given the recorded
# output shape this appears to be the 200-component MiniBatchNMF left over
# from the benchmark loop — confirm, or load the saved model explicitly.
X_nmf = nmf.transform(X_train_vect)

In [11]:
# Sanity check: (number of training documents, number of topics).
X_nmf.shape

(885496, 200)

In [15]:
# Train a 200-component MiniBatchNMF with NNDSVDa initialisation and time it.
nmf = MiniBatchNMF(
    n_components=200,
    init='nndsvda',
    verbose=False,
    random_state=SEED
)
# Monotonic clock for duration measurement.
start = time.perf_counter()
nmf.fit(X_train_vect)
end = time.perf_counter()
# Report the model's own component count rather than a loop variable left
# over from another cell, and use `0.2f` (two decimals) instead of `02f`.
print(f'Time to train online NMF w/ {nmf.n_components} components: {end-start:0.2f}s')

Time to train online NMF w/ 200 components: 13021.849455s


In [16]:
# Persist the fitted model. The component count comes from the model itself,
# not from a loop variable left over from another cell, so the file name
# always matches the saved model.
joblib.dump(nmf, f'online_nmf_{nmf.n_components}_nndsvda.joblib')

['online_nmf_200_nndsvda.joblib']