# 1 Test a Python/NumPy Implementation of LDA

### 1.1 Load samples

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Fetch the 20 Newsgroups posts with headers, footers and quoted replies
# stripped. Only the texts are kept; the labels returned alongside them are
# discarded into `_`. The shuffle is seeded so the document order is repeatable.
samples, _ = fetch_20newsgroups(
    return_X_y=True,
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)

### 1.2 Tokenize samples

In [None]:
import re

# A token is a run of lowercase ASCII letters delimited by word boundaries;
# digits, punctuation and non-ASCII characters never become tokens.
pattern = re.compile(r'\b[a-z]+\b')

# Replace each raw post with its list of tokens.
# NOTE: this rebinds `samples` from strings to token lists, so running this
# cell a second time fails (a list has no `.lower()`); re-run 1.1 first.
samples = [pattern.findall(text.lower()) for text in samples]

### 1.3 Filter stopwords

In [None]:
# Read the stopword file, one entry per line. Despite its name,
# `stopword_list` is a set, so the membership test below is a hash lookup.
with open("stopwords.txt", "r") as f:
    stopword_list = set(f.read().splitlines())

# Remove stopwords from every document; the remaining tokens keep their order.
samples = [
    [token for token in doc if token not in stopword_list]
    for doc in samples
]

### 1.4 Use only a subset

In [None]:
# Number of documents (taken from the front of the shuffled corpus) to fit on.
N_SAMPLES = 2_000
subset = samples[0:N_SAMPLES]

### 1.5 Import and run Python LDA

In [None]:
import cProfile
import pstats
from pstats import SortKey
from py_lda import PythonLDA

# NOTE(review): presumably T is the number of topics and S the number of
# sampling iterations, with alpha/beta the Dirichlet priors - confirm in py_lda.
py_lda = PythonLDA(
    corpus=subset,
    T=20,
    S=100,
    beta=0.01,
    alpha=0.1,
)

# Profile the fit. The statement string is evaluated against the notebook
# namespace, which is why globals()/locals() are passed through. The dump
# written to py_stats.txt is cProfile's binary stats format, not readable
# text, despite the .txt suffix.
cProfile.runctx(
    'py_lda.fit()',
    globals(),
    locals(),
    filename="py_stats.txt",
)
py_lda.print_topics()

# Report the 10 entries with the largest internal (own) time.
(
    pstats.Stats('py_stats.txt')
    .strip_dirs()
    .sort_stats(SortKey.TIME)
    .print_stats(10)
)

# 2 Test a Cython Implementation of LDA with Collapsed Gibbs Sampling (CGS)

### 2.1 Prepare notebook for Cython

In [3]:
!python3 cy_setup.py build_ext --inplace --force

Compiling cy_lda.pyx because it changed.
[1/1] Cythonizing cy_lda.pyx
running build_ext
building 'cy_lda' extension
creating build
creating build/temp.linux-x86_64-3.8
x86_64-linux-gnu-gcc -pthread -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -g -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -fPIC -I/home/joshua/repos/cython-lda/venv/include -I/usr/include/python3.8 -c cy_lda.c -o build/temp.linux-x86_64-3.8/cy_lda.o
x86_64-linux-gnu-gcc -pthread -shared -Wl,-O1 -Wl,-Bsymbolic-functions -Wl,-Bsymbolic-functions -Wl,-z,relro -g -fwrapv -O2 -Wl,-Bsymbolic-functions -Wl,-z,relro -g -fwrapv -O2 -g -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 build/temp.linux-x86_64-3.8/cy_lda.o -o /home/joshua/repos/cython-lda/cy_lda.cpython-38-x86_64-linux-gnu.so


### 2.2 Repeat steps 1.1-1.4, load Cython extension, and test LDA

In [4]:
%%cython

# 1.1 Load samples
from sklearn.datasets import fetch_20newsgroups

samples, _ = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=1,
    return_X_y=True
)

# 1.2 Tokenize samples
import re

pattern = re.compile(r'\b[a-z]+\b')
samples = [pattern.findall(s.lower()) for s in samples]

# 1.3 Filter stopwords
with open("stopwords.txt", "r") as f:
    stopword_list = set(f.read().splitlines())
samples = [[w for w in s if w not in stopword_list] for s in samples]

# 1.4 Use only a subset
N_SAMPLES = 2000
subset = samples[:N_SAMPLES]

# Test LDA
from cy_lda import CythonLDA
import cProfile
import pstats
from pstats import SortKey

cy_lda = CythonLDA(corpus=subset, T=20, S=100, beta=0.01, alpha=0.1)
cProfile.runctx('cy_lda.fit()', globals(), locals(), filename="cy_stats.txt")
pstats.Stats('cy_stats.txt').strip_dirs().sort_stats(SortKey.TIME).print_stats(10)


Iteration 0: -1435782.903896
Iteration 10: -1376416.068089
Iteration 20: -1353688.998181
Iteration 30: -1339322.551705
Iteration 40: -1329561.610376
Iteration 50: -1323316.220194
Iteration 60: -1318461.470116
Iteration 70: -1314677.727184
Iteration 80: -1311703.591040
Iteration 90: -1309066.440714
Iteration 100: -1306312.818619

Fri Jul 17 17:28:26 2020    cy_stats.txt

         119 function calls in 3.542 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      101    3.471    0.034    3.471    0.034 cy_lda.pyx:103(_sample_topics)
       11    0.070    0.006    0.070    0.006 cy_lda.pyx:75(_log_prob)
        1    0.001    0.001    3.542    3.542 cy_lda.pyx:152(inference_loop)
        1    0.000    0.000    3.542    3.542 <string>:1(<module>)
        1    0.000    0.000    3.542    3.542 {built-in method builtins.exec}
        1    0.000    0.000    3.542    3.542 {method 'fit' of 'cy_lda.CythonLDA' objects}
        1    0.00