In [1]:
# load data

import pandas as pd

posts = pd.read_csv('../dat/all_comments.csv')

In [2]:
posts.columns


Index(['Unnamed: 0', 'body', 'author', 'score', 'created_utc', 'id', 'post'], dtype='object')

In [3]:
# clean
# remove extra cols

posts = posts.drop(columns=['Unnamed: 0', 'author', 'score', 'created_utc', 
'id', 'post'])


In [4]:
# remove punct/lower case

import re
posts['post_clean'] = posts['body'].map(lambda x: re.sub('[,\.!?]', '', x))
posts['post_clean'] = posts['post_clean'].map(lambda x: x.lower())
posts['post_clean'].head()

0    i already smoke weed and do psychedelics but i...
1    i already tried everything (i was not an addic...
2    after the most recent episode i don’t even wan...
3    in recovery 🤗 sometimes this show makes me mis...
4    i‘m already a sad person i don‘t need drugs to...
Name: post_clean, dtype: object

In [5]:
# tokenize and clean more
import gensim
from gensim.utils import simple_preprocess

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

data = posts.post_clean.values.tolist()
data_words = list(sent_to_words(data))

print(data_words[:1][0])

['already', 'smoke', 'weed', 'and', 'do', 'psychedelics', 'but', 'never', 'ever', 'fuck', 'with', 'opiates']


In [6]:
# phrase modeling w/bigram and trigrams
from torch import threshold

bigram = gensim.models.Phrases(data_words, min_count = 5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold = 100)

In [7]:
# make models
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [8]:
# remove stopwords, bigrams and lemma
import nltk
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

In [9]:
def remove_stopwords(text):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words]
    for doc in text]

def make_bigrams(text):
    return [bigram_mod[doc] for doc in text]

def make_trigrams(text):
    return [trigram_mod[doc] for doc in text]

def lemmatization(text, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in text:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [10]:
import spacy
# pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz
spacy.load('en_core_web_sm')

<spacy.lang.en.English at 0x7fd24a7e76a0>

In [11]:

# remove stopwords
data_words_nostop = remove_stopwords(data_words)

# bigrams
data_words_bigrams = make_bigrams(data_words_nostop)

# trigrams
# data_words_trigrams = make_trigrams(data_words_nostop)

# initialize spacy 'en' model
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# lemmatize but only keep noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams,
                                allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1][0])

['already', 'smoke_weed', 'psychedelic', 'never', 'ever', 'fuck', 'opiate']


In [12]:
# transform, corpus and dictionary
import gensim.corpora as corpora

# dictionary
id2word = corpora.Dictionary(data_lemmatized)

# corpus
texts = data_lemmatized

# tdf
corpus = [id2word.doc2bow(text) for text in texts]

print(corpus[:1][0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]


In [13]:
# base model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=4, 
                                       random_state=2022,
                                       chunksize=100,
                                       passes=100,
                                       per_word_topics=True)


In [14]:
from pprint import pprint

pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.030*"drug" + 0.016*"make" + 0.014*"get" + 0.013*"show" + 0.012*"try" + '
  '0.012*"use" + 0.011*"never" + 0.011*"know" + 0.010*"want" + 0.010*"rue"'),
 (1,
  '0.012*"make" + 0.010*"rue" + 0.009*"dopamine" + 0.008*"laurie" + '
  '0.008*"see" + 0.008*"addict" + 0.007*"get" + 0.007*"want" + 0.007*"look" + '
  '0.007*"really"'),
 (2,
  '0.020*"withdrawal" + 0.016*"go" + 0.013*"get" + 0.013*"see" + 0.012*"feel" '
  '+ 0.010*"addict" + 0.009*"opiate" + 0.009*"trigger" + 0.009*"yawn" + '
  '0.008*"people"'),
 (3,
  '0.015*"get" + 0.012*"withdrawal" + 0.011*"thing" + 0.009*"time" + '
  '0.009*"bad" + 0.009*"much" + 0.008*"really" + 0.008*"feel" + 0.008*"way" + '
  '0.007*"think"')]


In [15]:
# model perplexity and coherence
from gensim.models import CoherenceModel

coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized,
                               dictionary=id2word, coherence='u_mass')

coherence_lda = coherence_model_lda.get_coherence()
print('coherence score: ', coherence_lda)


coherence score:  -2.466541133647462


In [None]:
# calculating coreherence with c_v took WAY too long

In [16]:
# sensitivity tests

# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='u_mass')
    
    return coherence_model_lda.get_coherence()

In [17]:
# iterate above function over range of topics

import numpy as np
import tqdm

grid = {}
grid['validation_set'] = {}

# topic rnge
min_topics = 2
max_topics= 10
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# alpha
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# beta
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# validation set
num_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_docs*.75)), corpus]

corpus_title = ['75% Corpus', '100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=540)
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
    pbar.close()

 89%|████████▉ | 480/540 [1:43:39<12:57, 12.96s/it]
