In [0]:
import string
import re, nltk, spacy, gensim
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import pyLDAvis
import pyLDAvis.sklearn

import matplotlib.pyplot as plt

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None
# Translation table that deletes every character in string.punctuation when passed to str.translate().
tr = str.maketrans("", "", string.punctuation)
# English Snowball stemmer, used by preprocess_posts() below.
stemmer = SnowballStemmer("english")
# Fetch the NLTK stopword corpus that stopwords.words('english') reads from.
nltk.download('stopwords')

In [0]:
# Load the customized stopword table and keep only the terms flagged with stopword == 1.
stopword_c = pd.read_csv('../Dataset/results/stopword.csv', encoding = "ISO-8859-1")
stopword_c = stopword_c.loc[stopword_c['stopword'] == 1, 'term'].tolist()

# Final stop list: NLTK English stopwords, then the customized terms,
# then a handful of extra hand-picked words.
stoplist = [
    *stopwords.words('english'),
    *stopword_c,
    'be', 'have', 'not', 'do', 'so', 'when', 'would', 'that', 'can', 'more',
]


In [0]:
# Load the classified dataset.
df = pd.read_csv('../Dataset/results/esre_5000_sgd_clf_result.csv')

# Split it on the 'esre' label into two freshly indexed frames:
# esre == 1 -> emotional eating, esre == 0 -> non-emotional eating.
df_post_esre = df.loc[df['esre'] == 1].reset_index(drop=True)
df_post_nesre = df.loc[df['esre'] == 0].reset_index(drop=True)

In [0]:
# Preprocess the text of each post (remove punctuation marks, tokenize, lowercase, stem).
def preprocess_posts(df):
    """Clean, tokenize and stem the ``selftext`` column of ``df``.

    The frame is modified in place (and also returned). These columns are added:

    * ``unpunct_body``              - text with ``string.punctuation`` characters removed
    * ``tokenized_body``            - ``nltk.word_tokenize`` tokens
    * ``lower_and_tokenized_body``  - lower-cased tokens
    * ``stemmed_text``              - Snowball-stemmed tokens
    * ``document``                  - stemmed tokens joined with single spaces

    Missing or non-string ``selftext`` values (for example NaN produced by an
    empty CSV cell) are treated as text instead of raising ``AttributeError``
    on ``.translate``: NaN becomes the empty string, anything else is passed
    through ``str()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``selftext`` column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the columns above added.
    """
    # Tokenizer models required by nltk.word_tokenize.
    nltk.download('punkt')

    # Normalise to str first so every element supports .translate().
    raw_text = df.selftext.fillna('').astype(str)

    df['unpunct_body'] = raw_text.apply(lambda text: text.translate(tr))
    df['tokenized_body'] = df.unpunct_body.apply(lambda text: nltk.word_tokenize(text))
    df['lower_and_tokenized_body'] = df.tokenized_body.apply(
        lambda tokens: [token.lower() for token in tokens])
    df['stemmed_text'] = df.lower_and_tokenized_body.apply(
        lambda tokens: [stemmer.stem(token) for token in tokens])
    df['document'] = df.stemmed_text.map(lambda tokens: ' '.join(tokens))
    return df

# stopword removal
def remove_stopwords(wordlist, stopwords):
    """Return the words of ``wordlist`` that are not in ``stopwords``, in order.

    Parameters
    ----------
    wordlist : iterable of hashable
        Words to filter; duplicates and order are preserved.
    stopwords : iterable of hashable
        Words to drop. Note that inside this function the parameter shadows
        the ``nltk.corpus.stopwords`` module imported at the top of the file.

    Returns
    -------
    list
        A new list; ``wordlist`` is not modified.
    """
    # Build the lookup once so each membership test is O(1) instead of a
    # linear scan of the (potentially long) stopword list per word.
    stopword_set = frozenset(stopwords)
    return [word for word in wordlist if word not in stopword_set]

# Tokenize the preprocessed documents (the output of preprocess_posts()).
def sent_to_words(sentences):
    """Lazily yield one gensim token list per item of ``sentences``.

    Each item is coerced with ``str()`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``.
    """
    for raw in sentences:
        text = str(raw)
        # deacc=True asks gensim to strip accent marks from the tokens.
        tokens = gensim.utils.simple_preprocess(text, deacc=True)
        yield tokens

# Lemmatize tokenized documents (nouns, adjectives, verbs and adverbs only by default).
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize every token list in ``texts`` with the module-level ``nlp`` pipeline.

    Each token list is joined with spaces and run through ``nlp``. Tokens whose
    ``pos_`` is not in ``allowed_postags`` are dropped. A kept token whose lemma
    is ``'-PRON-'`` contributes an empty string. The surviving lemmas are
    joined with single spaces.

    Returns a list with one string per input token list.
    """
    def _lemmatize_one(tokens):
        pieces = []
        for token in nlp(" ".join(tokens)):
            if token.pos_ not in allowed_postags:
                continue
            lemma = token.lemma_
            # Placeholder lemmas are blanked, not removed, so the join may
            # leave an extra space in the result.
            pieces.append('' if lemma in ['-PRON-'] else lemma)
        return " ".join(pieces)

    return [_lemmatize_one(sent) for sent in texts]

# Load spaCy's English pipeline with the parser and NER components disabled;
# lemmatization() only reads token.pos_ and token.lemma_.
# 'en' is a spaCy 2.x shortcut link. spaCy 3 removed shortcut links and raises
# OSError for it, so fall back to the package name.
# NOTE(review): assumes 'en' was linked to en_core_web_sm - confirm the model.
try:
    nlp = spacy.load('en', disable=['parser', 'ner'])
except OSError:
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def preprocess_document(df):
    """Tokenize and lemmatize the ``document`` column of ``df``.

    The column is converted to a list, tokenized with sent_to_words() and
    lemmatized with lemmatization() (nouns, adjectives, verbs, adverbs).
    The first two results are printed as a quick sanity check.

    Returns the list of lemmatized strings, one per row of ``df``.
    """
    documents = df['document'].tolist()
    tokenized = [tokens for tokens in sent_to_words(documents)]
    lemmatized = lemmatization(tokenized, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    print(lemmatized[:2])
    return lemmatized

In [0]:
# Preprocess the emotional-eating posts: stem them with preprocess_posts(),
# then tokenize/lemmatize into one string per post with preprocess_document().
post_esre = preprocess_document(preprocess_posts(df_post_esre))
# The same steps for the non-emotional posts are currently disabled:
# topic_post_nesre = preprocess_posts(df_post_nesre)
# post_nesre = preprocess_document(topic_post_nesre)
# del topic_post_nesre

In [0]:
# Count-vectorize posts based on term frequency.
def vectorize_document(data_lemmatized):
    """Fit a CountVectorizer on ``data_lemmatized`` and return it with the matrix.

    The vectorizer uses the module-level ``stoplist`` as stop words, keeps
    terms with ``min_df=10`` and builds n-grams over ``ngram_range=(1, 3)``.

    Returns
    -------
    tuple
        ``(vectorizer, data_vectorized)`` - the fitted vectorizer and the
        result of its ``fit_transform``.
    """
    settings = {
        'stop_words': stoplist,
        'min_df': 10,
        'ngram_range': (1, 3),
    }
    vectorizer = CountVectorizer(**settings)
    return vectorizer, vectorizer.fit_transform(data_lemmatized)

In [0]:
# Vectorize the emotional-eating posts; keep both the vectorizer and the
# matrix returned by vectorize_document().
vectorizer_post_esre,data_vectorized_post_esre = vectorize_document(post_esre)
# Non-emotional counterpart, currently disabled.
# NOTE(review): lda_model_nesre() reads the global vectorizer_post_nesre, which
# is only assigned by the commented-out line below.
# vectorizer_post_nesre,data_vectorized_post_nesre = vectorize_document(post_nesre)

In [0]:
# Build the LDA model.
def buildLDA(data_vectorized, i):
    """Fit a scikit-learn LDA model with ``i`` topics on ``data_vectorized``.

    Prints the model, its log likelihood and perplexity on the training
    matrix, and its parameters.

    Returns
    -------
    tuple
        ``(lda_model, lda_output)`` where ``lda_output`` is the result of
        ``fit_transform(data_vectorized)``.
    """
    lda_params = dict(
        n_components=i,            # number of topics
        max_iter=10,               # max learning iterations
        learning_method='online',
        random_state=100,          # fixed seed
        batch_size=128,            # docs per learning iteration
        evaluate_every=-1,         # do not compute perplexity during training
        n_jobs=1,                  # a single job (-1 would use all CPUs)
    )
    lda_model = LatentDirichletAllocation(**lda_params)
    lda_output = lda_model.fit_transform(data_vectorized)

    # Report the fitted model and its scores on the training data.
    print(lda_model)
    print("Log Likelihood: ", lda_model.score(data_vectorized))
    print("Perplexity: ", lda_model.perplexity(data_vectorized))
    print(lda_model.get_params())
    return lda_model, lda_output

# Label documents (posts) with their topic.
def _document_topic_table(best_lda_model, data_vectorized, data, source_df):
    """Shared implementation behind get_document_topic() and get_document_topic_nesre().

    Builds a document-by-topic weight table (rounded to 2 decimals), adds the
    dominant topic of each document and copies ``id``, ``title`` and
    ``selftext`` over from ``source_df``. The first 15 rows are printed.
    """
    # Document-topic matrix.
    lda_output = best_lda_model.transform(data_vectorized)

    topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
    docnames = ["Doc" + str(i) for i in range(len(data))]

    df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

    # Take the dominant topic from the unrounded weights: after rounding to two
    # decimals two topics can tie and argmax would then pick the lower index
    # even when the other topic has the larger weight.
    df_document_topic['dominant_topic'] = np.argmax(lda_output, axis=1)

    # Attach the post metadata row by row (lengths must match the table).
    for column in ('id', 'title', 'selftext'):
        df_document_topic[column] = source_df[column].tolist()

    # Print the frame itself. print() on a pandas Styler only shows its object
    # repr, and the old per-cell style functions compared every cell to 0.1,
    # which cannot work on the string columns added above.
    print(df_document_topic.head(15))
    return df_document_topic


def get_document_topic(best_lda_model, data_vectorized, data, source_df=None):
    """Label every document with its topic weights and dominant topic.

    Parameters
    ----------
    best_lda_model : fitted LDA model exposing ``transform`` and ``n_components``.
    data_vectorized : matrix passed to ``best_lda_model.transform``.
    data : sized collection; only ``len(data)`` is used, to name the rows
        ``Doc0`` .. ``Doc{n-1}``.
    source_df : pandas.DataFrame, optional
        Frame supplying the ``id``, ``title`` and ``selftext`` columns.
        Defaults to the module-level ``df_post_esre`` (emotional eating).

    Returns
    -------
    pandas.DataFrame
        Columns ``Topic0..TopicK-1``, ``dominant_topic``, ``id``, ``title``,
        ``selftext``.
    """
    if source_df is None:
        source_df = df_post_esre
    return _document_topic_table(best_lda_model, data_vectorized, data, source_df)


# Label documents (posts) with their topic (for non-emotional eating).
def get_document_topic_nesre(best_lda_model, data_vectorized, data, source_df=None):
    """Same as get_document_topic(), but ``source_df`` defaults to the
    module-level ``df_post_nesre`` (non-emotional eating)."""
    if source_df is None:
        source_df = df_post_nesre
    return _document_topic_table(best_lda_model, data_vectorized, data, source_df)

# Get the distribution of documents over dominant topics.
def get_df_topic_distribution(df_document_topic):
    """Count how many documents have each dominant topic.

    Reads the ``dominant_topic`` column, prints the resulting table and
    returns it as a DataFrame with the columns ``Topic Num`` and
    ``Num Documents``, ordered as ``value_counts()`` orders them.
    """
    topic_counts = df_document_topic['dominant_topic'].value_counts()
    df_topic_distribution = topic_counts.rename_axis('Topic Num').reset_index(name='Num Documents')
    print(df_topic_distribution)
    return df_topic_distribution

# Visualize the topics with pyLDAvis.
def get_visualization(best_lda_model, data_vectorized, vectorizer):
    """Build a pyLDAvis panel for the model and save it as HTML.

    The panel is prepared with ``mds='tsne'`` and ``R=50`` and written to
    ``../Dataset/results/lda/lda_<n_topics>.html``.

    ``<n_topics>`` is taken from ``best_lda_model.n_components``. Previously
    the file name was built from a notebook-level ``number_of_topics``
    variable, so the function only worked inside a loop that defined it.

    Returns the prepared panel.
    """
    pyLDAvis.enable_notebook()
    panel = pyLDAvis.sklearn.prepare(best_lda_model, data_vectorized, vectorizer, mds='tsne', R=50)
    n_topics = best_lda_model.n_components
    pyLDAvis.save_html(panel, '../Dataset/results/lda/lda_' + str(n_topics) + '.html')
    return panel


# Visualize the topics (for non-emotional eating).
def get_visualization_nesre(best_lda_model, data_vectorized, vectorizer):
    """Non-emotional-eating entry point; behaves exactly like get_visualization().

    NOTE(review): it writes to the same ``lda_<n_topics>.html`` path as
    get_visualization(), so running both for the same topic count overwrites
    the file - confirm whether a separate file name is wanted.
    """
    return get_visualization(best_lda_model, data_vectorized, vectorizer)


# Show top n keywords for each topic
def show_topics(vectorizer, lda_model, n_words=20):
    """Return the ``n_words`` highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    vectorizer : fitted vectorizer exposing ``get_feature_names_out()``
        (scikit-learn >= 1.0) or the older ``get_feature_names()``.
    lda_model : fitted model whose ``components_`` has one row of term
        weights per topic.
    n_words : int, default 20
        Number of keywords per topic.

    Returns
    -------
    list of numpy.ndarray
        One array of keywords per topic, ordered by decreasing weight.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the newer
    # accessor and fall back for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    keywords = np.asarray(feature_names)

    topic_keywords = []
    for topic_weights in lda_model.components_:
        # Negate so argsort returns indices from largest weight to smallest.
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

def get_topic_keywords_matrix(vectorizer, best_lda_model):
    """Build a topic-by-keyword table from the top 100 keywords of each topic.

    Rows are labelled ``Topic 0``, ``Topic 1``, ... and columns ``Word 0``,
    ``Word 1``, ...; the first rows are printed and the full frame returned.
    """
    top_words = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=100)

    # Topic - keywords dataframe, labelled from its own shape.
    df_topic_keywords = pd.DataFrame(top_words)
    n_topics, n_columns = df_topic_keywords.shape
    df_topic_keywords.columns = ['Word {}'.format(col) for col in range(n_columns)]
    df_topic_keywords.index = ['Topic {}'.format(row) for row in range(n_topics)]

    print(df_topic_keywords.head())
    return df_topic_keywords

In [0]:
def _fit_and_summarize_lda(data_vectorized, number_of_topics, df, vectorizer,
                           label_documents, visualize):
    """Fit an LDA model and build its result tables.

    Runs, in order: buildLDA(), ``label_documents``, get_df_topic_distribution(),
    ``visualize`` and get_topic_keywords_matrix(), then prints the head of the
    document-topic table.

    Returns ``(document_topic, topic_distribution, topic_keyword_matrix)``.
    """
    lda_model, _lda_output = buildLDA(data_vectorized, number_of_topics)
    document_topic = label_documents(lda_model, data_vectorized, df)
    topic_distribution = get_df_topic_distribution(document_topic)
    visualize(lda_model, data_vectorized, vectorizer)
    topic_keyword_matrix = get_topic_keywords_matrix(vectorizer, lda_model)

    print(document_topic.head())
    return document_topic, topic_distribution, topic_keyword_matrix


def lda_model_nesre(data_vectorized, number_of_topics, df, vectorizer=None):
    """Run the LDA pipeline for the non-emotional-eating posts and save the results.

    Parameters
    ----------
    data_vectorized : document-term matrix to fit on.
    number_of_topics : int
        Number of LDA topics; also used in the output file names.
    df : passed through to get_document_topic_nesre() as its ``data`` argument.
    vectorizer : optional
        Vectorizer that produced ``data_vectorized``. Defaults to the
        module-level ``vectorizer_post_nesre``; pass it explicitly when that
        global has not been created.

    Writes three CSV files under ``../Dataset/results/lda/`` and returns None.
    """
    if vectorizer is None:
        vectorizer = vectorizer_post_nesre

    document_topic, topic_distribution, topic_keyword_matrix = _fit_and_summarize_lda(
        data_vectorized, number_of_topics, df, vectorizer,
        get_document_topic_nesre, get_visualization_nesre)

    document_topic.to_csv('../Dataset/results/lda/df_document_topic_' + str(number_of_topics) + '.csv')
    topic_distribution.to_csv('../Dataset/results/lda/df_topic_distribution' + str(number_of_topics) + '.csv')
    topic_keyword_matrix.to_csv('../Dataset/results/lda/topic_keyword_matrix' + str(number_of_topics) + '.csv')


def lda_model_esre(data_vectorized, number_of_topics, df, vectorizer=None):
    """Run the LDA pipeline for the emotional-eating posts and save the results.

    Counterpart of lda_model_nesre(). The notebook's driver loop calls this
    name but no definition of it is visible in the notebook.

    NOTE(review): the output location is inferred from the later cell that
    reads ``../Dataset/results/lda/post_esre/df_document_topic_post_esre_4.csv``;
    the other two file names follow the same pattern by assumption - confirm.

    ``vectorizer`` defaults to the module-level ``vectorizer_post_esre``.
    Returns None.
    """
    if vectorizer is None:
        vectorizer = vectorizer_post_esre

    document_topic, topic_distribution, topic_keyword_matrix = _fit_and_summarize_lda(
        data_vectorized, number_of_topics, df, vectorizer,
        get_document_topic, get_visualization)

    out_dir = '../Dataset/results/lda/post_esre/'
    document_topic.to_csv(out_dir + 'df_document_topic_post_esre_' + str(number_of_topics) + '.csv')
    topic_distribution.to_csv(out_dir + 'df_topic_distribution_post_esre_' + str(number_of_topics) + '.csv')
    topic_keyword_matrix.to_csv(out_dir + 'topic_keyword_matrix_post_esre_' + str(number_of_topics) + '.csv')

In [0]:
# Build the LDA model and get the topic modeling results for the
# emotional-eating posts. range(4, 6) runs it twice: 4 topics, then 5 topics.
for number_of_topics  in range(4, 6):
    lda_model_esre(data_vectorized_post_esre, number_of_topics, post_esre)

In [0]:
# Read a saved document-topic CSV (file name suggests the 4-topic emotional-eating run).
# Note: this rebinds `df`, which earlier held the classified dataset.
df = pd.read_csv('../Dataset/results/lda/post_esre/df_document_topic_post_esre_4.csv')

In [0]:
# Save the first 200 rows as a smaller sample file. to_csv() writes the index
# by default, so the file gets an extra unnamed first column.
df[:200].to_csv('../Dataset/results/lda/post_esre/df_document_topic_post_esre_4_200.csv')

In [0]:
# Re-read the 4-topic table, keep its first 200 rows and drop the
# 'Unnamed: 0' column (presumably the index saved by an earlier to_csv).
df = pd.read_csv('../Dataset/results/lda/post_esre/df_document_topic_post_esre_4.csv')[0:200]
df = df.drop(columns=['Unnamed: 0'])

In [0]:
# Write the 200-row sample to the same *_4_200.csv path, this time without the index column.
df.to_csv('../Dataset/results/lda/post_esre/df_document_topic_post_esre_4_200.csv', index = False)