--Download library--

In [None]:
!pip install corextopic

--import dependencies--

In [1]:
import numpy as np
import pandas as pd

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel


import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import en_core_web_sm
# import en_core_web_lg

from pprint import pprint

## Setup NLP Pipeline and Data Cleaning 

In [2]:
# Load the small English spaCy model; later cells use it as `nlp`.
nlp = en_core_web_sm.load()

# Extra stop words (the two subreddit names, in both capitalisations),
# added "to equalize data" between the two sources.
stop_list = ["Depression", "depression", "anxiety", "Anxiety"]

# Extend spaCy's default stop-word set with the extra words.
nlp.Defaults.stop_words.update(stop_list)

# Re-flag every stop word on the vocabulary so that `token.is_stop` is set.
# NOTE(review): this iterates the imported STOP_WORDS, not
# nlp.Defaults.stop_words; the custom words are only covered if both names
# refer to the same set object - confirm.
for stop_word in STOP_WORDS:
    nlp.vocab[stop_word].is_stop = True


In [3]:
#clean data

def lemmatizer(doc):
    """Pipeline component that rebuilds `doc` from the lemmas of its tokens.

    Tokens whose lemma is the pronoun placeholder '-PRON-' are dropped.
    The remaining lemmas are joined with single spaces and re-tokenised
    with `nlp.make_doc`, whose result is returned.
    """
    lemmas = (token.lemma_ for token in doc)
    kept = [lemma for lemma in lemmas if lemma != '-PRON-']
    return nlp.make_doc(u' '.join(kept))
    
def remove_stopwords(doc):
    """Return the text of every token that is neither a stop word nor punctuation.

    The result is a plain list of strings (not a spaCy Doc), which is the
    form the Gensim steps consume.
    """
    kept_texts = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        kept_texts.append(token.text)
    return kept_texts

# Register the two cleaning functions as extra components of the `nlp` pipeline.
# `lemmatizer` is inserted right after the 'ner' component; `remove_stopwords`
# is placed last because it returns a list of strings rather than a Doc, so
# nothing can run after it.
nlp.add_pipe(lemmatizer,name='lemmatizer',after='ner')
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

## Import depression data

In [4]:
# Load the r/Depression posts and drop rows whose body is the '[removed]'
# placeholder.
doc_dep = pd.read_csv(r'Datasets/2020_March_r_Depression.csv')
doc_dep = doc_dep[doc_dep.Body != '[removed]']
# Down-sample to 7154 posts.  The draw is seeded, like the train/test split
# and the classifier further down, so re-running the notebook selects the
# same posts and the reported accuracy is reproducible.
# NOTE(review): 7154 presumably matches the number of r/Anxiety posts so that
# both classes are the same size - confirm.
doc_dep = doc_dep.sample(n=7154, random_state=42)

In [12]:
# Collect the post bodies into a plain Python list (one entry per post).
text_doc = doc_dep.Body.tolist()
type(text_doc)

list

In [6]:
# Run every post through the customised pipeline.  Each post is cast to str
# first; each result is whatever the pipeline's last component returns.
doc_list = [nlp(str(post)) for post in text_doc]

In [7]:
# Each entry of doc_list is a list of individual words; join every post back
# into a single string.
from nltk.tokenize.treebank import TreebankWordDetokenizer

detokenizer = TreebankWordDetokenizer()
empt = [detokenizer.detokenize(tokens) for tokens in doc_list]

# df_doc_dep holds the cleaned r/Depression posts: '[removed]' posts are gone
# and stop words have been stripped from every remaining post.
# Both the token lists (doc_list) and this string form are used later on.
# The label 0 marks the depression class.
df_doc_dep = pd.DataFrame(empt, columns=['Body']).assign(label=0)

In [8]:
df_doc_dep

Unnamed: 0,Body,label
0,Indica edible Wednesday feel amazing usual tim...,0
1,yes thing undoubtedly scary find way watch bre...,0
2,want mother tell minute ago want know thing wa...,0
3,night struggle sleep wish wake tired feel like...,0
4,morning start terrible leak unknown origin car...,0
...,...,...
7149,know tired live \n\n feel merely sad idea lin...,0
7150,hate \n\n long buckle usual story normal chil...,0
7151,use hear people life like story create honestl...,0
7152,feel useless want feel useless pointless ughhh...,0


--read in data--

## Anxiety dataset

In [None]:

# Load the r/Anxiety posts and drop rows whose body is the '[removed]'
# placeholder.
# NOTE(review): this file is read from the working directory while the
# depression file is read from 'Datasets/' - confirm the path.
doc_anx = pd.read_csv(r'2020_March_r_Anxiety.csv')
is_removed = doc_anx['Body'] == '[removed]'
doc_anx = doc_anx[~is_removed]

# One raw post body per list entry.
text_doc_anx = doc_anx.Body.to_list()

In [None]:
# Run every anxiety post (cast to str) through the customised pipeline and
# collect the results in order.
doc_list_anx = [nlp(str(post)) for post in text_doc_anx]

## Rebuild document strings from the token lists

In [None]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

# Join each post's word list back into a single string.
detokenizer = TreebankWordDetokenizer()
empt = [detokenizer.detokenize(tokens) for tokens in doc_list_anx]

# df_doc_anx holds the cleaned r/Anxiety posts: '[removed]' posts are gone
# and stop words have been stripped from every remaining post.
# The label 1 marks the anxiety class.
df_doc_anx = pd.DataFrame(empt, columns=['Body']).assign(label=1)
df_doc_anx

## Merge Depression and Anxiety subreddits 

In [None]:
# Stack the depression rows on top of the anxiety rows and renumber the index
# from 0 so that it is unique across both sources.
df_col_merged = pd.concat([df_doc_dep, df_doc_anx], ignore_index=True)
df_col_merged

## Vectorize the combined dataframe (binary term-presence features built with `TfidfVectorizer`)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# With binary=True, use_idf=False and norm=None the resulting matrix holds
# plain 0/1 term-presence indicators rather than weighted tf-idf scores.
# max_df=.2 drops terms found in more than 20% of the posts; min_df=8 drops
# terms found in fewer than 8 posts.  Only unigrams are kept.
vectorizer = TfidfVectorizer(
    max_df=.2,
    min_df=8,
    max_features=None,
    ngram_range=(1, 1),
    norm=None,
    binary=True,
    use_idf=False,
    sublinear_tf=False
)

vectorizer = vectorizer.fit(df_col_merged.Body)
tfidf = vectorizer.transform(df_col_merged.Body)

# get_feature_names() was removed in scikit-learn 1.2.  Use its replacement
# when the installed version provides it and fall back otherwise, so the cell
# runs on both old and new releases.  Either way `vocab` is a list of str.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
print(len(vocab))

## Machine Learning

In [None]:
# Features are the vectorized posts; targets are the per-post labels
# (0 for depression rows, 1 for anxiety rows).
X = tfidf
y = df_col_merged.label.tolist()

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out 40% of the posts for testing; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

# Fit a logistic-regression classifier on the training portion.
clf = LogisticRegression(random_state=0)
clf = clf.fit(X_train, y_train)


## `clf.score` reports the accuracy of the model on the held-out test set

In [None]:
clf.score(X_test,y_test)

## Topic Modelling

In [None]:
# Re-read the full r/Depression file for topic modelling and drop rows whose
# body is the '[removed]' placeholder (no down-sampling this time).
doc = pd.read_csv(r'Datasets/2020_March_r_Depression.csv')
is_removed = doc['Body'] == '[removed]'
doc = doc[~is_removed]

In [None]:
# Collect the post bodies into a plain Python list (one entry per post).
text_doc = doc.Body.to_list()

In [None]:
# Start from a fresh pipeline for topic modelling.  The model is loaded
# through the imported `en_core_web_sm` package, as in the classification
# section, instead of the "en" shortcut name, which only resolves when a
# shortcut link has been created and is not available in spaCy 3.
nlp = en_core_web_sm.load()

# Extra stop words: the subreddit names plus pandemic- and
# agoraphobia-related terms, in both capitalisations.
stop_list = [
    "Depression", "depression",
    "anxiety", "Anxiety",
    "coronavirus", "Coronavirus",
    "quarantine", "Quarantine",
    "lockdown", "Lockdown",
    "Agoraphobia", "agoraphobia",
    "Agoraphobic", "agoraphobic",
]

# Extend spaCy's default stop-word set with the extra words.
nlp.Defaults.stop_words.update(stop_list)

# Re-flag every stop word on the vocabulary so that `token.is_stop` is set.
for word in STOP_WORDS:
    lexeme = nlp.vocab[word]
    lexeme.is_stop = True

In [None]:
def lemmatizer(doc):
    """Pipeline component that rebuilds `doc` from the lemmas of its tokens.

    Tokens whose lemma is the pronoun placeholder '-PRON-' are dropped.
    The remaining lemmas are joined with single spaces and re-tokenised
    with `nlp.make_doc`, whose result is returned.
    """
    lemmas = (token.lemma_ for token in doc)
    kept = [lemma for lemma in lemmas if lemma != '-PRON-']
    return nlp.make_doc(u' '.join(kept))
    
def remove_stopwords(doc):
    """Return the text of every token that is neither a stop word nor punctuation.

    The result is a plain list of strings (not a spaCy Doc), which is the
    form the Gensim steps consume.
    """
    kept_texts = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        kept_texts.append(token.text)
    return kept_texts

# Register the two cleaning functions as extra components of the `nlp` pipeline.
# `lemmatizer` is inserted right after the 'ner' component; `remove_stopwords`
# is placed last because it returns a list of strings rather than a Doc, so
# nothing can run after it.
nlp.add_pipe(lemmatizer,name='lemmatizer',after='ner')
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)



In [None]:
# Run every post (cast to str) through the customised pipeline and collect
# the results in order.
doc_list = [nlp(str(post)) for post in text_doc]

In [None]:
# Each entry of doc_list is a list of individual words; join every post back
# into a single string.
from nltk.tokenize.treebank import TreebankWordDetokenizer

detokenizer = TreebankWordDetokenizer()
empt = [detokenizer.detokenize(tokens) for tokens in doc_list]

# df_doc holds the cleaned posts: '[removed]' posts are gone and stop words
# have been stripped from every remaining post.
# Both the token lists (doc_list) and this string form are used later on.
df_doc = pd.DataFrame(empt, columns=['Body'])

df_doc

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# With binary=True, use_idf=False and norm=None the resulting matrix holds
# plain 0/1 term-presence indicators rather than weighted tf-idf scores.
# max_df=.5 drops terms found in more than half of the posts; min_df=10 drops
# terms found in fewer than 10 posts.  Unigrams and bigrams are kept.
vectorizer = TfidfVectorizer(
    max_df=.5,
    min_df=10,
    max_features=None,
    ngram_range=(1, 2),
    norm=None,
    binary=True,
    use_idf=False,
    sublinear_tf=False
)
vectorizer = vectorizer.fit(df_doc.Body)
tfidf = vectorizer.transform(df_doc.Body)

# get_feature_names() was removed in scikit-learn 1.2.  Use its replacement
# when the installed version provides it and fall back otherwise, so the cell
# runs on both old and new releases.  Either way `vocab` is a list of str.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
print(len(vocab))

In [None]:
# Import the CorEx topic model.
from corextopic import corextopic as ct

# No anchor words for this first, unguided run.
anchors = []

# Fit a 6-topic CorEx model on the document-term matrix; `vocab` supplies the
# word for each column.
model = ct.Corex(n_hidden=6, seed=42).fit(tfidf, words=vocab)

In [None]:
# Print up to 15 n-grams per topic, numbered from 1.  Entries whose second
# element (presumably the n-gram's score - confirm) is not positive are
# left out.
for topic_no, topic_entries in enumerate(model.get_topics(n_words=15), start=1):
    kept = [entry[0] for entry in topic_entries if entry[1] > 0]
    print("Topic #{}: {}".format(topic_no, ", ".join(kept)))

In [None]:
# Anchor words that nudge individual topics towards specific themes.  The
# empty-string entries are placeholders for topics without an anchor.
requested_anchors = [
    ["family"],
    ["die"],
    [""],
    [""],
    [""],
    [""],
]

# Keep only anchor words that are in the vectorizer vocabulary; a topic whose
# words are all missing ends up with an empty anchor list.
anchors = []
for topic in requested_anchors:
    anchors.append([word for word in topic if word in vocab])

# NOTE(review): 8 hidden topics here versus 6 in the unanchored model and
# 6 anchor slots above - confirm that 8 is intended.
model = ct.Corex(n_hidden=8, seed=42).fit(
    tfidf,
    words=vocab,
    anchors=anchors,      # anchor words per topic
    anchor_strength=3,    # how strongly the model should rely on the anchors
)

In [None]:
# Print up to 10 n-grams per topic, numbered from 1.  Entries whose second
# element (presumably the n-gram's score - confirm) is not positive are
# left out.
for topic_no, topic_entries in enumerate(model.get_topics(n_words=10), start=1):
    kept = [entry[0] for entry in topic_entries if entry[1] > 0]
    print("Topic #{}: {}".format(topic_no, ", ".join(kept)))

In [None]:
# Per-document topic assignments from the current CorEx model, as floats.
# The number of column names is taken from the transformed matrix instead of
# being hard-coded, so it always matches the model's number of topics (a
# fixed count of 6 fails when the model was fitted with a different
# n_hidden).
topic_matrix = model.transform(tfidf)
topic_df = pd.DataFrame(
    topic_matrix,
    columns=["topic_{}".format(i+1) for i in range(topic_matrix.shape[1])]
).astype(float)
topic_df.iloc[0:10]

In [None]:
# Per-document topic assignments from the current CorEx model, as floats.
# The number of column names is taken from the transformed matrix instead of
# being hard-coded, so it always matches the model's number of topics (a
# fixed count of 6 fails when the model was fitted with a different
# n_hidden).
topic_matrix = model.transform(tfidf)
topic_df = pd.DataFrame(
    topic_matrix,
    columns=["topic_{}".format(i+1) for i in range(topic_matrix.shape[1])]
).astype(float)
topic_df

## LDA Unsupervised

In [None]:
# Build the gensim Dictionary, a mapping between words and their integer IDs,
# from the tokenised posts.
words = corpora.Dictionary(doc_list)

# Convert every tokenised post into its bag-of-words representation.
corpus = list(map(words.doc2bow, doc_list))

In [None]:
# Settings for the unsupervised LDA run: 6 topics, seeded, 10 passes over
# the corpus.
lda_settings = dict(
    corpus=corpus,
    id2word=words,
    num_topics=6,
    random_state=2,
    update_every=1,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)
lda_model = gensim.models.ldamodel.LdaModel(**lda_settings)

In [None]:
# Pretty-print the LDA topics (the model above was built with num_topics=6),
# listing up to 100 words for each topic.
pprint(lda_model.print_topics(num_words=100))

In [None]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=doc_list):
    """Build a table holding the dominant LDA topic of every document.

    Rows are collected in a list and turned into a DataFrame once, instead
    of calling `DataFrame.append` per document (that method was removed in
    pandas 2.0 and copies the whole frame on every call).  An empty corpus
    now yields an empty, correctly labelled frame instead of failing when
    the column names are assigned.

    Args:
        ldamodel: Trained topic model.  It is indexed with `corpus`, and its
            `per_word_topics` attribute and `show_topic` method are used.
        corpus: Bag-of-words corpus, one entry per document.  Defaults to
            the module-level `corpus` as bound when this function was
            defined.
        texts: Per-document contents, appended as the last column.  Defaults
            to the module-level `doc_list` as bound when this function was
            defined.

    Returns:
        DataFrame with the columns 'Dominant_Topic' (int topic id),
        'Perc_Contribution' (topic weight rounded to 4 decimals) and
        'Topic_Keywords' (comma-separated words from `show_topic`), followed
        by one unnamed column holding `texts`.
    """
    columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    keywords_by_topic = {}  # topic id -> keyword string, computed once per topic
    records = []

    for row_list in ldamodel[corpus]:
        # With per_word_topics enabled each result is a tuple whose first
        # element is the list of (topic, weight) pairs.
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            # No topic pairs for this document: no row is emitted for it.
            continue

        # Dominant topic = pair with the highest weight (first one on ties).
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_id = int(topic_num)
        if topic_id not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num)
            keywords_by_topic[topic_id] = ", ".join(word for word, _ in wp)
        records.append((topic_id, round(prop_topic, 4), keywords_by_topic[topic_id]))

    sent_topics_df = pd.DataFrame(records, columns=columns)

    # Add the original text as the last column of the output.
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)


# Dominant topic, its weight and its keywords for every post.
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=lda_model,
    corpus=corpus,
    texts=doc_list,
)

# Turn the row index into a document-number column and give all five columns
# readable names.
dominant_topic_columns = [
    'Document_No',
    'Dominant_Topic',
    'Topic_Perc_Contrib',
    'Keywords',
    'Text',
]
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = dominant_topic_columns
df_dominant_topic.head(100)

In [None]:
df_dominant_topic.head(25)
#df_dominant_topic.to_csv(r'depression_2020_datest.csv')

In [None]:
from collections import Counter
Counter(df_dominant_topic.Dominant_Topic)

In [None]:
# Drop rows whose body is the '[removed]' placeholder, then drop every row
# that still has a missing value.
# NOTE(review): `newest_doc` is not assigned in any cell visible in this
# notebook; unless it is defined elsewhere this cell raises NameError.
# Presumably the cell that loaded it was deleted - confirm.
newest_doc = newest_doc[newest_doc.Body != '[removed]'].dropna()

In [None]:
newest_doc.Body

In [None]:
# Per-document topic assignments from the current CorEx model, as floats.
# The number of column names is taken from the transformed matrix instead of
# being hard-coded, so it always matches the model's number of topics (a
# fixed count of 6 fails when the model was fitted with a different
# n_hidden).
topic_matrix = model.transform(tfidf)
topic_df = pd.DataFrame(
    topic_matrix,
    columns=["topic_{}".format(i+1) for i in range(topic_matrix.shape[1])]
).astype(float)

# Append the topic columns next to the existing columns of `df`.
# NOTE(review): `df` is not assigned in any cell visible in this notebook -
# confirm which frame is meant here (df_doc?).
df = pd.concat([df, topic_df], axis=1)

In [None]:
topic_df