In [26]:
import numpy as np
import pandas as pd

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_lg

from tqdm import tqdm_notebook as tqdm
from pprint import pprint

In [67]:
doc = pd.read_csv (r'test_agor.csv')
newest_doc = doc[['Body']] 
newest_doc.head()
newest_doc.Body.tolist()[4]

"Hey guys writing after a long time...... I would like to share my current life and agoraphobia condition..... My anxiety and panic attacks have faded away due to medication and staying restricted in my safe place........Even not having a panic attack since 4-5 months I am unable to go out anywhere I have made up mind that if I go out I will suffer though I am sure that everything will be perfectly but still I am unable to take a step I have got myself too comfortable in my safe. Place where every thing is according to me and under controlled conditions\n\nI don't think that the way I am living can work for longer some day or other I have to take steps but I am afraid and started to have suicidal thoughts\n\n( Sorry for my English )\n\nI woul like to add that I have dropped out of college the worst decision in my life I know I am going to regret it afterwards like 2-5 years from now .."

In [49]:
nlp= spacy.load("en")

# My list of stop words.
stop_list = ["Agoraphobia","Agoraphobic", "agoraphobia", "agoraphobic"]

# Updates spaCy's default stop words list with my additional words. 
nlp.Defaults.stop_words.update(stop_list)

# Iterates over the words in the stop words list and resets the "is_stop" flag.
for word in STOP_WORDS:
    lexeme = nlp.vocab[word]
    lexeme.is_stop = True

In [50]:
def lemmatizer(doc):
    # This takes in a doc of tokens from the NER and lemmatizes them. 
    # Pronouns (like "I" and "you" get lemmatized to '-PRON-', so I'm removing those.
    doc = [token.lemma_ for token in doc if token.lemma_ != '-PRON-']
    doc = u' '.join(doc)
    return nlp.make_doc(doc)
    
def remove_stopwords(doc):
    # This will remove stopwords and punctuation.
    # Use token.text to return strings, which we'll need for Gensim.
    doc = [token.text for token in doc if token.is_stop != True and token.is_punct != True]
    return doc

# The add_pipe function appends our functions to the default pipeline.
nlp.add_pipe(lemmatizer,name='lemmatizer',after='ner')
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

In [69]:
doc_list = []
# Iterates through each article in the corpus.
for doc in newest_doc.Body.tolist():
    # Passes that article through the pipeline and adds to a new list.
    pr = nlp(str(doc))
    doc_list.append(pr)

In [70]:
doc_list

[['nan'],
 ['slowly',
  'fall',
  'start',
  'university',
  'feel',
  'like',
  'belong',
  'like',
  'trick',
  'way',
  'catch',
  'day',
  'feel',
  'like',
  'stare',
  'know',
  'true',
  'special',
  'past',
  'year',
  'struggle',
  'baseline',
  'depression',
  'strong',
  'forgot',
  'love',
  'cloud',
  'finally',
  'sunlight',
  'week',
  'plunge',
  'depressive',
  'episode',
  'strong',
  'pretty',
  'lie',
  'bed',
  'day',
  'day',
  'week',
  'Christmas',
  'break',
  'leave',
  'house',
  'stare',
  'like',
  'magnet',
  'hold',
  'bed',
  'pull',
  'away',
  'magnet',
  'frightened',
  'body',
  'heart',
  'suffer',
  'like',
  'sit',
  'living',
  'room',
  'anymore',
  'open',
  'live',
  'small',
  'apartment',
  'bed',
  'safe',
  '\n\n ',
  'night',
  'new',
  'year',
  'eve',
  'partner',
  'party',
  'follow',
  'far',
  'away',
  'denial',
  'possibly',
  'condition',
  'obviously',
  'whiner',
  'resolve',
  'parent',
  '\n\n ',
  'try',
  'leave',
  'stand'

In [71]:
# Creates, which is a mapping of word IDs to words.
words = corpora.Dictionary(doc_list)

# Turns each document into a bag of words.
corpus = [words.doc2bow(doc) for doc in doc_list]

In [72]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=words,
                                           num_topics=10, 
                                           random_state=2,
                                           update_every=1,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [74]:
#print the keyword in the 10 topics
pprint(lda_model.print_topics(num_words=100))

[(0,
  '0.018*"\n'
  '\n'
  ' " + 0.013*"feel" + 0.013*"like" + 0.013*"people" + 0.012*"help" + '
  '0.011*"want" + 0.011*"time" + 0.011*"know" + 0.011*"anxiety" + '
  '0.009*"leave" + 0.008*"work" + 0.008*"\n'
  ' " + 0.007*"think" + 0.007*"house" + 0.007*"year" + 0.006*"need" + '
  '0.006*"start" + 0.006*"come" + 0.005*"school" + 0.005*"panic" + '
  '0.005*"talk" + 0.005*"try" + 0.005*"day" + 0.005*"good" + 0.004*"thing" + '
  '0.004*"stay" + 0.004*"outside" + 0.004*"friend" + 0.004*"attack" + '
  '0.004*"home" + 0.004*"store" + 0.003*"social" + 0.003*"fear" + '
  '0.003*"place" + 0.003*"right" + 0.003*"advice" + 0.003*"find" + '
  '0.003*"able" + 0.003*"thank" + 0.003*"job" + 0.003*"hard" + 0.003*"  " + '
  '0.003*"look" + 0.003*"grocery" + 0.003*"lot" + 0.003*"etc" + 0.003*"guy" + '
  '0.003*"use" + 0.003*"boyfriend" + 0.003*"live" + 0.003*"issue" + '
  '0.003*"way" + 0.003*"experience" + 0.003*"understand" + 0.003*"walk" + '
  '0.003*"family" + 0.003*"mental" + 0.003*"class" + 0.0

In [78]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=doc_list):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=doc_list)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(100)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,4.0,0.7078,"nan, \n\n , know, feel, help, think, bad, year...",[nan]
1,1,9.0,0.9957,"\n\n , feel, like, leave, know, house, think, ...","[slowly, fall, start, university, feel, like, ..."
2,2,9.0,0.9965,"\n\n , feel, like, leave, know, house, think, ...","[slowly, fall, start, university, feel, like, ..."
3,3,8.0,0.9825,"feel, , like, want, know, panic, \n\n , thin...","[feel, like, kind, anxiety, probably, public, ..."
4,4,7.0,0.6823,", \n\n , \n , panic, time, day, know, leave,...","[hey, guy, write, long, time, like, share, cur..."
...,...,...,...,...,...
95,95,9.0,0.9914,"\n\n , feel, like, leave, know, house, think, ...","[diagnose, month, tell, life, close, friend, s..."
96,96,9.0,0.9920,"\n\n , feel, like, leave, know, house, think, ...","[roughly, 15, year, able, manage, ocd, ritual,..."
97,97,0.0,0.9749,"\n\n , feel, like, people, help, want, time, k...","[trip, 2016, severe, anxiety, leave, house, tr..."
98,98,1.0,0.9882,"\n\n , feel, life, want, walk, year, panic, an...","[feed, terrified, seek, help, difficult, leave..."


In [84]:
df_dominant_topic.head(25)
df_dominant_topic.to_csv(r'ldatest.csv')

In [81]:
from collections import Counter
Counter(df_dominant_topic.Dominant_Topic)

Counter({4.0: 123,
         9.0: 84,
         8.0: 64,
         7.0: 61,
         5.0: 60,
         6.0: 50,
         3.0: 76,
         0.0: 81,
         1.0: 42,
         2.0: 8})

[[(0, 1)],
 [(1, 2),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 2),
  (7, 1),
  (8, 1),
  (9, 3),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 3),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 2),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 2),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 2),
  (36, 1),
  (37, 5),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 2),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 2),
  (67, 1),
  (68, 1),
  (69, 2),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 2),
  (80, 2),
  (81, 1),
  (82, 1),
  (83, 2)],
 [(1, 3),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 2),
  (7, 1),
  (8, 1),
  (9

In [83]:
newest_doc

Unnamed: 0,Body
0,
1,I have slowly fallen into this. Starting at a ...
2,I have slowly fallen into this. Starting at a ...
3,I feel like even if I didn't have any kind of ...
4,Hey guys writing after a long time...... I wou...
...,...
644,I feel like I should be learning a skill or le...
645,"As the title states, i'm lost in life and in a..."
646,Before quarantine I was starting to leave the ...
647,i’ve been on so many different medications ove...
