In [27]:
import re
import numpy as np
import pandas as pd

from pprint import pprint
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.colors as mcolors

In [28]:
# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel

In [29]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Suppress DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore",category=DeprecationWarning)

# Only emit log records at ERROR level or above, prefixed with a timestamp and level.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [30]:
# NLTK Stop words
from nltk.corpus import stopwords

# NLTK's English list followed by this notebook's own additions. The additions
# keep their original order (repeats included); the group labels below are only
# a reading aid.
stop_words = stopwords.words('english') + [
    # generic filler words
    'from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_',
    'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some',
    'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack',
    'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even',
    'also', 'may', 'take', 'come',
    # URL fragments and stray punctuation
    'http', 'https', 'www', 'com', '@', '...', '…',
    # script residue seen at the start of the raw reports (e.g. "var ref = document.referrer")
    'var', 'ref', 'document', 'referrer', 'var', 'bbpath', 'href', 'archive',
    'function',
    # report title / boilerplate
    'beige', 'book', 'federal', 'reserve', 'summary',
    # more script / navigation residue
    'indexof', 'backtoresults', 'history', 'go', 'else', 'window', 'location',
    'back', 'search',
    # report boilerplate
    'this', 'report', 'prepared', 'federal', 'reserve', 'bank', 'summarizes',
    # month names
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august',
    'september', 'october', 'november', 'december',
    # place names
    'atlanta', 'boston', 'chicago', 'cleveland', 'dallas', 'kansas',
    'minneapolis', 'york', 'philadelphia', 'richmond', 'san', 'francisco',
    'st', 'louis',
    # remaining boilerplate
    'officials', 'comments', 'views', 'commentary', 'national', 'based',
    'information', 'collected', 'received', 'contacts', 'outside', 'system',
    'page', 'url', 'link', 'district', 'districts', 'activity',
]

In [31]:
# Load the raw dataset. Per the preview below, each row holds one report text
# (`district_report`) together with its date, district code and rate columns.
df = pd.read_csv('./data/complete_dataset.csv')
df.head()

Unnamed: 0,district_report,date,district,rate,rate_change,is_up
0,\n\r\n var ref = document.referrer;\r\n ...,2019-04,at,2.532381,,0
1,\n\r\n var ref = document.referrer;\r\n ...,2019-03,at,2.570952,0.015231,1
2,\n\r\n var ref = document.referrer;\r\n ...,2019-01,at,2.71381,0.055566,1
3,\n\r\n var ref = document.referrer;\r\n ...,2018-12,at,2.832632,0.043784,1
4,\n\r\n var ref = document.referrer;\r\n ...,2018-10,at,3.152273,0.112842,1


In [32]:
def sent_to_words(sentences):
    """Lazily tokenize raw documents.

    For each document: strip e-mail-like tokens, collapse every run of
    whitespace to a single space, drop single quotes, then hand the text to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``.

    Args:
        sentences: Iterable of raw documents. Items that are not ``str``
            (for example NaN from an empty CSV cell) are converted with
            ``str()`` before any regex is applied.

    Yields:
        The value returned by ``simple_preprocess`` for each document, in
        input order.
    """
    for sent in sentences:
        # Coerce first: re.sub raises TypeError on non-string input, so doing
        # this only at the simple_preprocess call would be too late.
        sent = str(sent)
        # Raw strings keep \S and \s as regex classes rather than (invalid)
        # Python string escapes.
        sent = re.sub(r'\S*@\S*\s?', '', sent)  # remove emails
        sent = re.sub(r'\s+', ' ', sent)  # remove newline chars
        sent = re.sub(r"'", "", sent)  # remove single quotes
        yield gensim.utils.simple_preprocess(sent, deacc=True)

# Pull the raw report texts out of the frame as a plain Python list, then
# tokenize every report (sent_to_words is a generator, so materialise it).
data = df['district_report'].values.tolist()
data_words = list(sent_to_words(data))
#print(data_words[:1])

In [33]:
# Build the bigram and trigram models
# The trigram model is trained on the bigram-transformed corpus, so it sees
# already-joined pairs as single tokens.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
# Phraser wrappers are what process_words applies to each token list.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [34]:
def process_words(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Remove Stopwords, Form Bigrams, Trigrams and Lemmatization

    Pipeline, applied to every document in ``texts``:

    1. re-tokenize with ``simple_preprocess`` and drop stop words,
    2. apply the module-level ``bigram_mod`` and then ``trigram_mod`` phrasers,
    3. lemmatize with spaCy, keeping only tokens whose part-of-speech tag is
       in ``allowed_postags``,
    4. drop stop words once more, because lemmatization can turn a word into
       a stop word.

    Args:
        texts: Iterable of documents (token lists, as produced by
            ``sent_to_words``).
        stop_words: Collection of words to discard.
        allowed_postags: spaCy coarse POS tags to keep.

    Returns:
        list[list[str]]: one cleaned token list per input document.
    """
    # Build the lookup structures once: membership is tested for every token
    # in two separate passes, and scanning a list each time is needlessly slow.
    stop_set = set(stop_words)
    keep_pos = set(allowed_postags)

    def drop_stop_words(doc):
        # str(doc) turns a token list into its repr; simple_preprocess then
        # re-tokenizes that string.
        return [word for word in simple_preprocess(str(doc)) if word not in stop_set]

    texts = [drop_stop_words(doc) for doc in texts]
    texts = [bigram_mod[doc] for doc in texts]
    # The bigram phraser is deliberately applied again here to match the
    # original pipeline exactly.
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]

    # NOTE(review): the 'en' shortcut name is not available in spaCy 3+;
    # confirm the installed version before upgrading.
    nlp = spacy.load('en', disable=['parser', 'ner'])
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in keep_pos])

    # remove stopwords once more after lemmatization
    return [drop_stop_words(doc) for doc in texts_out]

# Run the full cleaning pipeline (stop words, phrases, lemmatization) over the tokenized reports.
data_ready = process_words(data_words)  # processed Text Data!

In [35]:
# Create Dictionary: maps every token in the cleaned documents to an integer id
id2word = corpora.Dictionary(data_ready)

# Create Corpus: Term Document Frequency
# (one bag-of-words representation per cleaned document, in document order)
corpus = [id2word.doc2bow(text) for text in data_ready]

In [36]:
# Build LDA model
# Four topics with a fixed random_state. per_word_topics=True matters
# downstream: format_topics_sentences checks this flag to decide how to unpack
# the per-document result returned by indexing the model.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=4, 
                                           random_state=20190511,
                                           update_every=1,
                                           chunksize=50,
                                           passes=10,
                                           alpha='auto',
                                           iterations=50,
                                           per_word_topics=True)

In [37]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    """Build a per-document table of the dominant LDA topic.

    Args:
        ldamodel: Trained topic model. Indexing it with ``corpus`` must yield
            one entry per document, and it must expose ``per_word_topics`` and
            ``show_topic(topic_id)``.
        corpus: Bag-of-words corpus to score. Defaults to the module-level
            ``corpus`` as it existed when this function was defined.
        texts: One item per document, attached as the last column. Defaults
            to the module-level ``data`` as it existed at definition time.

    Returns:
        pandas.DataFrame: columns ``Dominant_Topic`` (topic id),
        ``Perc_Contribution`` (its weight, rounded to 4 decimals),
        ``Topic_Keywords`` (comma-separated words from ``show_topic``) and an
        unnamed column (``0``) holding ``texts``. Rows are aligned by document
        position; a document for which the model reports no topic gets NaN in
        the three topic columns instead of shifting later rows.

    Raises:
        ValueError: if ``ldamodel`` is not supplied.
    """
    if ldamodel is None:
        raise ValueError("format_topics_sentences requires a trained ldamodel")

    columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    keywords_by_topic = {}  # show_topic() output is the same for every document
    records = []
    doc_ids = []

    # Get main topic in each document
    for doc_id, row_list in enumerate(ldamodel[corpus]):
        # With per_word_topics enabled each result is a tuple whose first
        # element is the (topic_id, weight) list.
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            continue
        # max() returns the first highest-weight pair, the same element a
        # stable descending sort would put first.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        records.append((topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num]))
        doc_ids.append(doc_id)

    # Build the frame once; appending row by row is quadratic and
    # DataFrame.append no longer exists in pandas 2.x.
    sent_topics_df = pd.DataFrame(records, columns=columns, index=doc_ids)

    # Add original text to the end of the output (aligned on document position)
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)

In [38]:
# Label every document with its dominant topic; the cleaned token lists
# (data_ready) are attached as the text column.
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_ready)

# Format: expose the row index as a column, then give all five columns readable names
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head()

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,1.0,0.6633,"increase, price, demand, estate, service, real...","[economic, sixth, business, indicate, economic..."
1,1,1.0,0.6734,"increase, price, demand, estate, service, real...","[economic, sixth, business, economic, continue..."
2,2,1.0,0.6796,"increase, price, demand, estate, service, real...","[economic, sixth, business, remain, largely, p..."
3,3,1.0,0.6348,"increase, price, demand, estate, service, real...","[economic, sixth, business, describe, economic..."
4,4,1.0,0.638,"increase, price, demand, estate, service, real...","[economic, sixth, business, indicate, economic..."


In [39]:
# Index-on-index join: both frames carry the default positional index, so row i
# of the raw data is paired with the topic row for document i.
df_clean = df.join(df_dominant_topic)

In [40]:
from bs4 import BeautifulSoup 

In [41]:
def cleaner(raw_text):
    """Convert one raw report into a string of lowercase content words.

    Steps: strip HTML markup, replace every non-letter character with a space,
    lowercase and split on whitespace, then drop stop words.

    The stop words are taken from the module-level ``stop_words`` list (NLTK
    English plus this notebook's additions). That is the same vocabulary the
    previous inline copy spelled out, without duplicating the list and without
    re-reading the NLTK corpus on every call.

    Args:
        raw_text: Raw report text/HTML. ``None`` and NaN (an empty cell in the
            source frame) are treated as an empty document.

    Returns:
        str: the remaining words joined by single spaces; ``""`` when nothing
        is left.
    """
    # Missing cells arrive from pandas as None/NaN, which BeautifulSoup cannot parse.
    if raw_text is None or (isinstance(raw_text, float) and raw_text != raw_text):
        return ""

    # 1. Remove HTML.
    # NOTE(review): no parser is named, so bs4 uses whichever one is installed;
    # consider pinning one so results do not depend on the environment.
    document = BeautifulSoup(raw_text).get_text()

    # 2. Remove non-letters.
    letters_only = re.sub("[^a-zA-Z]", " ", document)

    # 3. Convert to lower case, split into individual words.
    words = letters_only.lower().split()

    # 4. Searching a set is much faster than searching a list, so look words
    # up in a set built from the shared stop-word list.
    stops = set(stop_words)

    # 5. Remove stop words.
    meaningful_words = [word for word in words if word not in stops]

    # 6. Join the words back into one string separated by space,
    # and return the result.
    return " ".join(meaningful_words)

In [42]:
# Add a plain-text version of every raw report: HTML stripped, lowercased, stop words removed.
df_clean['clean_text'] = df_clean['district_report'].apply(cleaner)

In [43]:
# Preview the combined frame: raw columns, topic columns and the new clean_text column.
df_clean.head()

Unnamed: 0,district_report,date,district,rate,rate_change,is_up,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text,clean_text
0,\n\r\n var ref = document.referrer;\r\n ...,2019-04,at,2.532381,,0,0,1.0,0.6633,"increase, price, demand, estate, service, real...","[economic, sixth, business, indicate, economic...",economic sixth business indicated economic con...
1,\n\r\n var ref = document.referrer;\r\n ...,2019-03,at,2.570952,0.015231,1,1,1.0,0.6734,"increase, price, demand, estate, service, real...","[economic, sixth, business, economic, continue...",economic sixth business reported economic cont...
2,\n\r\n var ref = document.referrer;\r\n ...,2019-01,at,2.71381,0.055566,1,2,1.0,0.6796,"increase, price, demand, estate, service, real...","[economic, sixth, business, remain, largely, p...",economic sixth business remained largely posit...
3,\n\r\n var ref = document.referrer;\r\n ...,2018-12,at,2.832632,0.043784,1,3,1.0,0.6348,"increase, price, demand, estate, service, real...","[economic, sixth, business, describe, economic...",economic reports sixth business described econ...
4,\n\r\n var ref = document.referrer;\r\n ...,2018-10,at,3.152273,0.112842,1,4,1.0,0.638,"increase, price, demand, estate, service, real...","[economic, sixth, business, indicate, economic...",economic reports sixth business indicated econ...


In [None]:
#df_clean['Text'][0]

In [26]:
#df_clean['clean_text'][0]

'economic sixth business indicated economic conditions improved albeit modest pace previous majority expect growth continue relatively pace next months labor market remained tight wage growth remained subdued jobs exception demand hard fill positions firms noted nonlabor input cost pressures muted though observed increases related tariffs freight construction costs overall retail sales grew slightly automotive dealers remarked demand vehicles weakened since previous hospitality reported solid residential real estate indicated improvement home sales increased supply reduced price pressures since end last year balance commercial real estate conditions continued advance manufacturers cited increases new orders production levels bankers noted banking remained steady employment wages business maintained continued add staffing levels several communicated hiring remained limited tight labor markets firms mentioned setting satellite locations larger metropolitan cities moving populated suburba

In [46]:
# Persist the combined frame; index=False keeps the positional index out of the file.
df_clean.to_csv('./data/complete_clean_text.csv', index=False)