In [1]:
#pip install beautifulsoup4
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re # regular expression
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation
import string
from sklearn.feature_extraction.text import CountVectorizer

from datetime import datetime
import pandas as pd 
import numpy as np
import os
import time
cwd = os.getcwd(); print(cwd)

# Run below to 
#%run dev_functions.py # last error 1980-09

c:\Users\justi\Documents\GitHub\Thread-620\src


### Resources
[Python - BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#getting-help)

## Summary of Commentary on Current Economic Conditions by Federal Reserve District
### Data sources
- [Federal Reserve - Board Archive](https://www.federalreserve.gov/monetarypolicy/beige-book-archive.htm)
- Archive from the [Federal Reserve Bank of Minneapolis](https://www.minneapolisfed.org/region-and-community/regional-economic-indicators/beige-book-archive)  
    - Archive [List - National Summary](https://www.minneapolisfed.org/region-and-community/regional-economic-indicators/beige-book-archive)

## Database split
Depending on how the format has changed over the years
* 1970 May - 1980 August: No format

# (1) 1970 May - 1980 August

## Start with National Summary

In [2]:
df_fin = pd.read_csv("data_beigebook.txt")
# Process desk files
df_fin['section'] = [str(each_entry).lower() for each_entry in df_fin['category']];
df_fin['section'] = [str(each_entry).replace("\n", "") for each_entry in df_fin['section']];
df_fin['section'] = [str(each_entry).strip() for each_entry in df_fin['section']];

df_fin[['section']].value_counts()

# Evaluate what goes in 
#df_fin['category'].value_counts().nlargest(20)

# Join with matched for higher level overview
df_fin_match = pd.read_csv("data_beigebook_match.csv", dtype =str)
df_fin = pd.merge(df_fin, df_fin_match, how = 'left', on= 'section')

#df_fin.loc[(df_fin['section'] == '28-Jan-81'), "text"].iloc[0]

list(df_fin.columns)

['time', 'region', 'category', 'text', 'section', 'count', 'section_overview']

In [3]:
df_fin[['region','section_overview']].value_counts()

region         section_overview                              
national       overview                                          882
kansas city    agriculture and natural resource                  534
minneapolis    agriculture and natural resource                  483
cleveland      manufacturing and other business activity         467
richmond       manufacturing and other business activity         429
                                                                ... 
san francisco  business activity                                   6
minneapolis    business activity                                   6
new york       agriculture and natural resource                    4
minneapolis    minority- and women-owned business enterprises      3
atlanta        business sentiment and outlook                      1
Length: 126, dtype: int64

In [4]:
df_fin.head()

Unnamed: 0,time,region,category,text,section,count,section_overview
0,1970-05-20,national,overview,This initial report of economic conditions in ...,overview,4511,overview
1,1970-06-17,national,overview,Comments on economic conditions in the twelve ...,overview,4511,overview
2,1970-07-15,national,overview,"Current comment by businessmen and bankers, as...",overview,4511,overview
3,1970-08-12,national,overview,The consensus of the reports by the twelve Fed...,overview,4511,overview
4,1970-09-09,national,overview,The reports in this Redbook are more optimisti...,overview,4511,overview


## Join Economic data to be used to compare - predictive.

1. Unversity of Michigan consumer sentiment
2. University of Michgian inflation expectation

## Clean up texts
1. Remove regular expressions
2. Tokenize

In [5]:
df_fin['text_clean'] = [str(each_entry).replace("\n", "") for each_entry in df_fin['text']];
df_fin['text_clean'] = [str(each_entry).lower() for each_entry in df_fin['text']];

# Remove stepwords for tokenization

## Example of extracting features

1. Vectorizer - use of n-grams
2. Create array of sentences.
3. PoS (Part-of-Speech) tagging: (LINK)[https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html]

In [18]:
tst = ''.join(df_fin.loc[(df_fin.time == '1970-06-17'), 'text_clean'])
str_txt = sent_tokenize(tst)
str_txt[0:5]

['comments on economic conditions in the twelve federal reserve districts indicate that in most districts bankers and businessmen find economic activity has been weakening, and they generally expected the decline will continue.',
 'in virtually all districts, unemployment is rising, and in many, labor markets are easing noticeably.',
 'retail trade is weaker almost everywhere, and consumers are "downgrading" and bargain-hunting.',
 'a few districts report large cuts in capital spending.',
 'in a number of districts, special note was made of the profit squeeze that is affecting many businesses, and in some, concern was expressed about a decline in corporate liquidity.']

In [None]:
# PoS tagging - figure out the word counts of specifics for sentiments?
# JJ / JJR - magnitude
# NNS / VBP / VBG / NN / VB

In [36]:
nltk.pos_tag(word_tokenize(str_txt[0]))

[('comments', 'NNS'),
 ('on', 'IN'),
 ('economic', 'JJ'),
 ('conditions', 'NNS'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('twelve', 'JJ'),
 ('federal', 'JJ'),
 ('reserve', 'NN'),
 ('districts', 'NNS'),
 ('indicate', 'VBP'),
 ('that', 'IN'),
 ('in', 'IN'),
 ('most', 'JJS'),
 ('districts', 'NNS'),
 ('bankers', 'NNS'),
 ('and', 'CC'),
 ('businessmen', 'NNS'),
 ('find', 'VBP'),
 ('economic', 'JJ'),
 ('activity', 'NN'),
 ('has', 'VBZ'),
 ('been', 'VBN'),
 ('weakening', 'VBG'),
 (',', ','),
 ('and', 'CC'),
 ('they', 'PRP'),
 ('generally', 'RB'),
 ('expected', 'VBD'),
 ('the', 'DT'),
 ('decline', 'NN'),
 ('will', 'MD'),
 ('continue', 'VB'),
 ('.', '.')]

In [42]:
str_txt

is considerable agreement that (1) consumers remain pessimistic, (2) outlays for plant and equipment are being cut back or deferred, (3) labor markets are easing but wage costs are not, and (4) growth in the money supply is too fast.',
 'reports from around the district indicate widespread pessimism on the part of consumers.',
 'large department stores in the region report poor sales for large luxury items all the way down to small inexpensive goods.',
 'one department store executive quipped that "... only $12.95 shirts on sale for $3.98 are moving—and even then customers aren\'t waiting in line".',
 'hence, merchandising firms are being extremely cautious in stocking for fall and winter.',
 'there is increasing evidence that industrial firms in pennsylvania, new jersey, and delaware are canceling and postponing outlays for new plant and equipment.',
 'a canvassing of large manufacturers in the third district shows that since april there has been a marked cutback in spending plans for

In [41]:
tst = ''.join(df_fin.loc[(df_fin.time == '1970-06-17'), 'text_clean'])
str_txt = sent_tokenize(tst)

count_vect = CountVectorizer(analyzer='word', ngram_range=(2, 3))
bow_rep = count_vect.fit_transform(str_txt)



bow_rep.toarray()
count_vect.vocabulary_



 policy': 8557,
 'in only': 5988,
 'only one': 9035,
 'one federal': 8992,
 'reserve district': 10568,
 'district st': 3784,
 'st louis': 11505,
 'louis was': 7297,
 'was recent': 14152,
 'recent economic': 10181,
 'activity regarded': 221,
 'regarded as': 10336,
 'as good': 1381,
 'good and': 5181,
 'in kansas': 5929,
 'kansas city': 6895,
 'city it': 2624,
 'it was': 6806,
 'was deemed': 14111,
 'deemed fair': 3403,
 'fair to': 4462,
 'to good': 13365,
 'in only one': 5989,
 'only one federal': 9037,
 'one federal reserve': 8993,
 'federal reserve district': 4532,
 'reserve district st': 10571,
 'district st louis': 3785,
 'st louis was': 11507,
 'louis was recent': 7298,
 'was recent economic': 14153,
 'recent economic activity': 10182,
 'economic activity regarded': 3997,
 'activity regarded as': 222,
 'regarded as good': 10338,
 'as good and': 1382,
 'good and in': 5182,
 'and in kansas': 711,
 'in kansas city': 5930,
 'kansas city it': 6898,
 'city it was': 2625,
 'it was deemed'

In [90]:
tst = word_tokenize(df_fin['text_clean'].iloc[0])
#[tst.remove(stop_words) for stop_words in set(stopwords.words("english")) if stop_words in tst]

for stop_words in list(stopwords.words("english")):
    try: 
        tst.remove(stop_words)
    except: # Do nothing
        continue
#tst.remove("in")

tkntst

token_txt = [word for word in tst if not word in stopwords.words()]
token_txt

def rm_stopwords(text):
    stop_words = set(stopwords.words("english"))
    text_return = text
    for idx, strtxt in enumerate(text):
        if strtxt in stop_words:
            text_return = text_return.replace(strtxt, "")
    return text_return

#df_fin['text_clean'] = [str(each_entry).lower() for each_entry in df_fin['text_clean'] if each_entry not in mystopwords];
rm_stopwords(df_fin['text_clean'].iloc[0])

# Tokenize as a whole

#for sentence in df_fin['text']:
#sent_tokenize(df_fin['text'].iloc[0])
#word_tokenize(df_fin['text'].iloc[0])

def preprocess_corpus(texts):
    mystopwords = set(stopwords.words("english"))
    def remove_stops_digits(tokens):
        return [token.lower() for token in tokens if token not in mystopwords or not token.isdigit() and token not in punctuation]
    return [remove_stops_digits(word_tokenize(text)) for text in texts]
preprocess_corpus(df_fin['text'].iloc[0])

In [None]:
# The cleaning - better off with excel file and copy-paste

# to further think about
# Read through couple of the notes and see the text and judge

# note that for example of sf fed 2012 has some error pulling data, 
#some overviews are noted as 

# also
#retail -- consumer activities?

# Capital spend (in manufacturing) and business activities?


## Collect New data

1. Enter dates,
2. Process the cleanup
3. Append the data