FOMC声明文の解釈は、ヘッジファンド、自己勘定取引会社、銀行などの短期取引戦略、キャリートレード、ポートフォリオの傾斜、企業の資金調達戦略に役立つものです。

## 依存関係のインストール

In [5]:
# !python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.3.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 479 kB/s eta 0:00:01    |██████████████████▍             | 6.9 MB 581 kB/s eta 0:00:09     |█████████████████████▋          | 8.1 MB 581 kB/s eta 0:00:07
Building wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py) ... [?25ldone
[?25h  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.3.1-py3-none-any.whl size=12047105 sha256=deeacc9410a4734528dcb12edb6ca2fd238517b1d25bf201386f04fe8d9c24a5
  Stored in directory: /tmp/pip-ephem-wheel-cache-374vmjj_/wheels/b7/0d/f0/7ecae8427c515065d75410989e15e5785dd3975fe06e795cd9
Successfully built en-core-web-sm
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-2.3.1
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spac

In [6]:
# install the FedTools package:
!pip install FedTools

# install chart studio (Plotly):
!pip install chart-studio

# import pandas and numpy for data wrangling:
import pandas as pd
import numpy as np

# from FedTools, import the MonetrayPolicyCommittee module to download statements:
from FedTools import MonetaryPolicyCommittee

# import spacy and displaycy for visualisation:
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()
from spacy import displacy

# import Counter for counting:
from collections import Counter

# import plotly for plotting:
import plotly.graph_objects as go




## FOMC statementsのダウンロード

In [13]:

def dataset_parsing():
  '''
  This function calls the MonetaryPolicyCommittee module of the FedTools package
  to collect FOMC Statements. These statements are parsed using SpaCy.

  Inputs: N/A.

  Outputs: dataset: a Pandas DataFrame which contains:

  'FOMC_Statements' - original FOMC Statements downloaded by FedTools.
  'tokenised_data' - tokenised FOMC Statements.
  'lemmatised_data' - lematised FOMC Statements.
  'part_of_speech' - part of speech tags from FOMC Statements.
  'named_entities' - the named entities detected within the FOMC Statements.
  'labels' - the corresponding labels associated with named_entities.
  'number_of_labels' - a dictionary displaying the number of each label detected.
  'items' - the number of times each item is detected within the FOMC Statements.

  '''

  # collect FOMC Statements into DataFrame called dataset:
  dataset = MonetaryPolicyCommittee().find_statements()

  # remove additional operators within the text:
  for i in range(len(dataset)):
    dataset.iloc[i,0] = dataset.iloc[i,0].replace('\\n','. ')
    dataset.iloc[i,0] = dataset.iloc[i,0].replace('\n',' ')
    dataset.iloc[i,0] = dataset.iloc[i,0].replace('\r',' ')
    dataset.iloc[i,0] = dataset.iloc[i,0].replace('\xa0',' ')

  # initialise empty lists:
  tokens = []
  lemma = []
  pos = []
  ents = []
  labels = []
  count = []
  items = []

  # for each document in the pipeline:
  for doc in nlp.pipe(dataset['FOMC_Statements'].astype('unicode').values, batch_size=50, n_threads=10):
      # if the document is successfully parsed:
      if doc.is_parsed:
          # append various data to appropriate categories:
          tokens.append([n.text for n in doc])
          lemma.append([n.lemma_ for n in doc])
          pos.append([n.pos_ for n in doc])
          ents.append([n.text for n in doc.ents])
          labels.append([n.label_ for n in doc.ents])
          count.append(Counter([n.label_ for n in doc.ents]))
          items.append(Counter([n.text for n in doc.ents]))

      # if document parsing fails, return 'None' to maintain DataFrame dimensions:
      else:
          tokens.append(None)
          lemma.append(None)
          pos.append(None)
          ents.append(None)
          labels.append(None)
          count.append(None)
          items.append(None)

  # now assign the lists columns within the dataframe:
  dataset['tokenised_data'] = tokens
  dataset['lemmatised_data'] = lemma
  dataset['part_of_speech'] = pos
  dataset['named_entities'] = ents
  dataset['labels'] = labels
  dataset['number_of_labels'] = count
  dataset['items'] = items

  return dataset

In [16]:
df = dataset_parsing()
df.head()

Constructing links.
Extracting the past 200 FOMC Statements.
Retrieving articles.
........................................................................................................................................................................................................

Unnamed: 0,FOMC_Statements,tokenised_data,lemmatised_data,part_of_speech,named_entities,labels,number_of_labels,items
1994-02-04,Chairman Alan Greenspan announced today that t...,"[Chairman, Alan, Greenspan, announced, today, ...","[Chairman, Alan, Greenspan, announce, today, t...","[PROPN, PROPN, PROPN, VERB, NOUN, SCONJ, DET, ...","[Alan Greenspan, today, the Federal Open Marke...","[PERSON, DATE, ORG, PERSON, ORG, ORDINAL, ORG,...","{'PERSON': 2, 'DATE': 2, 'ORG': 3, 'ORDINAL': 1}","{'Alan Greenspan': 1, 'today': 1, 'the Federal..."
1994-03-22,Chairman Alan Greenspan announced today that t...,"[Chairman, Alan, Greenspan, announced, today, ...","[Chairman, Alan, Greenspan, announce, today, t...","[PROPN, PROPN, PROPN, VERB, NOUN, SCONJ, DET, ...","[Alan Greenspan, today, the Federal Open Marke...","[PERSON, DATE, ORG]","{'PERSON': 1, 'DATE': 1, 'ORG': 1}","{'Alan Greenspan': 1, 'today': 1, 'the Federal..."
1994-04-18,The Federal Reserve today announced two action...,"[The, Federal, Reserve, today, announced, two,...","[the, Federal, Reserve, today, announce, two, ...","[DET, PROPN, PROPN, NOUN, VERB, NUM, NOUN, VER...","[The Federal Reserve, today, two, 3 percent, 3...","[ORG, DATE, CARDINAL, PERCENT, PERCENT, ORG, C...","{'ORG': 7, 'DATE': 3, 'CARDINAL': 3, 'PERCENT'...","{'The Federal Reserve': 1, 'today': 1, 'two': ..."
1994-05-17,Chairman Alan Greenspan announced today that t...,"[Chairman, Alan, Greenspan, announced, today, ...","[Chairman, Alan, Greenspan, announce, today, t...","[PROPN, PROPN, PROPN, VERB, NOUN, SCONJ, DET, ...","[Alan Greenspan, today, the Federal Reserve]","[PERSON, DATE, ORG]","{'PERSON': 1, 'DATE': 1, 'ORG': 1}","{'Alan Greenspan': 1, 'today': 1, 'the Federal..."
1994-08-16,The Federal Reserve announced today the follow...,"[The, Federal, Reserve, announced, today, the,...","[the, Federal, Reserve, announce, today, the, ...","[DET, PROPN, PROPN, VERB, NOUN, DET, VERB, ADJ...","[The Federal Reserve, today, The Federal Reser...","[ORG, DATE, ORG, ORG, NORP, ORG, GPE, GPE, GPE...","{'ORG': 5, 'DATE': 1, 'NORP': 1, 'GPE': 5}","{'The Federal Reserve': 2, 'today': 1, 'Board'..."


In [30]:
def generate_additional_information(dataset):
  '''
  This function generates additional information from the parsed documents, quantifying
  the usage of specific named entities within FOMC Statements.

  Inputs: dataset .

  Outputs: dataset: a Pandas DataFrame which contains:

  'person' - the number of times people are mentioned in each statement.
  'date' - the number of times dates are mentioned within each statement.
  'percent' - the number of times percentages are mentioned within each statement.
  'time' - the number of times a time is mentioned within each statement.
  'ordinal' - the number of times an 'ordinal' ie) "first" is mentioned within each statement.
  'organisations' - the number of times an organisation is mentioned within each statement.
  'money' - the number of times money is mentioned within each statement.
  'event' - the number of times an event is mentioned within each statement.
  'law' - the number of times a law is mentioned within each statement.
  'quantity' - the number of times a quantity is mentioned within each statement.
  'groups' - the number of times specific groups are mentioned within each statement.
  'information_content' -  the number of named entities detected within each statement.

  '''
  # call the function defined above:
#   dataset = dataset_parsing()

  # generate additional information through the detection of named entities:
  dataset['person'] = dataset['number_of_labels'].apply(lambda x: x.get('PERSON'))
  dataset['date'] = dataset['number_of_labels'].apply(lambda x: x.get('DATE'))
  dataset['percent'] = dataset['number_of_labels'].apply(lambda x: x.get('PERCENT'))
  dataset['product'] = dataset['number_of_labels'].apply(lambda x: x.get('PRODUCT'))
  dataset['time'] = dataset['number_of_labels'].apply(lambda x: x.get('TIME'))
  dataset['ordinal'] = dataset['number_of_labels'].apply(lambda x: x.get('ORDINAL'))
  dataset['organisations'] = dataset['number_of_labels'].apply(lambda x: x.get('ORG'))
  dataset['money'] = dataset['number_of_labels'].apply(lambda x: x.get('MONEY'))
  dataset['event'] = dataset['number_of_labels'].apply(lambda x: x.get('EVENT'))
  dataset['law'] = dataset['number_of_labels'].apply(lambda x: x.get('LAW'))
  dataset['quantity'] = dataset['number_of_labels'].apply(lambda x: x.get('QUANTITY'))
  dataset['groups'] = dataset['number_of_labels'].apply(lambda x: x.get('NORP'))

  # replace any 'NaN' values with 0, then calculate the 'information content',as defined
  # by the total number of named entities:
  dataset = dataset.replace(np.nan, 0)
  dataset['information_content'] = dataset.iloc[:,8:].sum(axis = 1)

  return dataset

In [32]:
df = generate_additional_information(df)
print(df.shape)
df.head()

(200, 21)


Unnamed: 0,FOMC_Statements,tokenised_data,lemmatised_data,part_of_speech,named_entities,labels,number_of_labels,items,person,date,...,product,time,ordinal,organisations,money,event,law,quantity,groups,information_content
1994-02-04,Chairman Alan Greenspan announced today that t...,"[Chairman, Alan, Greenspan, announced, today, ...","[Chairman, Alan, Greenspan, announce, today, t...","[PROPN, PROPN, PROPN, VERB, NOUN, SCONJ, DET, ...","[Alan Greenspan, today, the Federal Open Marke...","[PERSON, DATE, ORG, PERSON, ORG, ORDINAL, ORG,...","{'PERSON': 2, 'DATE': 2, 'ORG': 3, 'ORDINAL': 1}","{'Alan Greenspan': 1, 'today': 1, 'the Federal...",2.0,2.0,...,0.0,0.0,1.0,3,0.0,0.0,0.0,0,0.0,24.0
1994-03-22,Chairman Alan Greenspan announced today that t...,"[Chairman, Alan, Greenspan, announced, today, ...","[Chairman, Alan, Greenspan, announce, today, t...","[PROPN, PROPN, PROPN, VERB, NOUN, SCONJ, DET, ...","[Alan Greenspan, today, the Federal Open Marke...","[PERSON, DATE, ORG]","{'PERSON': 1, 'DATE': 1, 'ORG': 1}","{'Alan Greenspan': 1, 'today': 1, 'the Federal...",1.0,1.0,...,0.0,0.0,0.0,1,0.0,0.0,0.0,0,0.0,9.0
1994-04-18,The Federal Reserve today announced two action...,"[The, Federal, Reserve, today, announced, two,...","[the, Federal, Reserve, today, announce, two, ...","[DET, PROPN, PROPN, NOUN, VERB, NUM, NOUN, VER...","[The Federal Reserve, today, two, 3 percent, 3...","[ORG, DATE, CARDINAL, PERCENT, PERCENT, ORG, C...","{'ORG': 7, 'DATE': 3, 'CARDINAL': 3, 'PERCENT'...","{'The Federal Reserve': 1, 'today': 1, 'two': ...",0.0,3.0,...,0.0,0.0,0.0,7,0.0,0.0,0.0,0,1.0,39.0
1994-05-17,Chairman Alan Greenspan announced today that t...,"[Chairman, Alan, Greenspan, announced, today, ...","[Chairman, Alan, Greenspan, announce, today, t...","[PROPN, PROPN, PROPN, VERB, NOUN, SCONJ, DET, ...","[Alan Greenspan, today, the Federal Reserve]","[PERSON, DATE, ORG]","{'PERSON': 1, 'DATE': 1, 'ORG': 1}","{'Alan Greenspan': 1, 'today': 1, 'the Federal...",1.0,1.0,...,0.0,0.0,0.0,1,0.0,0.0,0.0,0,0.0,9.0
1994-08-16,The Federal Reserve announced today the follow...,"[The, Federal, Reserve, announced, today, the,...","[the, Federal, Reserve, announce, today, the, ...","[DET, PROPN, PROPN, VERB, NOUN, DET, VERB, ADJ...","[The Federal Reserve, today, The Federal Reser...","[ORG, DATE, ORG, ORG, NORP, ORG, GPE, GPE, GPE...","{'ORG': 5, 'DATE': 1, 'NORP': 1, 'GPE': 5}","{'The Federal Reserve': 2, 'today': 1, 'Board'...",0.0,1.0,...,0.0,0.0,0.0,5,0.0,0.0,0.0,0,1.0,21.0


In [33]:
def generate_chairperson(dataset):
  '''
  This function uses Named Entity Recognition in order to detect the presence of 
  chairpeople within the FOMC statements. 

  Inputs: dataset: a Pandas DataFrame as defined above.

  Outputs: dataset: a Pandas DataFrame which identifies the FOMC Chairperson.
  '''

  # try to detect specific names within 'items':
  dataset['Greenspan'] = dataset['items'].apply(lambda x: x.get('Alan Greenspan'))
  dataset['Bernanke'] = dataset['items'].apply(lambda x: x.get('Ben S. Bernanke'))
  dataset['Yellen'] = dataset['items'].apply(lambda x: x.get('Janet L. Yellen'))
  dataset['Powell'] = dataset['items'].apply(lambda x: x.get('Jerome H. Powell'))

  # replace all 'Nan' values with 0:
  dataset = dataset.replace(np.nan, 0)

  return dataset


In [34]:
def plot_figure(dataset):
  '''
  This function constructs a Plotly chart by calling the above functions to generate
  the dataset, and subsequently plotting relevant data. 
  '''

  # define the dataset as a global variable, which can be used outside of the function:
#   global dataset
  # call the above functions to generate the required data:
#   dataset = generate_additional_information()
  dataset = generate_chairperson(dataset)

  # initialise figure:
  fig = go.Figure()

  # add figure traces:
  fig.add_trace(go.Scatter(x = dataset.index, y = dataset['information_content'],
                           mode = 'lines',
                           name = 'Information Content',
                           connectgaps=True))
  
  fig.add_trace(go.Scatter(x = dataset.index, y = dataset['percent'],
                           mode = 'lines',
                           name = 'Number of times "Percentage" mentioned',
                           connectgaps=True))
  
  fig.add_trace(go.Scatter(x = dataset.index, y = dataset['person'],
                           mode = 'lines',
                           name = 'Number of People mentioned',
                           connectgaps=True))
  
  fig.add_trace(go.Scatter(x = dataset.index, y = dataset['money'],
                           mode = 'lines',
                           name = 'Number of times Money mentioned',
                           connectgaps=True))
  
  fig.add_trace(go.Scatter(x = dataset.index, y = dataset['quantity'],
                           mode = 'lines',
                           name = 'Number of Quantities mentioned',
                           connectgaps=True))
  
  fig.add_trace(go.Scatter(x = dataset.index, y = dataset['event'],
                           mode = 'lines',
                           name = 'Number of Events mentioned',
                           connectgaps=True))
  
  fig.add_trace(go.Scatter(x = dataset.index, y = dataset['organisations'],
                           mode = 'lines',
                           name = 'Number of Organisations mentioned',
                           connectgaps=True))

  # add a rangeslider and buttons:
  fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=list([
            dict(count=1, label="YTD", step="year", stepmode="todate"),
            dict(count=5, label="5 Years", step="year", stepmode="backward"),
            dict(count=10, label="10 Years", step="year", stepmode="backward"),
            dict(count=15, label="15 Years", step="year", stepmode="backward"),
            dict(label="All", step="all")
        ]))) 

  # add a chart title and axis title:
  fig.update_layout(
    title="FOMC Named Entity Recognition",
    xaxis_title="Date",
    yaxis_title="",
    font=dict(
        family="Arial",
        size=11,
        color="#7f7f7f"
    ))
  
  # add toggle buttons for dataset display:
  fig.update_layout(
      updatemenus=[
          dict(
            buttons=list([
                  dict(
                    label = 'All',
                    method = 'update',
                    args = [{'visible': [True, True, True, True, True, True, True]}]
                  ),

                  dict(
                    label = 'Information Content',
                    method = 'update',
                    args = [{'visible': [True, False, False, False, False, False, False]}]
                  ),

                  dict(
                    label = 'Percentage mentions',
                    method = 'update',
                    args = [{'visible': [False, True, False, False, False, False, False,]}]
                  ),

                  dict(
                    label = 'People mentions',
                    method = 'update',
                    args = [{'visible': [False, False, True, False, False, False, False,]}]
                  ),

                  dict(
                    label = 'Money mentions',
                    method = 'update',
                    args = [{'visible': [False, False, False, True, False, False, False,]}]
                  ),

                  dict(
                    label = 'Quantity mentions',
                    method = 'update',
                    args = [{'visible': [False, False, False, False, True, False, False,]}]
                  ),

                  dict(
                    label = 'Event mentions',
                    method = 'update',
                    args = [{'visible': [False, False, False, False, False, True, False,]}]
                  ),

                  dict(
                    label = 'Organisation mentions',
                    method = 'update',
                    args = [{'visible': [False, False, False, False, False, False, True]}]
                  ),
              ]),
              direction="down",
              pad={"r": 10, "t": 10},
              showactive=True,
              x=1.0,
              xanchor="right",
              y=1.2,
              yanchor="top"
          ),])
  
  return fig.show()


In [37]:
plot_figure(df)

In [38]:
displacy.render(nlp(dataset['FOMC_Statements'][103]), jupyter = True, style = 'ent')