In [3]:
import os
import pandas as pd

RATE_SPEECHES_FILE = "joined.parquet.gzip"
assert os.path.exists(RATE_SPEECHES_FILE), f"file not present: {RATE_SPEECHES_FILE}"

rate_speeches = pd.read_parquet(RATE_SPEECHES_FILE)


In [4]:
import nltk
nltk.download('punkt_tab')

from nltk import word_tokenize
from nltk.tokenize import sent_tokenize


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [5]:
%%time

def rate_speeches_text_length():
  '''
  reload rate_speeches or update it with text lengths
  '''
  global rate_speeches
  RATE_SPEECHES_TL_FILE = 'rate_speeches.text_length.parquet.gzip'

  if os.path.exists(RATE_SPEECHES_TL_FILE):
    print(f"loading {RATE_SPEECHES_TL_FILE}...")
    rate_speeches = pd.read_parquet(RATE_SPEECHES_TL_FILE)
  else:
    rate_speeches['text_length'] = rate_speeches['extracted_text'].apply(lambda x: len(word_tokenize(x)))
    rate_speeches.to_parquet(RATE_SPEECHES_TL_FILE, compression='gzip')

rate_speeches_text_length();

print(rate_speeches['text_length'].describe())
rate_speeches['extracted_text'][0]


loading rate_speeches.text_length.parquet.gzip...
count     3434.000000
mean      3307.266453
std       2296.389089
min          0.000000
25%       1809.000000
50%       2753.000000
75%       4115.750000
max      26091.000000
Name: text_length, dtype: float64
CPU times: user 478 ms, sys: 258 ms, total: 736 ms
Wall time: 678 ms


'Willem F. Duisenberg, President of the European Central Bank,Thursday, 4 March 1999Ladies and gentlemen, the Vice-President and I are here today to report on the outcome of today\'s meetings of the Governing Council and of the General Council of the European Central Bank.Let me start with the Governing Council\'s discussion onrecent economic developments and the decisions that the Governing Council has taken today in the field of monetary policy.After a comprehensive and careful examination of recent trends and ongoing evaluations of the economic outlook for the euro area economy, there was consensus that some of the risks identified earlier, in particular with regard to real GDP growth, had materialised in the fourth quarter of 1998. There was also consensus that the impact of these developments on the balance of risks for price stability would need to be examined further in the context of the monetary policy strategy adopted by the Eurosystem. At this juncture, taking into account a

In [6]:

def rate_speeches_sent_tokenize():
  '''
  reload rate_speeches or update it with `sent_tokenize`
  '''
  global rate_speeches
  RATE_SPEECHES_TOKENIZED_FILE = 'rate_speeches.sent_tokenize.parquet.gzip'

  if os.path.exists(RATE_SPEECHES_TOKENIZED_FILE):
    print(f"loading {RATE_SPEECHES_TOKENIZED_FILE}...")
    rate_speeches = pd.read_parquet(RATE_SPEECHES_TOKENIZED_FILE)
  else:
    rate_speeches['extracted_text'] = rate_speeches['extracted_text'].apply(sent_tokenize)
    rate_speeches.to_parquet(RATE_SPEECHES_TOKENIZED_FILE, compression='gzip')

rate_speeches_sent_tokenize()


loading rate_speeches.sent_tokenize.parquet.gzip...


In [19]:
import os
import nltk
import gensim
import numpy as np
import unicodedata

from itertools import groupby
from unicodedata import category as unicat

from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet as wn
from nltk.chunk import tree2conlltags
from nltk.probability import FreqDist
from nltk.chunk.regexp import RegexpParser
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin
from tensorflow.keras.utils import pad_sequences


class KeyphraseExtractor(BaseEstimator, TransformerMixin):
    """
    Extract adverbial and adjective phrases, and transform
    documents into lists of these keyphrases, with a total
    keyphrase lexicon limited by the nfeatures parameter
    and a document length limited/padded to doclen
    """
    def __init__(self, nfeatures: int, doclen: int):
        self.grammar = r'KT: {(<RB.> <JJ.*>|<VB.*>|<RB.*>)|(<JJ> <NN.*>)}'
        # self.grammar = r'KT: {(<RB.*> <VB.>|<RB.>|<JJ.> <NN.*>)}'
        # self.grammar = r'KT: {<RB.>|<JJ.>}'
        self.chunker = RegexpParser(self.grammar)
        self.nfeatures = nfeatures
        self.doclen = doclen
        self._curr = 0

    def normalize(self, sent):
        """
        Removes punctuation from a tokenized/tagged sentence and
        lowercases words.
        """
        is_punct = lambda word: all(unicat(c).startswith('P') for c in word)
        sent = filter(lambda t: not is_punct(t[0]), sent)
        sent = map(lambda t: (t[0].lower(), t[1]), sent)
        return list(sent)

    def extract_candidate_phrases(self, sents, call_no):
        """
        For a document, parse sentences using our chunker created by
        our grammar, converting the parse tree into a tagged sequence.
        Extract phrases, rejoin with a space, and yield the document
        represented as a list of it's keyphrases.
        """
        print(f">> \t [{call_no}] extract_candidate_phrases...")

        for i, sent in enumerate(sents):
            tokens = word_tokenize(sent)
            pos_tags = pos_tag(tokens)
            normalized = self.normalize(pos_tags)
            # print(f"[{self._curr}] sent #{i}: {sent}")
            # print(f"[{self._curr}] \t {normalized}")

            chunks = tree2conlltags(self.chunker.parse(normalized))
            if not chunks or all(chunk[-1] == 'O' for chunk in chunks):
                #print(f"No valid chunks found in sentence: {sent}")
                continue

            phrases = [
                " ".join(word for word, pos, chunk in group).lower()
                for key, group in groupby(
                    chunks, lambda term: term[-1] != 'O'
                ) if key
            ]
            for phrase in phrases:
                yield phrase

    def fit(self, documents, y=None):
        return self

    def get_lexicon(self, keydocs):
        """
        Build a lexicon of size nfeatures
        """
        keyphrases = [keyphrase for doc in keydocs for keyphrase in doc]
        print("Keyphrases:", keyphrases[:5])
        fdist = FreqDist(keyphrases)
        counts = fdist.most_common(self.nfeatures)
        print("Frequency counts:", counts[:5])
        lexicon = [phrase for phrase, count in counts]
        return {phrase: idx+1 for idx, phrase in enumerate(lexicon)}

    def clip(self, keydoc, lexicon):
        """
        Remove keyphrases from documents that aren't in the lexicon
        """
        return [lexicon[keyphrase] for keyphrase in keydoc
            if keyphrase in lexicon.keys()]

    def transform(self, documents):
      self._curr += 1
      print(f">> [{self._curr}] KeyphraseExtractor.transform: {len(documents)}...")
      docs = [list(self.extract_candidate_phrases(doc, i)) for i, doc in enumerate(documents)]
      lexicon = self.get_lexicon(docs)
      clipped = [list(self.clip(doc, lexicon)) for doc in docs]
      return pad_sequences(clipped, maxlen=self.doclen)

In [8]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

def data_set(view):
  X = view['extracted_text']
  display(X.describe())

  # view = rate_speeches[:doclen]
  labels = view["Direction"].values
  label_mapping = {label: idx for idx, label in enumerate(sorted(set(labels)))}
  encoded_labels = np.array([label_mapping[label] for label in labels])

  # convert to one-hot encoding
  num_classes = len(label_mapping)
  print(f"num_classes: {num_classes}")
  y = to_categorical(encoded_labels, num_classes=num_classes)
  return X, y, num_classes


In [9]:
!pip install scikeras

Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.13.0


In [10]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dropout, Dense, Input
from sklearn.pipeline import Pipeline
from scikeras.wrappers import KerasClassifier

def create_lstm_model(vocab_size: int, input_length: int, num_classes: int):
    model = Sequential([
        Input(shape=(input_length,), name="input_layer"),  # explicit input layer
        Embedding(input_dim=vocab_size, output_dim=128, name="embedding_layer"),
        LSTM(128, return_sequences=True, name="lstm_layer_1"),
        Dropout(0.2, name="dropout_layer_1"),
        LSTM(64, name="lstm_layer_2"),
        Dropout(0.2, name="dropout_layer_2"),
        Dense(num_classes, activation="softmax", name="output_layer")
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


In [11]:
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [12]:

# X, y, num_classes = data_set(rate_speeches[:60])

# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# keyphrase_extractor = KeyphraseExtractor(nfeatures=10000, doclen=100)
# X_train = keyphrase_extractor.fit_transform(X_train)
# X_val = keyphrase_extractor.transform(X_val)


In [13]:

# model = create_lstm_model(vocab_size=10000, input_length=100, num_classes=num_classes);

# history = model.fit(
#     X_train, y_train,
#     validation_data=(X_val, y_val),
#     epochs=10,
#     batch_size=32
# )


In [14]:

# loss, accuracy = model.evaluate(X_val, y_val)
# print(f"Validation Accuracy: {accuracy:.2f}")


In [15]:
class DebugTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        print("Data Shape Before LSTM:", X.shape)
        # print("Sample Data Before LSTM:", X[0])  # print a sample
        return X


In [16]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from joblib import Memory

X, y, num_classes = data_set(rate_speeches)


Unnamed: 0,extracted_text
count,3434
unique,3434
top,"[Willem F. Duisenberg, President of the Europe..."
freq,1


num_classes: 3


In [20]:

# set up caching directory
memory = Memory(location="cache_directory", verbose=0)

pipeline = Pipeline([
    ("keyphrase_extractor", KeyphraseExtractor(nfeatures=10000, doclen=100)),
    ('debug', DebugTransformer()),
    ("lstm_classifier", KerasClassifier(
        build_fn=create_lstm_model,
        vocab_size=10000,
        input_length=100,
        num_classes=num_classes,
        epochs=10,
        batch_size=32,
        verbose=1)
    )
], memory=memory)


In [21]:

scores = cross_val_score(pipeline, X, y, cv=4, scoring="accuracy")


>> [1] KeyphraseExtractor.transform: 2575...
>> 	 [0] KeyphraseExtractor.extract_candidate_phrases
>> 	 [1] KeyphraseExtractor.extract_candidate_phrases
>> 	 [2] KeyphraseExtractor.extract_candidate_phrases
>> 	 [3] KeyphraseExtractor.extract_candidate_phrases
>> 	 [4] KeyphraseExtractor.extract_candidate_phrases
>> 	 [5] KeyphraseExtractor.extract_candidate_phrases
>> 	 [6] KeyphraseExtractor.extract_candidate_phrases
>> 	 [7] KeyphraseExtractor.extract_candidate_phrases
>> 	 [8] KeyphraseExtractor.extract_candidate_phrases
>> 	 [9] KeyphraseExtractor.extract_candidate_phrases
>> 	 [10] KeyphraseExtractor.extract_candidate_phrases
>> 	 [11] KeyphraseExtractor.extract_candidate_phrases
>> 	 [12] KeyphraseExtractor.extract_candidate_phrases
>> 	 [13] KeyphraseExtractor.extract_candidate_phrases
>> 	 [14] KeyphraseExtractor.extract_candidate_phrases
>> 	 [15] KeyphraseExtractor.extract_candidate_phrases
>> 	 [16] KeyphraseExtractor.extract_candidate_phrases
>> 	 [17] KeyphraseExtractor.e

KeyboardInterrupt: 

In [52]:
print(f"Cross-validation scores: {scores}")
print(f"Mean accuracy: {np.mean(scores):.2f}")


Cross-validation scores: [0.68 0.84 0.72 0.68]
Mean accuracy: 0.73
