## LSTM

In [None]:
!pip install keras
!pip install tensorflow
!pip install scikeras

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
from datetime import datetime

In [None]:
joined = pd.read_parquet('joined.parquet.gzip')

In [45]:
# analyze text lengths
joined['text_length'] = joined['extracted_text'].apply(lambda x: len(word_tokenize(x)))
print(joined['text_length'].describe())

count     3434.000000
mean      3307.266453
std       2296.389089
min          0.000000
25%       1809.000000
50%       2753.000000
75%       4115.750000
max      26091.000000
Name: text_length, dtype: float64


In [46]:
percentile_95 = joined['text_length'].quantile(0.95)
print(f"95th Percentile of text_length: {percentile_95}")

95th Percentile of text_length: 7619.0


In [133]:
import re
import nltk
from nltk.tokenize import sent_tokenize

joined_test = joined[:100]
#joined_test['extracted_text'] = joined_test['extracted_text'].apply(lambda text: re.sub(r'(?<!\s)([.!?])(?=\D)', r'\1 ', text))
joined_test['extracted_text'] = joined_test['extracted_text'].apply(sent_tokenize)
joined_test['extracted_text'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  joined_test['extracted_text'] = joined_test['extracted_text'].apply(sent_tokenize)


0    [Willem F. Duisenberg, President of the Europe...
1    [Speech by Christian Noyer, Vice-President of ...
2    [Speech by Tommaso Padoa-Schioppa Member of th...
3    [Eugenio Domingo Solans, Member of the Executi...
4    [Speech by Dr. Willem F. Duisenberg, President...
Name: extracted_text, dtype: object

In [134]:
joined_test['extracted_text'][0]

["Willem F. Duisenberg, President of the European Central Bank,Thursday, 4 March 1999Ladies and gentlemen, the Vice-President and I are here today to report on the outcome of today's meetings of the Governing Council and of the General Council of the European Central Bank.Let me start with the Governing Council's discussion onrecent economic developments and the decisions that the Governing Council has taken today in the field of monetary policy.After a comprehensive and careful examination of recent trends and ongoing evaluations of the economic outlook for the euro area economy, there was consensus that some of the risks identified earlier, in particular with regard to real GDP growth, had materialised in the fourth quarter of 1998.",
 'There was also consensus that the impact of these developments on the balance of risks for price stability would need to be examined further in the context of the monetary policy strategy adopted by the Eurosystem.',
 'At this juncture, taking into ac

In [135]:
num_empty = joined_test['extracted_text'].apply(lambda x: len(x) == 0).sum()
print(f"Number of empty items: {num_empty}")

Number of empty items: 0


In [136]:
for doc in joined_test['extracted_text']:
    assert isinstance(doc, list), f"Document is not a list: {type(doc)}"
    for sent in doc:
        assert isinstance(sent, str), f"Sentence is not a string: {type(sent)}"
print("All documents and sentences are properly structured!")

All documents and sentences are properly structured!


In [138]:
import os
import nltk
import gensim
import numpy as np
import unicodedata

from itertools import groupby
from unicodedata import category as unicat

from nltk.corpus import wordnet as wn
from nltk.chunk import tree2conlltags
from nltk.probability import FreqDist
from nltk.chunk.regexp import RegexpParser
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin
from tensorflow.keras.utils import pad_sequences


class KeyphraseExtractor(BaseEstimator, TransformerMixin):
    """
    Extract adverbial and adjective phrases, and transform
    documents into lists of these keyphrases, with a total
    keyphrase lexicon limited by the nfeatures parameter
    and a document length limited/padded to doclen
    """
    def __init__(self, nfeatures=100000, doclen=7619):
        self.grammar = r'KT: {(<RB.> <JJ.*>|<VB.*>|<RB.*>)|(<JJ> <NN.*>)}'
        # self.grammar = r'KT: {(<RB.*> <VB.>|<RB.>|<JJ.> <NN.*>)}'
        # self.grammar = r'KT: {<RB.>|<JJ.>}'
        self.chunker = RegexpParser(self.grammar)
        self.nfeatures = nfeatures
        self.doclen = doclen

    def normalize(self, sent):
        """
        Removes punctuation from a tokenized/tagged sentence and
        lowercases words.
        """
        is_punct = lambda word: all(unicat(c).startswith('P') for c in word)
        sent = filter(lambda t: not is_punct(t[0]), sent)
        sent = map(lambda t: (t[0].lower(), t[1]), sent)
        return list(sent)

    def extract_candidate_phrases(self, sents):
        """
        For a document, parse sentences using our chunker created by
        our grammar, converting the parse tree into a tagged sequence.
        Extract phrases, rejoin with a space, and yield the document
        represented as a list of it's keyphrases.
        """
        for sent in sents:
            tokens = word_tokenize(sent)
            pos_tags = pos_tag(tokens)
            normalized = self.normalize(pos_tags)
            #print(sent)
            #print(normalized)
            
            chunks = tree2conlltags(self.chunker.parse(normalized))
            if not chunks or all(chunk[-1] == 'O' for chunk in chunks):
                #print(f"No valid chunks found in sentence: {sent}")
                continue
            
            phrases = [
                " ".join(word for word, pos, chunk in group).lower()
                for key, group in groupby(
                    chunks, lambda term: term[-1] != 'O'
                ) if key
            ]
            for phrase in phrases:
                yield phrase

    def fit(self, documents, y=None):
        return self

    def get_lexicon(self, keydocs):
        """
        Build a lexicon of size nfeatures
        """
        keyphrases = [keyphrase for doc in keydocs for keyphrase in doc]
        print("Keyphrases:", keyphrases[:5])
        fdist = FreqDist(keyphrases)
        counts = fdist.most_common(self.nfeatures)
        print("Frequency counts:", counts[:5])
        lexicon = [phrase for phrase, count in counts]
        return {phrase: idx+1 for idx, phrase in enumerate(lexicon)}

    def clip(self, keydoc, lexicon):
        """
        Remove keyphrases from documents that aren't in the lexicon
        """
        return [lexicon[keyphrase] for keyphrase in keydoc
            if keyphrase in lexicon.keys()]

    def transform(self, documents):
        docs = [list(self.extract_candidate_phrases(doc)) for doc in documents]
        lexicon = self.get_lexicon(docs)
        clipped = [list(self.clip(doc, lexicon)) for doc in docs]
        return pad_sequences(clipped, maxlen=self.doclen)


In [139]:
import os
import time
import numpy as np

from functools import wraps

from keras.models import load_model, Sequential
from keras.layers import Dense, Dropout, Activation, LSTM, Embedding


N_FEATURES = 10000
DOC_LEN = 7619
N_CLASSES = 3

def build_lstm():
    lstm = Sequential()
    lstm.add(Embedding(input_dim=N_FEATURES + 1, output_dim=128))
    lstm.add(Dropout(0.4))
    lstm.add(LSTM(units=200, recurrent_dropout=0.2, dropout=0.2))
    lstm.add(Dropout(0.2))
    lstm.add(Dense(N_CLASSES, activation='softmax')) # softmax for mult-class classification
    lstm.compile(
        loss='categorical_crossentropy', # target vals are 0, 1 or 2
        optimizer='adam',
        metrics=['accuracy']
    )
    return lstm

In [140]:
import joblib
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import cross_val_score
from tensorflow.keras.utils import to_categorical

# define the target label mapping
label_mapping = {"No Change": 0, "Increase": 1, "Decrease": 2}

def documents(df):
    texts = df['extracted_text'].values
    return texts

def binarize(df):
    labels = to_categorical(df['Direction'].map(label_mapping).values, num_classes=N_CLASSES)
    return labels

def train_model(df, model, saveto=None, cv=12, **kwargs):
    """
    Trains model from corpus at specified path; fitting the model on the full data and writing it to disk at the saveto directory if specified.
    Returns the scores.
    """
    # Load the corpus data and labels for classification
    X = documents(df)
    print("Shape after tokenization:", len(X))
    
    y = binarize(df)

    # Compute cross validation scores
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')

    # Fit the model on entire data set
    model.fit(X, y)

    # Write to disk if specified
    if saveto:
        # have to save the keras part using keras' save method
        model.steps[-1][1].model.save(saveto['keras_model'])
        model.steps.pop(-1)
        # ... and use joblib to save the rest of the pipeline
        joblib.dump(model, saveto['sklearn_pipe'])

    # Return scores as well as training time via decorator
    return scores

In [141]:
from sklearn.model_selection import train_test_split

X = joined['extracted_text']
y = joined['Direction']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [142]:
class DebugTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        print("Data Shape Before LSTM:", X.shape)
        print("Sample Data Before LSTM:", X[0])  # Print a sample
        return X

In [143]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from scikeras.wrappers import KerasClassifier

pipeline = Pipeline([
    ('keyphrases', KeyphraseExtractor(nfeatures=N_FEATURES,
                                      doclen=DOC_LEN)),
    ('debug', DebugTransformer()),
    ('lstm', KerasClassifier(model=build_lstm,
                           epochs=4,
                           batch_size=128))
])

scores, delta = train_model(joined_test, pipeline, cv=4)

print('Mean score: {}'.format(np.mean(scores)))
print('Total fit time: {:0.2f} seconds'.format(delta))

Shape after tokenization: 100
Keyphrases: ['hosted', 'young student', 'read', 'was', 'first book written']
Frequency counts: [('is', 2020), ('euro area', 950), ('monetary policy', 867), ('be', 731), ('have', 482)]
Data Shape Before LSTM: (75, 7619)
Sample Data Before LSTM: [   0    0    0 ...   19  361 2126]
Keyphrases: ['european central', 'are here', 'report', 'european central', 'start']
Frequency counts: [('is', 2035), ('euro area', 930), ('monetary policy', 834), ('be', 780), ('have', 540)]
Data Shape Before LSTM: (75, 7619)
Sample Data Before LSTM: [   0    0    0 ... 1034 1034 1035]
Keyphrases: ['european central', 'are here', 'report', 'european central', 'start']
Frequency counts: [('is', 1959), ('euro area', 969), ('monetary policy', 860), ('be', 704), ('have', 518)]
Data Shape Before LSTM: (75, 7619)
Sample Data Before LSTM: [   0    0    0 ...  768  768 1225]
Keyphrases: ['european central', 'are here', 'report', 'european central', 'start']
Frequency counts: [('is', 2107),

ValueError: 
All the 4 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/opt/miniconda3/lib/python3.12/site-packages/scikeras/wrappers.py", line 1501, in fit
    super().fit(X=X, y=y, sample_weight=sample_weight, **kwargs)
  File "/opt/miniconda3/lib/python3.12/site-packages/scikeras/wrappers.py", line 770, in fit
    self._fit(
  File "/opt/miniconda3/lib/python3.12/site-packages/scikeras/wrappers.py", line 936, in _fit
    self._check_model_compatibility(y)
  File "/opt/miniconda3/lib/python3.12/site-packages/scikeras/wrappers.py", line 559, in _check_model_compatibility
    if self.n_outputs_expected_ != len(self.model_.outputs):
                                       ^^^^^^^^^^^^^^^^^^^
  File "/opt/miniconda3/lib/python3.12/site-packages/keras/src/models/sequential.py", line 300, in outputs
    raise AttributeError(
AttributeError: Sequential model 'sequential_40' has no defined outputs yet.

--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/opt/miniconda3/lib/python3.12/site-packages/scikeras/wrappers.py", line 1501, in fit
    super().fit(X=X, y=y, sample_weight=sample_weight, **kwargs)
  File "/opt/miniconda3/lib/python3.12/site-packages/scikeras/wrappers.py", line 770, in fit
    self._fit(
  File "/opt/miniconda3/lib/python3.12/site-packages/scikeras/wrappers.py", line 936, in _fit
    self._check_model_compatibility(y)
  File "/opt/miniconda3/lib/python3.12/site-packages/scikeras/wrappers.py", line 559, in _check_model_compatibility
    if self.n_outputs_expected_ != len(self.model_.outputs):
                                       ^^^^^^^^^^^^^^^^^^^
  File "/opt/miniconda3/lib/python3.12/site-packages/keras/src/models/sequential.py", line 300, in outputs
    raise AttributeError(
AttributeError: Sequential model 'sequential_41' has no defined outputs yet.

--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/opt/miniconda3/lib/python3.12/site-packages/scikeras/wrappers.py", line 1501, in fit
    super().fit(X=X, y=y, sample_weight=sample_weight, **kwargs)
  File "/opt/miniconda3/lib/python3.12/site-packages/scikeras/wrappers.py", line 770, in fit
    self._fit(
  File "/opt/miniconda3/lib/python3.12/site-packages/scikeras/wrappers.py", line 936, in _fit
    self._check_model_compatibility(y)
  File "/opt/miniconda3/lib/python3.12/site-packages/scikeras/wrappers.py", line 559, in _check_model_compatibility
    if self.n_outputs_expected_ != len(self.model_.outputs):
                                       ^^^^^^^^^^^^^^^^^^^
  File "/opt/miniconda3/lib/python3.12/site-packages/keras/src/models/sequential.py", line 300, in outputs
    raise AttributeError(
AttributeError: Sequential model 'sequential_42' has no defined outputs yet.

--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/opt/miniconda3/lib/python3.12/site-packages/scikeras/wrappers.py", line 1501, in fit
    super().fit(X=X, y=y, sample_weight=sample_weight, **kwargs)
  File "/opt/miniconda3/lib/python3.12/site-packages/scikeras/wrappers.py", line 770, in fit
    self._fit(
  File "/opt/miniconda3/lib/python3.12/site-packages/scikeras/wrappers.py", line 936, in _fit
    self._check_model_compatibility(y)
  File "/opt/miniconda3/lib/python3.12/site-packages/scikeras/wrappers.py", line 559, in _check_model_compatibility
    if self.n_outputs_expected_ != len(self.model_.outputs):
                                       ^^^^^^^^^^^^^^^^^^^
  File "/opt/miniconda3/lib/python3.12/site-packages/keras/src/models/sequential.py", line 300, in outputs
    raise AttributeError(
AttributeError: Sequential model 'sequential_43' has no defined outputs yet.
