In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding




In [2]:
# Load the data
train_data = pd.read_csv('C:/Users/crist/Desktop/IS450/train.csv', encoding = 'latin1')
test_data = pd.read_csv('C:/Users/crist/Desktop/IS450/test.csv', encoding = 'latin1')

In [6]:
# Preprocessing
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

tokenizer_article = Tokenizer(oov_token='<OOV>')
tokenizer_event = Tokenizer(oov_token='<OOV>')

# Remove stop words
def remove_stop_words(text):
    tokens = text.split()
    tokens = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(tokens)

train_data['article'] = train_data['article'].apply(remove_stop_words)
# Fill missing values with an empty string
train_data['event_description'] = train_data['event_description'].fillna('')

# Apply the remove_stop_words function
train_data['event_description'] = train_data['event_description'].apply(remove_stop_words)

#train_data['event_description'] = train_data['event_description'].apply(remove_stop_words)
test_data['article'] = test_data['article'].apply(remove_stop_words)
#test_data['event_description'] = test_data['event_description'].apply(remove_stop_words)

tokenizer_article.fit_on_texts(train_data['article'])
tokenizer_event.fit_on_texts(train_data['event_description'])

vocab_size_article = len(tokenizer_article.word_index) + 1
vocab_size_event = len(tokenizer_event.word_index) + 1

max_article_length = max([len(article.split()) for article in train_data['article']])
max_event_length = max([len(event.split()) for event in train_data['event_description']])

# Check tokenized word indices for articles
print("Word indices for articles:")
print(tokenizer_article.word_index)

# Check tokenized word indices for event descriptions
print("\nWord indices for event descriptions:")
print(tokenizer_event.word_index)

Word indices for articles:

Word indices for event descriptions:
{'<OOV>': 1, 'market': 2, 'cisco': 3, 'growth': 4, 'ai': 5, 'systems': 6, '2023': 7, 'year': 8, 'billion': 9, 'price': 10, 'inc': 11, 'company': 12, 'stock': 13, 'cloud': 14, '0': 15, '1': 16, 'revenue': 17, '2024': 18, 'global': 19, 'average': 20, 'security': 21, 'industry': 22, 'based': 23, 'services': 24, 'period': 25, '5': 26, 'data': 27, 'companies': 28, '4': 29, 'nvidia': 30, 'forecast': 31, 'new': 32, 'software': 33, 'technology': 34, 'options': 35, '3': 36, 'largest': 37, 'business': 38, 'last': 39, 'expected': 40, 'announced': 41, 'solutions': 42, 'dow': 43, '7': 44, 'csco': 45, 'digital': 46, 's': 47, 'earnings': 48, 'p': 49, 'cagr': 50, 'infrastructure': 51, 'trading': 52, '8': 53, 'service': 54, '2': 55, 'america': 56, 'ratio': 57, 'sales': 58, 'interest': 59, 'total': 60, 'management': 61, 'higher': 62, 'quarter': 63, 'potential': 64, 'years': 65, '2022': 66, 'networking': 67, '9': 68, '6': 69, 'million': 70,

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\crist\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
print(test_data.columns)


Index(['article', 'event_description'], dtype='object')


In [8]:
#Prepare data for training
def prepare_data(data, tokenizer_article, tokenizer_event, max_article_length, max_event_length):
    encoder_input_data = tokenizer_article.texts_to_sequences(data['article'])
    decoder_input_data = tokenizer_event.texts_to_sequences(data['event_description'])

    encoder_input_data = pad_sequences(encoder_input_data, maxlen=max_article_length, padding='post')
    decoder_input_data = pad_sequences(decoder_input_data, maxlen=max_event_length, padding='post')

    decoder_target_data = np.zeros((len(data), max_event_length, vocab_size_event), dtype='float32')
    for i, seq in enumerate(decoder_input_data):
        for j, token_id in enumerate(seq):
            if j > 0:
                decoder_target_data[i][j-1][token_id] = 1.0

    return encoder_input_data, decoder_input_data, decoder_target_data

encoder_input_train, decoder_input_train, decoder_target_train = prepare_data(train_data, tokenizer_article, tokenizer_event, max_article_length, max_event_length)
encoder_input_test, decoder_input_test, decoder_target_test = prepare_data(test_data, tokenizer_article, tokenizer_event, max_article_length, max_event_length)


In [9]:
# Define the model
latent_dim = 256

# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(vocab_size_article, latent_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(vocab_size_event, latent_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_event, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)




In [10]:
# Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit([encoder_input_train, decoder_input_train], decoder_target_train, batch_size=64, epochs=5, validation_split=0.2)


Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x2c17fc16ce0>

In [None]:
# Evaluate the model
score = model.evaluate([encoder_input_test, decoder_input_test], decoder_target_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

In [13]:
# Generate predictions on the test data
decoder_target_pred = model.predict([encoder_input_test, decoder_input_test])

# Convert predictions to text
def sequences_to_texts(sequences, tokenizer):
    texts = []
    for sequence in sequences:
        text = tokenizer.sequences_to_texts([sequence])[0]
        texts.append(text)
    return texts

decoder_target_pred_texts = sequences_to_texts(np.argmax(decoder_target_pred, axis=2), tokenizer_event)


#def preprocess_text1(text):
    # Convert text to lowercase
    #text = text.lower()
    # Remove punctuation
    # text = ''.join([char for char in text if char not in string.punctuation])
    # Remove stop words
    #text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    #return text

#for i in range(len(encoder_input_test)):
    #print("Article:", test_data['article'].iloc[i])
    #print("Predicted Event Description:", decoder_target_pred_texts[i])
    #print("Actual Event Description (without tokenization and stop words removed):")
    #print(preprocess_text1(test_data['event_description'].iloc[i]))
    #print("\n")

# Analyze predicted words and extract sentences containing them
predicted_sentences = []
for i, text in enumerate(decoder_target_pred_texts):
    predicted_word = text.split()[0]  # Extract the first word from the predicted text
    article_sentences = test_data['article'].iloc[i].split('.')  # Split the article into sentences
    # Find sentences containing the predicted word
    relevant_sentences = [sentence.strip() for sentence in article_sentences if predicted_word in sentence]
    predicted_sentences.append(relevant_sentences)

# Print predicted sentences
print("Predicted Sentences:")
for i, sentences in enumerate(predicted_sentences):
    print("Article:", test_data['article'].iloc[i])
    print("Predicted Sentences:")
    for sentence in sentences:
        print(sentence)
    print("Actual Event Description:")
    print(test_data['event_description'].iloc[i])
    print("\n")


Predicted Sentences:
Article: Pioneering tech expert former Apple Inc. executive Saori Casey reportedly set assume role chief financial officer smart speaker manufacturer Sonos Inc. SONO, marking another significant executive departure Apple. Happened: Casey succeed Eddie Lazarus CFO Jan. 22. Meanwhile, Lazarus shift gears become chief strategy officer continuing role chief legal officer, according statement made developer manufacturer audio products. Casey impressive track record Apple, helmed vice president finance position decade. managed financial planning, forecasting, investor relations Apple, reported Bloomberg. Casey's transition comes Sonos venturing new markets headphones, following slowdown primary smart audio equipment sector. Matters: Casey's move Sonos latest series executive departures Apple. Tang Tan, Apple iPhone design executive, set leave tech giant February join groundbreaking AI project. Previously reported Tan collaborate Jony Ive, founder design studio LoveFrom, 