In [4]:
# Install TensorFlow for the interpreter that backs this kernel (%pip targets the
# kernel's environment, whereas !pip runs whichever pip is first on PATH).
# The version is pinned to the release the captured install log resolved, so a fresh
# run pulls the same stack.
%pip install --user tensorflow==2.16.1

Collecting tensorflow




  Using cached tensorflow-2.16.1-cp39-cp39-win_amd64.whl (2.1 kB)
Collecting tensorflow-intel==2.16.1
  Using cached tensorflow_intel-2.16.1-cp39-cp39-win_amd64.whl (376.9 MB)
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3
  Using cached protobuf-4.25.3-cp39-cp39-win_amd64.whl (413 kB)
Collecting grpcio<2.0,>=1.24.3
  Using cached grpcio-1.62.1-cp39-cp39-win_amd64.whl (3.8 MB)
Collecting keras>=3.0.0
  Using cached keras-3.1.1-py3-none-any.whl (1.1 MB)
Collecting astunparse>=1.6.0
  Using cached astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting h5py>=3.10.0
  Using cached h5py-3.10.0-cp39-cp39-win_amd64.whl (2.7 MB)
Collecting tensorboard<2.17,>=2.16
  Using cached tensorboard-2.16.2-py3-none-any.whl (5.5 MB)
Collecting google-pasta>=0.1.1
  Using cached google_pasta-0.2.0-py3-none-any.whl (57 kB)
Collecting termcolor>=1.1.0
  Using cached termcolor-2.4.0-py3-none-any.whl (7.7 kB)
Collecting tensorflow-io-gcs-filesystem>=0.23.1
  Usi

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding



In [2]:
# Load the data
# Both CSV files sit in the same folder; keeping that folder in one constant lets the
# notebook be pointed at another copy of the data by editing a single line.
DATA_DIR = "C:/Users/user/OneDrive/Desktop"
# Both files are read with the same encoding.
CSV_ENCODING = 'latin1'

train_data = pd.read_csv(f"{DATA_DIR}/train.csv", encoding=CSV_ENCODING)
test_data = pd.read_csv(f"{DATA_DIR}/test.csv", encoding=CSV_ENCODING)

In [3]:
# Preprocessing
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus, then hold the English list as a set so that
# remove_stop_words can test membership directly.
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}

# One tokenizer per text column, both configured with the same '<OOV>' token.
tokenizer_article, tokenizer_event = (Tokenizer(oov_token='<OOV>') for _ in range(2))

# Remove stop words
def remove_stop_words(text, stopword_set=None):
    """Return ``text`` with stop words removed.

    The text is split on whitespace, every token whose lower-cased form is in the
    stop-word set is dropped, and the survivors are re-joined with single spaces.
    Surviving tokens keep their original casing and attached punctuation.

    Args:
        text: The string to clean. Any non-string value (for example the NaN that
            pandas uses for a missing cell, or None) is treated as missing.
        stopword_set: Optional collection of lower-case stop words. When omitted,
            the notebook-level ``stop_words`` set is used.

    Returns:
        The cleaned string, or '' when ``text`` is not a string.
    """
    # A missing cell has no .split(); return an empty document instead of raising so
    # the function is safe to use with Series.apply on columns that contain gaps.
    if not isinstance(text, str):
        return ''
    if stopword_set is None:
        stopword_set = stop_words
    kept = [word for word in text.split() if word.lower() not in stopword_set]
    return ' '.join(kept)

# Fill missing values with an empty string before cleaning: remove_stop_words calls
# .split() on each value, which a NaN cell does not support. The article columns get
# the same treatment as event_description.
train_data['article'] = train_data['article'].fillna('').apply(remove_stop_words)
train_data['event_description'] = train_data['event_description'].fillna('').apply(remove_stop_words)
test_data['article'] = test_data['article'].fillna('').apply(remove_stop_words)

# The vocabularies are learned from the training split only.
tokenizer_article.fit_on_texts(train_data['article'])
tokenizer_event.fit_on_texts(train_data['event_description'])

# word_index ids start at 1; the extra slot accounts for id 0.
vocab_size_article = len(tokenizer_article.word_index) + 1
vocab_size_event = len(tokenizer_event.word_index) + 1

# Measure the padded lengths on the tokenizer's own output. Counting str.split()
# pieces under-counts, because the tokenizer also breaks tokens at punctuation, and a
# too-small maxlen makes pad_sequences silently truncate the longest samples.
max_article_length = max(
    len(token_ids) for token_ids in tokenizer_article.texts_to_sequences(train_data['article'])
)
max_event_length = max(
    len(token_ids) for token_ids in tokenizer_event.texts_to_sequences(train_data['event_description'])
)

# Print both fitted vocabularies (word -> integer id) for a visual check:
# articles first, then event descriptions.
for heading, fitted_tokenizer in (
    ("Word indices for articles:", tokenizer_article),
    ("\nWord indices for event descriptions:", tokenizer_event),
):
    print(heading)
    print(fitted_tokenizer.word_index)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Word indices for articles:

Word indices for event descriptions:
{'<OOV>': 1, 'market': 2, 'cisco': 3, 'growth': 4, 'ai': 5, 'systems': 6, '2023': 7, 'year': 8, 'billion': 9, 'price': 10, 'inc': 11, 'company': 12, 'stock': 13, 'cloud': 14, '0': 15, '1': 16, 'revenue': 17, '2024': 18, 'global': 19, 'average': 20, 'security': 21, 'industry': 22, 'based': 23, 'services': 24, 'period': 25, '5': 26, 'data': 27, 'companies': 28, '4': 29, 'nvidia': 30, 'forecast': 31, 'new': 32, 'software': 33, 'technology': 34, 'options': 35, '3': 36, 'largest': 37, 'business': 38, 'last': 39, 'expected': 40, 'announced': 41, 'solutions': 42, 'dow': 43, '7': 44, 'csco': 45, 'digital': 46, 's': 47, 'earnings': 48, 'p': 49, 'cagr': 50, 'infrastructure': 51, 'trading': 52, '8': 53, 'service': 54, '2': 55, 'america': 56, 'ratio': 57, 'sales': 58, 'interest': 59, 'total': 60, 'management': 61, 'higher': 62, 'quarter': 63, 'potential': 64, 'years': 65, '2022': 66, 'networking': 67, '9': 68, '6': 69, 'million': 70,

In [4]:
# Sanity check: show which columns the test frame provides.
print(test_data.columns)


Index(['article', 'event_description'], dtype='object')


In [5]:
#Prepare data for training
def prepare_data(data, tokenizer_article, tokenizer_event, max_article_length, max_event_length):
    """Convert articles and event descriptions into padded seq2seq arrays.

    Args:
        data: Table with 'article' and 'event_description' text columns.
        tokenizer_article: Fitted tokenizer used to encode the 'article' column.
        tokenizer_event: Fitted tokenizer used to encode the 'event_description'
            column; its ``word_index`` also fixes the width of the one-hot targets.
        max_article_length: Length every article sequence is padded to.
        max_event_length: Length every event sequence is padded to.

    Returns:
        A tuple ``(encoder_input_data, decoder_input_data, decoder_target_data)``:
        the padded article ids, the padded event ids, and a float32 array of shape
        ``(len(data), max_event_length, len(tokenizer_event.word_index) + 1)`` that
        one-hot encodes the decoder input shifted one step to the left.
    """
    encoder_input_data = pad_sequences(
        tokenizer_article.texts_to_sequences(data['article']),
        maxlen=max_article_length,
        padding='post',
    )
    decoder_input_data = pad_sequences(
        tokenizer_event.texts_to_sequences(data['event_description']),
        maxlen=max_event_length,
        padding='post',
    )

    # Size the one-hot axis from the tokenizer that was actually passed in, rather
    # than from a notebook-level variable that may be stale or not defined yet.
    target_vocab_size = len(tokenizer_event.word_index) + 1
    decoder_target_data = np.zeros((len(data), max_event_length, target_vocab_size), dtype='float32')

    # Target step t is the decoder-input token at step t + 1, so the final step of
    # every sample stays all-zero. Padding ids (0) are one-hot encoded like any other.
    next_token_ids = decoder_input_data[:, 1:]
    sample_idx = np.arange(next_token_ids.shape[0])[:, None]
    step_idx = np.arange(next_token_ids.shape[1])[None, :]
    decoder_target_data[sample_idx, step_idx, next_token_ids] = 1.0

    return encoder_input_data, decoder_input_data, decoder_target_data

# Encode both splits with the same tokenizers and the same padded lengths.
shared_prep_args = (tokenizer_article, tokenizer_event, max_article_length, max_event_length)
train_arrays = prepare_data(train_data, *shared_prep_args)
test_arrays = prepare_data(test_data, *shared_prep_args)
encoder_input_train, decoder_input_train, decoder_target_train = train_arrays
encoder_input_test, decoder_input_test, decoder_target_test = test_arrays


In [6]:
# Define the model
# One width shared by the embeddings and by both LSTMs.
latent_dim = 256

# Encoder: embed the article ids and keep the LSTM's final hidden and cell states.
encoder_inputs = Input(shape=(None,))
encoder_embedding_layer = Embedding(input_dim=vocab_size_article, output_dim=latent_dim, mask_zero=True)
encoder_embedding = encoder_embedding_layer(encoder_inputs)
encoder_lstm = LSTM(units=latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: embed the event ids, start the LSTM from the encoder states, and map
# every step's output to a softmax over the event vocabulary.
decoder_inputs = Input(shape=(None,))
decoder_embedding_layer = Embedding(input_dim=vocab_size_event, output_dim=latent_dim, mask_zero=True)
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_lstm = LSTM(units=latent_dim, return_sequences=True, return_state=True)
decoder_hidden_seq, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(units=vocab_size_event, activation='softmax')
decoder_outputs = decoder_dense(decoder_hidden_seq)

In [7]:
# Model
# Connect the two inputs to the decoder's per-step softmax output.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

# Compile model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train the model, holding out 20% of the training samples for validation.
model.fit(
    x=[encoder_input_train, decoder_input_train],
    y=decoder_target_train,
    batch_size=64,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 9s/step - accuracy: 2.8716e-04 - loss: 8.0283 - val_accuracy: 0.0022 - val_loss: 8.0274
Epoch 2/5
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 7s/step - accuracy: 0.0051 - loss: 8.0247 - val_accuracy: 0.0078 - val_loss: 8.0251
Epoch 3/5
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 8s/step - accuracy: 0.0120 - loss: 8.0210 - val_accuracy: 0.0072 - val_loss: 8.0216
Epoch 4/5
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 9s/step - accuracy: 0.0097 - loss: 8.0130 - val_accuracy: 0.0056 - val_loss: 8.0095
Epoch 5/5
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 9s/step - accuracy: 0.0066 - loss: 7.9863 - val_accuracy: 0.0050 - val_loss: 7.9163


<keras.src.callbacks.history.History at 0x1d42646bb50>

In [8]:
# Generate predictions on the test data
# NOTE(review): decoder_input_test is built by prepare_data from
# test_data['event_description'], so the decoder is fed the reference descriptions
# while predicting. Confirm this teacher-forced setup is what the evaluation intends.
decoder_target_pred = model.predict([encoder_input_test, decoder_input_test])

# Convert predictions to text
def sequences_to_texts(sequences, tokenizer):
    """Decode each id sequence into a text string using ``tokenizer``.

    Every sequence is decoded on its own through ``tokenizer.sequences_to_texts``;
    the decoded strings are returned as a list in the order of ``sequences``.
    """
    return [tokenizer.sequences_to_texts([id_sequence])[0] for id_sequence in sequences]

# Take the highest-scoring token id at every decoder step and decode it to words.
decoder_target_pred_texts = sequences_to_texts(np.argmax(decoder_target_pred, axis=2), tokenizer_event)

# Analyze predicted words and extract sentences containing them
predicted_sentences = []
for article, text in zip(test_data['article'], decoder_target_pred_texts):
    predicted_tokens = text.split()
    if not predicted_tokens:
        # Nothing was decoded for this article, so there is no word to search for.
        predicted_sentences.append([])
        continue

    # Only the first predicted word is used. The tokenizer's vocabulary is lower-case
    # while the article keeps its original casing, so compare on lower-cased text;
    # otherwise capitalised occurrences are never found.
    predicted_word = predicted_tokens[0].lower()

    # Sentences are approximated by splitting on '.'.
    # NOTE(review): this is a substring test, so a short token also matches inside
    # longer words.
    relevant_sentences = [
        sentence.strip()
        for sentence in article.split('.')
        if predicted_word in sentence.lower()
    ]
    predicted_sentences.append(relevant_sentences)

# Print predicted sentences
# Report, for each test article: the article text, the sentences collected in
# predicted_sentences, and the reference event description.
print("Predicted Sentences:")
test_articles = test_data['article']
test_events = test_data['event_description']
for row, sentences in enumerate(predicted_sentences):
    print("Article:", test_articles.iloc[row])
    print("Predicted Sentences:")
    if sentences:
        # One sentence per line; nothing is printed when no sentence matched.
        print("\n".join(sentences))
    print("Actual Event Description:", test_events.iloc[row], sep="\n")
    print("\n")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Predicted Sentences:
Article: Pioneering tech expert former Apple Inc. executive Saori Casey reportedly set assume role chief financial officer smart speaker manufacturer Sonos Inc. SONO, marking another significant executive departure Apple. Happened: Casey succeed Eddie Lazarus CFO Jan. 22. Meanwhile, Lazarus shift gears become chief strategy officer continuing role chief legal officer, according statement made developer manufacturer audio products. Casey impressive track record Apple, helmed vice president finance position decade. managed financial planning, forecasting, investor relations Apple, reported Bloomberg. Casey's transition comes Sonos venturing new markets headphones, following slowdown primary smart audio equipment sector. Matters: Casey's move Sonos latest series executive departures Apple. Tang Tan, Apple iPhone design executive, set leave tech giant February join groundbreaking AI project. Previous