In [None]:
import numpy as np
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords

In [None]:
import subprocess

# Make the WordNet corpus available from the writable Kaggle directory.
# The lookup is restricted to that directory so an existing copy is reused on
# re-runs instead of being downloaded and unzipped again.
try:
    nltk.data.find('corpora/wordnet', paths=['/kaggle/working/'])
except LookupError:
    nltk.download('wordnet', download_dir='/kaggle/working/')
    # -o overwrites existing files without prompting (a prompt could block a
    # non-interactive kernel); check=True surfaces a failed extraction.
    command = ["unzip", "-o", "/kaggle/working/corpora/wordnet.zip", "-d", "/kaggle/working/corpora"]
    subprocess.run(command, check=True)

# Register the directory on every run, not only right after a download.
if '/kaggle/working/' not in nltk.data.path:
    nltk.data.path.append('/kaggle/working/')

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Load the headline dataset; the file is not UTF-8, hence the explicit encoding.
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/stockheadings/Data.csv',
    encoding='ISO-8859-1',
)

In [None]:
# Preview the first five rows of the raw frame.
data.head(n=5)

In [None]:
# Positional column slices of the raw frame, both kept as DataFrames:
# y is the single column at position 1, x is columns 2 through 26.
y = data.iloc[:, 1:2]
x = data.iloc[:, 2:27]

In [None]:
from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance; `process` below calls lem.lemmatize on every token.
lem=WordNetLemmatizer()

In [None]:
def process(x):
    """Collapse each row of headline columns into one cleaned text string.

    For every row the cell values are joined with spaces, non-letter
    characters are replaced by spaces, and the text is tokenized, lowercased,
    stripped of English stopwords and lemmatized with the module-level `lem`.

    Parameters
    ----------
    x : pandas.DataFrame
        One row per sample, one headline per column. The frame is not
        modified.

    Returns
    -------
    pandas.Series
        Cleaned text per row, named ``'headings'`` and indexed like ``x``.

    Notes
    -----
    Missing cells are skipped rather than rendered as the literal word "nan".
    The stopword test runs on the lowercased token, so capitalised stopwords
    such as "The" are removed as well.
    NOTE(review): relies on the NLTK stopwords and tokenizer resources being
    installed in the environment — confirm.
    """
    # Read the stopword list once; a set makes each membership test O(1).
    stop_words = set(stopwords.words('english'))

    def clean(row):
        # Join only the cells that actually hold a value.
        text = ' '.join(str(value) for value in row if pd.notna(value))
        text = re.sub('[^a-zA-Z]', ' ', text)
        tokens = (token.lower() for token in nltk.word_tokenize(text))
        return ' '.join(lem.lemmatize(token) for token in tokens if token not in stop_words)

    # Build a fresh Series so the caller's frame is left untouched; this also
    # yields an empty Series (not an error) for an empty frame.
    cleaned = [clean(row) for row in x.itertuples(index=False, name=None)]
    return pd.Series(cleaned, index=x.index, name='headings', dtype=object)

In [None]:
# Chronological split: rows before 2015 train the model, rows from 2015 on test it.
# The dates are parsed before comparing. A raw string comparison against
# '20150101' is lexicographic, so its result depends on the date format and the
# two masks can overlap (for example with hyphenated 'YYYY-MM-DD' values).
# NOTE(review): assumes the Date column parses with pd.to_datetime — confirm
# against the CSV.
split_dates = pd.to_datetime(data['Date'])
split_cutoff = pd.Timestamp('2015-01-01')
train = data[split_dates < split_cutoff]
test = data[split_dates >= split_cutoff]

# Copy the slices so later column assignments never write into a view of `data`.
x_train = train.iloc[:, 2:27].copy()
y_train = train.iloc[:, 1:2].copy()

x_test = test.iloc[:, 2:27].copy()
y_test = test.iloc[:, 1:2].copy()

In [None]:
# Replace each headline frame with its cleaned per-row text (train first, then test).
x_train, x_test = process(x_train), process(x_test)

In [None]:
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D, Dropout
from tensorflow.keras.models import Sequential

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Word-level tokenizer capped at 3000 indices; unseen words map to "<oov>".
tokenizer = Tokenizer(
    num_words=3000,
    oov_token="<oov>",
)

In [None]:
# Build the vocabulary from the training text only.
tokenizer.fit_on_texts(texts=x_train)

In [None]:
# Word -> index mapping built by fit_on_texts; read later when sizing the embedding.
vocab=tokenizer.word_index

In [None]:
# Encode the training text as index sequences, then pad/cut each one to 150 at the end.
training_seq = tokenizer.texts_to_sequences(texts=x_train)
padded_training_seq = pad_sequences(
    sequences=training_seq,
    maxlen=150,
    padding='post',
    truncating='post',
)

In [None]:
# Encode the test text with the same tokenizer, then pad/cut each sequence to 150 at the end.
test_seq = tokenizer.texts_to_sequences(texts=x_test)
padded_test_seq = pad_sequences(
    sequences=test_seq,
    maxlen=150,
    padding='post',
    truncating='post',
)

In [None]:
# Averaged-embedding binary classifier over 150-token padded sequences.
# The embedding table is sized to the indices the tokenizer can emit, not to
# the full word_index. A num_words cap limits indices to [0, num_words - 1];
# a vocabulary smaller than the cap needs len(vocab) + 1 rows, because index 0
# is the padding value.
model=Sequential([
    Embedding(input_dim=min(tokenizer.num_words, len(vocab) + 1), output_dim=16, input_length=150),
    GlobalAveragePooling1D(),
    Dense(units=50, activation='relu'),
    Dense(units=25, activation='relu'),
    Dense(units=12, activation='relu'),
    # Single sigmoid unit: output is a probability thresholded downstream.
    Dense(units=1, activation='sigmoid')
])

In [None]:
# Binary cross-entropy objective optimised with Adam at a small learning rate.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
)

In [None]:
# Train for 50 epochs in batches of 16; the test split doubles as validation data.
model.fit(
    x=padded_training_seq,
    y=y_train,
    batch_size=16,
    epochs=50,
    validation_data=(padded_test_seq, y_test),
)

In [None]:
# Per-epoch metrics recorded by the last fit call, one column per metric.
lo = pd.DataFrame(data=model.history.history)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Line chart of every recorded metric against the epoch index.
lo.plot(kind='line')

In [None]:
# Sigmoid outputs for the padded test sequences.
y_pred = model.predict(x=padded_test_seq)

In [None]:
# Turn the predicted values into 0/1 labels: strictly above 0.5 becomes 1.
y_pred = np.greater(y_pred, 0.5).astype(int)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Share of test rows whose thresholded prediction matches the true label.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))