In [1]:
pip install tensorflow

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import pickle
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import tensorflow as tf
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.models import Sequential
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense, Embedding, Conv2D, MaxPool2D
from tensorflow.keras.layers import Reshape, Flatten, Dropout, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer

from preprocessing import (
    preprocessing,
    convert_text_to_vector,
    convert_text_to_label
)
warnings.filterwarnings("ignore")

In [5]:
# Folder holding the cleaned CSV exports. Set the DATA_DIR environment variable
# to point somewhere else; the fallback is the original local directory, so
# existing runs behave exactly as before.
DATA_DIR = Path(os.environ.get('DATA_DIR', '/Users/d.o.npat/Desktop/personal_project'))

df_twitter = pd.read_csv(DATA_DIR / 'cleaned_twitter_data.csv')
df_guardian = pd.read_csv(DATA_DIR / 'cleaned_guardian_df.csv')

# Pull the text and sentiment columns out of each source as plain Python lists.
inputs_twitter = df_twitter['text'].values.tolist()
labels_twitter = df_twitter['sentiment'].values.tolist()

inputs_guardian = df_guardian['text'].values.tolist()
labels_guardian = df_guardian['sentiment'].values.tolist()

# Pool both sources into a single corpus: Twitter rows first, then Guardian rows.
inputs = inputs_twitter + inputs_guardian
labels = labels_twitter + labels_guardian

# Run every document through the project's `preprocessing` helper.
inputs = [preprocessing(text) for text in inputs]

# NOTE(review): `data` is not read by any later cell visible in this notebook
# (the tokenizer cell works on `inputs` directly). It is kept because
# convert_text_to_vector is opaque from here -- confirm before removing.
data = convert_text_to_vector(inputs)
# Replace the raw sentiment values with whatever encoding convert_text_to_label
# produces; presumably integer class ids, given the sparse loss used later -- TODO confirm.
labels = convert_text_to_label(labels)

In [6]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    inputs,
    labels,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Vocabulary cap handed to the tokenizer, and the fixed length every sequence is padded to.
max_words = 10000
max_sequence_length = 100

# Build the vocabulary from the training texts only, then apply it to both splits.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Texts -> lists of integer token ids (train first, then test).
X_train_sequence, X_test_sequence = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Token-id lists -> arrays with a uniform length of max_sequence_length.
X_train_padded, X_test_padded = (
    tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_sequence_length)
    for sequences in (X_train_sequence, X_test_sequence)
)

# Persist the fitted tokenizer to disk so the same vocabulary can be reloaded later.
tokenizer_filename = 'tokenizer.pkl'
with open(tokenizer_filename, 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [10]:
# Hyperparameters read by the model-building and training cells.
drop = 0.5           # rate passed to the Dropout layers
epochs = 80          # number of training epochs passed to fit()
batch_size = 256     # batch size passed to fit()
embedding_dim = 100  # output dimension of the Embedding layers

In [11]:
# LSTM baseline: Embedding -> LSTM(100) -> Dropout -> 3-way softmax.
# Trains on the padded training sequences and reports loss, accuracy and
# weighted F1 / precision / recall on the held-out test split.

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# and batch shuffling start from the same state every time this cell runs
# (some GPU kernels may still be non-deterministic).
tf.keras.utils.set_random_seed(42)

model = Sequential()
# `input_length` is deprecated on Embedding in Keras 3 and is not needed: the
# sequence length comes from the padded input when the model is first called.
model.add(Embedding(max_words, embedding_dim))
model.add(LSTM(100))
model.add(Dropout(drop))
model.add(Dense(3, activation='softmax'))

adam = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, weight_decay=0.0)
# The sparse loss expects integer class ids as targets -- assumes
# convert_text_to_label produced them; TODO confirm.
model.compile(loss='sparse_categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
# 10% of the training data is held out by Keras for per-epoch validation.
history = model.fit(X_train_padded, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)

test_loss, test_accuracy = model.evaluate(X_test_padded, y_test, verbose=1)
print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

# Class probabilities -> predicted class index per test sample.
y_test_pred = model.predict(X_test_padded)
y_test_pred_classes = np.argmax(y_test_pred, axis=1)

# zero_division=0 makes the fallback explicit for classes the model never
# predicts; it yields the same values as the default, which only adds a warning.
f1 = f1_score(y_test, y_test_pred_classes, average='weighted', zero_division=0)
precision = precision_score(y_test, y_test_pred_classes, average='weighted', zero_division=0)
recall = recall_score(y_test, y_test_pred_classes, average='weighted', zero_division=0)
print(f'F1 Score: {f1:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')

Epoch 1/80
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 278ms/step - accuracy: 0.4169 - loss: 1.0935 - val_accuracy: 0.6441 - val_loss: 1.0487
Epoch 2/80
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 157ms/step - accuracy: 0.5947 - loss: 1.0465 - val_accuracy: 0.6271 - val_loss: 0.9303
Epoch 3/80
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 209ms/step - accuracy: 0.5667 - loss: 0.9471 - val_accuracy: 0.6271 - val_loss: 0.8360
Epoch 4/80
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 191ms/step - accuracy: 0.5490 - loss: 0.9232 - val_accuracy: 0.6271 - val_loss: 0.8346
Epoch 5/80
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 230ms/step - accuracy: 0.5975 - loss: 0.8821 - val_accuracy: 0.6271 - val_loss: 0.8327
Epoch 6/80
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 217ms/step - accuracy: 0.5995 - loss: 0.8815 - val_accuracy: 0.6271 - val_loss: 0.8128
Epoch 7/80
[1m3/3[0m [32m━━━━━━━━━━━━

In [12]:
# Text CNN: one shared embedding feeds three parallel Conv2D branches (window
# heights 2, 3 and 5 tokens, each spanning the full embedding width). Every
# branch is max-pooled over all positions, the results are concatenated, and a
# dropout + 3-way softmax head makes the prediction.
# NOTE: this cell rebinds `model`, `adam` and `history`, replacing the LSTM
# objects created by the previous model cell.

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# and batch shuffling start from the same state every time this cell runs
# (some GPU kernels may still be non-deterministic).
tf.keras.utils.set_random_seed(42)

filter_sizes = [2,3,5]
num_filters = 32

# Named `input_layer` rather than `inputs` so the preprocessed text list
# `inputs` used by the train/test split cell is not overwritten.
input_layer = Input(shape=(max_sequence_length,), dtype='int32')
# `input_length` is deprecated on Embedding in Keras 3; the Input layer above
# already fixes the sequence length.
embedding = Embedding(max_words, embedding_dim)(input_layer)
# Add a trailing channel axis so the embedded sequence can be fed to Conv2D.
reshape = Reshape((max_sequence_length,embedding_dim,1))(embedding)

# Each kernel covers `filter_sizes[i]` consecutive tokens across the whole embedding width.
conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embedding_dim), padding='valid', kernel_initializer='normal', activation='elu')(reshape)
conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embedding_dim), padding='valid', kernel_initializer='normal', activation='elu')(reshape)
conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embedding_dim), padding='valid', kernel_initializer='normal', activation='elu')(reshape)

# The pool height equals the conv output height (max_sequence_length - k + 1),
# so each branch keeps a single maximum per filter.
maxpool_0 = MaxPool2D(pool_size=(max_sequence_length - filter_sizes[0] + 1, 1), strides=(1,1), padding='valid')(conv_0)
maxpool_1 = MaxPool2D(pool_size=(max_sequence_length - filter_sizes[1] + 1, 1), strides=(1,1), padding='valid')(conv_1)
maxpool_2 = MaxPool2D(pool_size=(max_sequence_length - filter_sizes[2] + 1, 1), strides=(1,1), padding='valid')(conv_2)

concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(units=3, activation='softmax')(dropout)

model = Model(inputs=input_layer, outputs=output)

model.summary()

adam = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, weight_decay=0.0)

# The sparse loss expects integer class ids as targets -- assumes
# convert_text_to_label produced them; TODO confirm.
model.compile(optimizer=adam, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# 10% of the training data is held out by Keras for per-epoch validation.
history = model.fit(X_train_padded, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)

test_loss, test_accuracy = model.evaluate(X_test_padded, y_test, verbose=1)
print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

# Class probabilities -> predicted class index per test sample.
y_test_pred = model.predict(X_test_padded)
y_test_pred_classes = np.argmax(y_test_pred, axis=1)

# zero_division=0 makes the fallback explicit for classes the model never
# predicts; it yields the same values as the default, which only adds a warning.
f1 = f1_score(y_test, y_test_pred_classes, average='weighted', zero_division=0)
precision = precision_score(y_test, y_test_pred_classes, average='weighted', zero_division=0)
recall = recall_score(y_test, y_test_pred_classes, average='weighted', zero_division=0)
print(f'F1 Score: {f1:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')

Epoch 1/80
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 278ms/step - accuracy: 0.3660 - loss: 1.0942 - val_accuracy: 0.6102 - val_loss: 1.0288
Epoch 2/80
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 156ms/step - accuracy: 0.5648 - loss: 1.0221 - val_accuracy: 0.6610 - val_loss: 0.9716
Epoch 3/80
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 127ms/step - accuracy: 0.6380 - loss: 0.9644 - val_accuracy: 0.6441 - val_loss: 0.9232
Epoch 4/80
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 114ms/step - accuracy: 0.6332 - loss: 0.9196 - val_accuracy: 0.6271 - val_loss: 0.8824
Epoch 5/80
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step - accuracy: 0.6234 - loss: 0.8857 - val_accuracy: 0.6271 - val_loss: 0.8483
Epoch 6/80
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - accuracy: 0.6206 - loss: 0.8421 - val_accuracy: 0.6271 - val_loss: 0.8188
Epoch 7/80
[1m3/3[0m [32m━━━━━━━━━━━━