In [None]:
import nltk                         
import pandas as pd
import numpy as np
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from sklearn.metrics import confusion_matrix, classification_report
import io
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Sentiment class name -> integer class id used as the training target.
SENTIMENT_TO_ID = {"Extremely Positive":0, "Positive": 1,"Neutral": 2,"Negative": 3,"Extremely Negative": 4}


def load_split(csv_path):
    """
    Reads one tokenized-tweet CSV and drops the rows that have no tokens
    Args:
        csv_path (str): path of a CSV file with 'TweetTokens' and 'Sentiment' columns
    Returns:
        frame (DataFrame): the rows whose 'TweetTokens' value is not null
        labels (array): integer class id of every kept row, in the same order as frame
    """
    frame = pd.read_csv(csv_path, encoding='latin-1')
    # Filter first so that the labels are derived from exactly the rows that are kept.
    frame = frame[frame['TweetTokens'].notnull()]
    # Sentiment values that are missing from the mapping become NaN.
    labels = frame["Sentiment"].map(SENTIMENT_TO_ID).values
    return frame, labels


train, y_train = load_split('/content/drive/MyDrive/train_token.csv')
test, y_test = load_split('/content/drive/MyDrive/test_token.csv')

In [None]:
def parse_data(data):
  """
  Splits the token string of every row into words and collects the vocabulary
  Args:
      data (DataFrame): frame with a string column 'TweetTokens'
  Returns:
      sentence (list of list of str): the words of each row, in row order
      vocabulary (set of str): the distinct words found over all rows
  """
  sentence = []
  vocabulary = set()

  # Raw string: a backslash-w inside a plain literal is an invalid escape sequence.
  non_word = re.compile(r"[^\w]")

  # Walk the column directly instead of building a full row object per index.
  for tokens in data['TweetTokens']:
    list_of_words = non_word.sub(" ", tokens).split()
    sentence.append(list_of_words)
    # In-place update; rebuilding the set with union() on every row is quadratic.
    vocabulary.update(list_of_words)

  return sentence, vocabulary

In [None]:
def fit_tokenizer(sentences, num_words=10000, oov_token='<OOV>'):
    """
    Instantiates the Tokenizer class and fits it on the given sentences
    Args:
        sentences (list): texts to fit on; this notebook passes lists of word tokens
        num_words (int): cap on the token ids the tokenizer will emit (default 10000)
        oov_token (str): placeholder token used for words outside the fitted vocabulary
    Returns:
        tokenizer (object): an instance of the Tokenizer class containing the word-index dictionary
    """
    tokenizer = Tokenizer(oov_token=oov_token, num_words=num_words)
    tokenizer.fit_on_texts(sentences)

    return tokenizer

In [None]:
def get_padded_sequences(tokenizer, sentences, maxlen=39, padding='post'):
    """
    Generates an array of token sequences and pads them to the same length
    Args:
        tokenizer (object): fitted Tokenizer instance containing the word-index dictionary
        sentences (list): sentences to tokenize and pad
        maxlen (int): length of every returned sequence (default 39)
        padding (str): 'post' to pad at the end of a sequence, 'pre' to pad at the start
    Returns:
        padded_sequences (array of int): tokenized sentences padded to the same length
    """

    sequences = tokenizer.texts_to_sequences(sentences)
    # Sequences longer than maxlen are cut by pad_sequences (its `truncating` default applies).
    padded_sequences = pad_sequences(sequences, padding=padding, maxlen=maxlen)

    return padded_sequences

In [None]:
# Split every training tweet into its words and collect the distinct training words.
sentence_train, vocabulary_train = parse_data(train)

In [None]:
# Fit the tokenizer on the training sentences only; the same instance is reused for the test split.
tokenizer = fit_tokenizer(sentence_train)
# word -> integer id dictionary built while fitting.
word_index = tokenizer.word_index

In [None]:
# Integer-encode the training sentences and pad them to a common length.
padded_sequence_train = get_padded_sequences(tokenizer, sentence_train)

In [None]:
# Sanity check: raw token string of the sixth training row.
train['TweetTokens'].iloc[5]

"['cashier', 'groceri', 'store', 'share', 'insight', 'covid', 'prove', 'credibl', 'comment', 'civic', 'class', 'know', 'talk']"

In [None]:
# Sanity check: the padded integer sequence produced for that same row.
padded_sequence_train[5]

array([ 548,    7,    4,  157,  506,    2, 1404, 4581, 1248, 5959,  959,
         65,  320,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0], dtype=int32)

In [None]:
# Same preprocessing for the test split, encoded with the tokenizer fitted on the training data.
sentence_test, vocabulary_test = parse_data(test)
padded_sequence_test = get_padded_sequences(tokenizer, sentence_test)

In [None]:
def y_one_hot(y, num_classes=None):
  """
  One-hot encodes a vector of class labels
  Args:
      y (array): class label of every sample; labels are expected to be non-missing
      num_classes (int, optional): when given, the columns are the classes
          0 .. num_classes - 1, so every split gets the same width even if a class
          is absent from it. When None, the columns are the sorted distinct labels
          found in y.
  Returns:
      y_onehot (array of float): matrix with one row per sample and one column per class
  Raises:
      ValueError: if num_classes is given and y holds a label outside 0 .. num_classes - 1
  """
  labels = np.asarray(y).reshape(-1)

  if num_classes is None:
    categories = np.unique(labels)
  else:
    categories = np.arange(num_classes)
    unknown = ~np.isin(labels, categories)
    if unknown.any():
      raise ValueError(
          "labels outside 0..%d: %s" % (num_classes - 1, np.unique(labels[unknown]).tolist()))

  # Compare every label against every category; implemented with NumPy so the result
  # does not depend on the OneHotEncoder `sparse` / `sparse_output` keyword rename.
  y_onehot = (labels[:, None] == categories[None, :]).astype(np.float64)

  return y_onehot

In [None]:
# Each split is encoded by its own call, so both splits must contain every class
# for the two matrices to have the same column layout.
y_train_onehot, y_test_onehot = map(y_one_hot, (y_train, y_test))

In [None]:
# Early stopping: watch validation accuracy (higher is better), allow 3 epochs without
# improvement, then restore the weights of the best epoch.
es = EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    patience=3,
    restore_best_weights=True,
)

In [None]:
# Parameters
# The Embedding table has to cover every id the tokenizer can emit: 0 is the padding
# value, and ids run up to len(word_index), or only up to num_words - 1 when the
# tokenizer has a num_words cap. len(vocabulary_train) is not that range: it ignores
# the padding id, the OOV id and the cap.
vocab_size = len(tokenizer.word_index) + 1
if tokenizer.num_words:
    vocab_size = min(vocab_size, tokenizer.num_words)
max_length = 39
embedding_dim = 100
oov_tok = "<OOV>"

# Build the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),

    # Global Average Pooling
    #tf.keras.layers.GlobalAveragePooling1D(),

    # Conv1D
    #tf.keras.layers.Conv1D(128,5),
    #tf.keras.layers.Flatten(),

    # LSTM
    #tf.keras.layers.LSTM(128),

    # Bidirectional LSTM
    #tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)),

    # GRU
    #tf.keras.layers.GRU(32),

    # Bidirectional GRU (the encoder currently in use)
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(128)),

    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(6, activation='relu'),
    # One output unit per sentiment class.
    tf.keras.layers.Dense(5, activation='softmax')
])

# Setup the training parameters
# categorical_crossentropy matches the one-hot encoded targets.
model.compile(loss='categorical_crossentropy',optimizer='Adam',metrics=['accuracy'])

# Print the model summary
model.summary()

Model: "sequential_30"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_34 (Embedding)    (None, 39, 100)           3216100   
                                                                 
 bidirectional_5 (Bidirectio  (None, 256)              176640    
 nal)                                                            
                                                                 
 dropout_29 (Dropout)        (None, 256)               0         
                                                                 
 dense_55 (Dense)            (None, 6)                 1542      
                                                                 
 dense_56 (Dense)            (None, 5)                 35        
                                                                 
Total params: 3,394,317
Trainable params: 3,394,317
Non-trainable params: 0
___________________________________________

In [None]:
# Train the model
# NOTE(review): the test split is passed as validation data, so the early-stopping
# callback picks the epoch using test data; the test metrics reported afterwards are
# therefore optimistic. Consider validating on a hold-out slice of the training data.
num_epochs = 5
model.fit(
    padded_sequence_train,
    y_train_onehot,
    epochs=num_epochs,
    validation_data=(padded_sequence_test, y_test_onehot),
    callbacks=[es],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f05f78c97d0>

In [None]:
# Prediction: the predicted class id is the position of the largest score in each row.
y_pred_test = model.predict(padded_sequence_test).argmax(axis=1)

y_pred_train = model.predict(padded_sequence_train).argmax(axis=1)

In [None]:
# Accuracy in percent = correctly classified samples (matrix diagonal) / all samples.
# Printed labels are Serbian: "accuracy on training data", "accuracy on test data",
# "confusion matrix on test data".
m_train = confusion_matrix(y_train, y_pred_train)
print('Tacnost na trening podacima: ')
print(np.trace(m_train) / m_train.sum() * 100)

m = confusion_matrix(y_test, y_pred_test)
print('Tacnost na test podacima: ')
print(np.trace(m) / m.sum() * 100)
print('Konfuziona matrica na test podacima: ')
m

Tacnost na trening podacima: 
80.09922420292322
Tacnost na test podacima: 
68.95915678524375
Konfuziona matrica na test podacima: 


array([[410, 160,   1,  27,   1],
       [ 86, 689,  41, 114,  17],
       [  4,  81, 443,  78,  10],
       [ 10, 192,  55, 646, 138],
       [  0,  17,   6, 140, 429]])

In [None]:
# Per-class precision, recall and F1 on the test split; rows are the integer class ids.
print(classification_report(y_test,y_pred_test))

              precision    recall  f1-score   support

           0       0.80      0.69      0.74       599
           1       0.64      0.72      0.68       947
           2       0.78      0.73      0.76       616
           3       0.65      0.69      0.67      1041
           4       0.77      0.69      0.73       592

    accuracy                           0.70      3795
   macro avg       0.73      0.71      0.71      3795
weighted avg       0.71      0.70      0.71      3795



In [None]:
# Keep the test rows whose predicted class differs from the true class.
prediction_is_wrong = y_test != y_pred_test
missclassified = test.loc[prediction_is_wrong]

In [None]:
# Persist the wrongly classified test rows for manual inspection (row index is not written).
missclassified.to_csv('/content/drive/MyDrive/missclassified.csv',index = False)

In [None]:
#indeks = 20

#print(missclassified.iloc[indeks]['OriginalTweet'])
#print("")

#print(f"Sentiment {missclassified.iloc[indeks]['Sentiment']}")
#print(f"Prediction {y_pred[missclassified.iloc[indeks]['Unnamed: 0']]}")

In [None]:
#embedding_layer = model.layers[0]
#embedding_weights = embedding_layer.get_weights()[0]
#reverse_word_index = tokenizer.index_word

In [None]:
#out_v = io.open('/content/drive/MyDrive/vecs.tsv', 'w', encoding='utf-8')
#out_m = io.open('/content/drive/MyDrive/meta.tsv', 'w', encoding='utf-8')

#for word_num in range(1, vocab_size):
#  word_name = reverse_word_index[word_num]
#  word_embedding = embedding_weights[word_num]
#  out_m.write(word_name + "\n")
#  out_v.write('\t'.join([str(x) for x in word_embedding]) + "\n")

#out_v.close()
#out_m.close()