In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score

from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
import keras.utils

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\estag\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
Tweets = pd.read_csv("Tweets2.csv")
Tweets.shape

(74682, 4)

In [4]:
Tweets.head()

Unnamed: 0,id,local,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [5]:
Tweets.groupby(['sentiment']).size()

sentiment
Irrelevant    12990
Negative      22542
Neutral       18318
Positive      20832
dtype: int64

In [6]:
Tweets.loc[Tweets['sentiment']=='Irrelevant','sentiment'] = 'Neutral'

In [7]:
Tweets.groupby(['sentiment']).size()

sentiment
Negative    22542
Neutral     31308
Positive    20832
dtype: int64

In [8]:
Tweets = Tweets.dropna(subset=['text'])
Tweets.reset_index(drop=True, inplace=True)

In [9]:
Tweets.shape

(73996, 4)

## **Supervisionado**

In [10]:
token = Tokenizer(num_words=100)
token.fit_on_texts(Tweets['text'].values)

In [11]:
X = token.texts_to_sequences(Tweets['text'].values)
X = pad_sequences(X, padding='post', maxlen=100)

In [13]:
labelEnc = LabelEncoder()
y = labelEnc.fit_transform(Tweets['sentiment'])
print(y)

[2 2 2 ... 2 2 2]


In [14]:
y = keras.utils.to_categorical(y)
print(y)

[[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 ...
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]]


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
X_test

array([[ 0,  0,  0, ...,  0,  0,  0],
       [ 3, 58,  0, ...,  0,  0,  0],
       [ 4,  5,  7, ...,  0,  0,  0],
       ...,
       [19, 66,  0, ...,  0,  0,  0],
       [ 1,  9,  8, ...,  0,  0,  0],
       [ 3, 44,  6, ...,  0,  0,  0]])

In [17]:
model = Sequential()

model.add(Embedding(input_dim=len(token.word_index), output_dim=128, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(units=196, dropout=0.2, recurrent_dropout=0, activation='tanh', recurrent_activation='sigmoid', unroll=False, use_bias=True))
model.add(Dense(units=3, activation='softmax'))



In [18]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

None


In [20]:
model.fit(X_train, y_train, epochs=10, batch_size=300, verbose=True)

Epoch 1/10
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 298ms/step - accuracy: 0.4148 - loss: 1.0842
Epoch 2/10
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 296ms/step - accuracy: 0.4165 - loss: 1.0831
Epoch 3/10
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 291ms/step - accuracy: 0.4157 - loss: 1.0836
Epoch 4/10
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 289ms/step - accuracy: 0.4155 - loss: 1.0835
Epoch 5/10
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 270ms/step - accuracy: 0.4200 - loss: 1.0820
Epoch 6/10
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 270ms/step - accuracy: 0.4180 - loss: 1.0827
Epoch 7/10
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 270ms/step - accuracy: 0.4111 - loss: 1.0850
Epoch 8/10
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 269ms/step - accuracy: 0.4138 - loss: 1.0843
Epoch 9/10
[1m1

<keras.src.callbacks.history.History at 0x1bbb249f260>

In [21]:
_, accuracy = model.evaluate(X_test, y_test)
print("Accuracy:", accuracy)

[1m925/925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 16ms/step - accuracy: 0.4184 - loss: 1.0830
Accuracy: 0.42251425981521606


## **VADER**

In [22]:
mas = SentimentIntensityAnalyzer()
Tweets['vader_sentiment'] = ''

for y in range(len(Tweets.index)):
	x = mas.polarity_scores(Tweets['text'].iloc[y])
	del x['compound']
	maior = max(x, key=x.get)
	Tweets.loc[y, 'vader_sentiment'] = maior

In [24]:
Tweets.groupby(['vader_sentiment']).size()

vader_sentiment
neg     3660
neu    65581
pos     4755
dtype: int64

In [25]:
Tweets.groupby(['sentiment']).size()

sentiment
Negative    22358
Neutral     30983
Positive    20655
dtype: int64

In [26]:
Tweets.loc[Tweets['vader_sentiment']=='neu', 'vader_sentiment'] = 'Neutral'
Tweets.loc[Tweets['vader_sentiment']=='neg', 'vader_sentiment'] = 'Negative'
Tweets.loc[Tweets['vader_sentiment']=='pos', 'vader_sentiment'] = 'Positive'

In [27]:
Tweets.groupby(['vader_sentiment']).size()


vader_sentiment
Negative     3660
Neutral     65581
Positive     4755
dtype: int64

In [28]:
y_pred = Tweets['vader_sentiment']
y_test = Tweets['sentiment']

In [29]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[ 2004 19902   452]
 [ 1122 28384  1477]
 [  534 17295  2826]]


In [30]:
accuracyV = accuracy_score(y_test, y_pred)
print(accuracyV)

0.44886210065408944
