In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hugotk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def preprocess_text(text):
    # Remover caracteres indesejados
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # remover menções de usuários
    text = re.sub(r'https?:\/\/\S+', '', text)  # remover URLs
    text = re.sub(r'[^\w\s]', '', text)  # remover pontuação
    text = re.sub(r'\d+', '', text)  # remover números
    # Normalizar texto
    text = text.lower()
    # Remover stop words
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

In [4]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
def stemming(data):
    text = [stemmer.stem(word) for word in data]
    return data


In [5]:
tweets_df = pd.read_excel('base_de_dados_dogecoin.xlsx')

In [6]:
tweets_df['Tweet'] = tweets_df['Tweet'].apply(preprocess_text)
tweets_df

Unnamed: 0,Date,User,Tweet
0,2023-04-04 13:56:06,TickerTop,bitcoin price h ethereum price h tether price ...
1,2023-04-04 13:56:06,drippymickeynft,sudden everyone loves doge coin elon ur genius
2,2023-04-04 13:56:05,giynanta,blue bird twitter became dogecoin
3,2023-04-04 13:56:05,shnbrxxy,true ufo drone link bio dogecoin ufos ufotwitt...
4,2023-04-04 13:56:02,vijuuu1317,need explain something try getting real instea...
...,...,...,...
495,2023-04-04 13:13:39,NFTandDOGE,ive steak eggs past week cured seasonal depres...
496,2023-04-04 13:13:39,37Raider,twitter web logo changes dogecoin cryptocurren...
497,2023-04-04 13:13:25,hiwakurdstani,babybitcoin bought dogecoin bitcoin
498,2023-04-04 13:13:19,LekhakAnurag,dogecoin tips


In [7]:
tweets_df['Tweet'] = tweets_df['Tweet'].apply(lambda w: stemming(w))
tweets_df

Unnamed: 0,Date,User,Tweet
0,2023-04-04 13:56:06,TickerTop,bitcoin price h ethereum price h tether price ...
1,2023-04-04 13:56:06,drippymickeynft,sudden everyone loves doge coin elon ur genius
2,2023-04-04 13:56:05,giynanta,blue bird twitter became dogecoin
3,2023-04-04 13:56:05,shnbrxxy,true ufo drone link bio dogecoin ufos ufotwitt...
4,2023-04-04 13:56:02,vijuuu1317,need explain something try getting real instea...
...,...,...,...
495,2023-04-04 13:13:39,NFTandDOGE,ive steak eggs past week cured seasonal depres...
496,2023-04-04 13:13:39,37Raider,twitter web logo changes dogecoin cryptocurren...
497,2023-04-04 13:13:25,hiwakurdstani,babybitcoin bought dogecoin bitcoin
498,2023-04-04 13:13:19,LekhakAnurag,dogecoin tips


In [8]:
sia = SentimentIntensityAnalyzer()
polarities = []
for tweet in tweets_df.Tweet:
   polarity = sia.polarity_scores(tweet)['compound']
   polarities.append(polarity)
polarities

[0.0,
 0.5719,
 0.0,
 0.4215,
 0.6124,
 0.296,
 0.0,
 0.25,
 -0.5719,
 0.4404,
 0.5994,
 0.6486,
 0.34,
 0.0,
 0.4019,
 0.4019,
 0.0,
 0.0,
 -0.1779,
 0.0258,
 0.0,
 0.0,
 0.0,
 0.4404,
 -0.34,
 0.2449,
 0.9217,
 0.0,
 -0.8126,
 -0.4939,
 0.0,
 0.0,
 0.25,
 0.34,
 -0.5267,
 0.5859,
 0.0,
 0.0,
 0.0,
 0.0,
 0.296,
 0.4449,
 0.0,
 -0.5574,
 0.0,
 0.0,
 0.0,
 0.296,
 0.0,
 -0.7184,
 -0.2732,
 -0.0516,
 0.0,
 0.0,
 0.0,
 0.6369,
 -0.0258,
 0.0,
 0.0,
 0.0,
 0.296,
 0.296,
 0.8807,
 0.6486,
 -0.8885,
 0.0,
 -0.7184,
 0.6249,
 0.0,
 0.4404,
 0.5859,
 0.2023,
 0.3612,
 0.7717,
 0.0772,
 0.0,
 0.34,
 0.0,
 0.0,
 0.5574,
 0.4019,
 -0.8885,
 0.0,
 0.5277,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.5574,
 0.5574,
 0.7845,
 0.34,
 0.296,
 0.0,
 0.0,
 0.0,
 0.5267,
 0.6249,
 -0.3182,
 0.7245,
 0.128,
 0.0,
 0.0,
 0.6369,
 0.9081,
 0.0,
 0.296,
 0.765,
 -0.0772,
 0.0,
 -0.6808,
 0.3818,
 0.296,
 0.128,
 0.3818,
 -0.5719,
 0.1779,
 -0.25,
 0.2023,
 0.0,
 0.0,
 0.9081,
 0.0258,
 0.836,
 0.0,
 0.7351,
 0.2732,
 

In [None]:
# from textblob import TextBlob

# def polarity_textblob(text):
#    return TextBlob(text).sentiment.polarity

In [None]:
# textblob_polarities = []

# for tweet in tweets_df['Tweet']:
#     pola = TextBlob(tweet).sentiment.polarity
#     textblob_polarities.append(pola)

# textblob_polarities

In [9]:
tweets_df['Polarity'] = polarities

tweets_df

Unnamed: 0,Date,User,Tweet,Polarity
0,2023-04-04 13:56:06,TickerTop,bitcoin price h ethereum price h tether price ...,0.0000
1,2023-04-04 13:56:06,drippymickeynft,sudden everyone loves doge coin elon ur genius,0.5719
2,2023-04-04 13:56:05,giynanta,blue bird twitter became dogecoin,0.0000
3,2023-04-04 13:56:05,shnbrxxy,true ufo drone link bio dogecoin ufos ufotwitt...,0.4215
4,2023-04-04 13:56:02,vijuuu1317,need explain something try getting real instea...,0.6124
...,...,...,...,...
495,2023-04-04 13:13:39,NFTandDOGE,ive steak eggs past week cured seasonal depres...,-0.2263
496,2023-04-04 13:13:39,37Raider,twitter web logo changes dogecoin cryptocurren...,0.0000
497,2023-04-04 13:13:25,hiwakurdstani,babybitcoin bought dogecoin bitcoin,0.0000
498,2023-04-04 13:13:19,LekhakAnurag,dogecoin tips,0.0000


In [10]:
def classify_sentiment(polarity):
    if polarity > 0.2:
        return 'positive'
    elif polarity < -0.2:
        return 'negative'
    else:
        return 'neutral'
    

In [11]:
tweets_df['Sentiment'] = tweets_df['Polarity'].apply(classify_sentiment)
tweets_df

Unnamed: 0,Date,User,Tweet,Polarity,Sentiment
0,2023-04-04 13:56:06,TickerTop,bitcoin price h ethereum price h tether price ...,0.0000,neutral
1,2023-04-04 13:56:06,drippymickeynft,sudden everyone loves doge coin elon ur genius,0.5719,positive
2,2023-04-04 13:56:05,giynanta,blue bird twitter became dogecoin,0.0000,neutral
3,2023-04-04 13:56:05,shnbrxxy,true ufo drone link bio dogecoin ufos ufotwitt...,0.4215,positive
4,2023-04-04 13:56:02,vijuuu1317,need explain something try getting real instea...,0.6124,positive
...,...,...,...,...,...
495,2023-04-04 13:13:39,NFTandDOGE,ive steak eggs past week cured seasonal depres...,-0.2263,negative
496,2023-04-04 13:13:39,37Raider,twitter web logo changes dogecoin cryptocurren...,0.0000,neutral
497,2023-04-04 13:13:25,hiwakurdstani,babybitcoin bought dogecoin bitcoin,0.0000,neutral
498,2023-04-04 13:13:19,LekhakAnurag,dogecoin tips,0.0000,neutral


In [12]:
vectorizer = CountVectorizer()

In [13]:
X = vectorizer.fit_transform(tweets_df['Tweet']).toarray()
print(vectorizer.vocabulary_)
print(X)

{'bitcoin': 223, 'price': 1518, 'ethereum': 669, 'tether': 1972, 'bnb': 233, 'usd': 2118, 'coin': 369, 'dogecoin': 545, 'sudden': 1912, 'everyone': 676, 'loves': 1164, 'doge': 542, 'elon': 623, 'ur': 2114, 'genius': 805, 'blue': 230, 'bird': 218, 'twitter': 2079, 'became': 189, 'true': 2052, 'ufo': 2089, 'drone': 584, 'link': 1117, 'bio': 216, 'ufos': 2090, 'ufotwitter': 2091, 'dogecointothemoon': 551, 'tuesdayvibe': 2068, 'need': 1323, 'explain': 690, 'something': 1824, 'try': 2061, 'getting': 809, 'real': 1607, 'instead': 989, 'describing': 507, 'looks': 1151, 'like': 1111, 'draw': 576, 'explaining': 691, 'sounds': 1835, 'hum': 942, 'everything': 677, 'remove': 1632, 'layers': 1086, 'abstraction': 13, 'mlb': 1265, 'basketball': 181, 'seattle': 1721, 'mariners': 1192, 'beat': 187, 'los': 1154, 'angeles': 88, 'angels': 89, 'join': 1029, 'afribet': 44, 'sports': 1861, 'eth': 668, 'btc': 269, 'crypto': 439, 'nft': 1337, 'gamblingtwitter': 794, 'bbtitans': 184, 'agustd': 52, 'musk': 1297,

In [None]:
vectorizer.vocabulary_

In [None]:
# df_x = pd.DataFrame(X).T
# df_x.to_excel('df_x.xlsx')

In [None]:
# vectorizer_1 = TfidfVectorizer()

In [None]:
# X_1 = vectorizer_1.fit_transform(tweets_df['Tweet']).toarray()
# print(vectorizer_1.vocabulary_)
# print(X_1)

In [None]:
# df_x_1 = pd.DataFrame(X_1).T
# df_x_1.to_excel('df_x_1.xlsx')

In [14]:
# y = np.array(tweets_df['Sentiment'].apply(lambda x: 2 if x =='positive' else(0 if x == 'neutral' else 1)))
y = np.array(tweets_df['Sentiment'].apply(lambda x: 1 if x == 'positive' else 0))
# y = np.array(tweets_df['Polarity'])
y

array([0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1,

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM

model = Sequential()
model.add(Embedding(input_dim=8000, output_dim=32, input_length=X.shape[1]))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

In [17]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [18]:
from keras.callbacks import EarlyStopping

es = EarlyStopping(monitor='val_loss', mode='min', verbose=2, patience=4)
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=128, callbacks=[es])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 10: early stopping


<keras.callbacks.History at 0x1dff48b19d0>

In [None]:
loss, accuracy = model.evaluate(X_test, y_test)
print('Test accuracy: {}'.format(accuracy))