In [None]:
%pip install transformers

In [None]:
# imports
import matplotlib.pyplot as plt
import pandas as pd
import torch
from scipy.special import softmax
from transformers import AutoTokenizer, AutoConfig
from transformers import AutoModelForSequenceClassification
from wordcloud import WordCloud, STOPWORDS

In [None]:
# load tweets
PATH = 'elon_musk_tweets.csv'
n_tweets = 50
SEED = 42  # fixed seed so a fresh kernel run draws the same sample of tweets

# Read only the `text` column, drop missing entries (the preprocessing step
# calls str methods on every value), shuffle reproducibly and keep at most
# n_tweets rows.
tweets_df = (
    pd.read_csv(PATH, usecols=['text'])['text']
    .dropna()
    .sample(frac=1, random_state=SEED)
    .head(n_tweets)
    .to_frame()
)

In [None]:
def preprocess(text):
    """Mask user mentions and links in a tweet.

    The text is split on single spaces. Every token that starts with ``@``
    and is longer than one character becomes ``@user``; every token that
    starts with ``http`` becomes ``http``. All other tokens are kept as-is
    and the tokens are re-joined with single spaces.

    Args:
        text: The raw tweet as a string.

    Returns:
        The tweet with mentions and links replaced by placeholders.
    """
    def _mask(token):
        # A lone "@" is not treated as a mention.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

In [None]:
# roberta model setup
# The config, tokenizer and classification model are all loaded from the
# same checkpoint name.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
config = AutoConfig.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
# preprocess and encode tweets
# Upper bound on tokens per tweet. Assumed to match the checkpoint's position
# limit (512 for roberta-base style models) -- TODO confirm against
# config.max_position_embeddings. Without truncation an unusually long
# tokenization would make the forward pass fail.
MAX_TOKENS = 512

# Mask mentions/links first, then tokenize each cleaned tweet into PyTorch
# tensors (one single-tweet batch per row).
preprocessed_tweets = [preprocess(tweet) for tweet in tweets_df['text']]
encoded_tweets = [
    tokenizer(clean, return_tensors='pt', truncation=True, max_length=MAX_TOKENS)
    for clean in preprocessed_tweets
]

tweets_df['preprocessed'] = preprocessed_tweets
tweets_df['encoded'] = encoded_tweets
# Replace the shuffled index with 0..n-1 so later positional joins line up.
tweets_df = tweets_df.reset_index(drop=True)

In [None]:
# Run the classifier on every encoded tweet and collect one array of
# softmax probabilities per tweet.
tweets_analysis = []

# Inference only: disabling autograd avoids building a gradient graph for
# each forward pass.
with torch.no_grad():
    for item in tweets_df['encoded']:
        # Each encoding holds a single tweet, so take row 0 of the logits.
        logits = model(**item).logits[0]
        tweets_analysis.append(softmax(logits.cpu().numpy()))

In [None]:
# Attach one probability column per sentiment label to the tweets frame.
# Column names come from the model config, ordered by class id.
label_names = [config.id2label[i] for i in range(len(config.id2label))]
# Reuse tweets_df's index so the column-wise concat aligns row for row.
scores_df = pd.DataFrame(tweets_analysis, columns=label_names, index=tweets_df.index)
# Drop score columns left by a previous execution so re-running this cell
# does not create duplicate label columns.
tweets_df = pd.concat(
    [tweets_df.drop(columns=label_names, errors='ignore'), scores_df],
    axis=1,
)


In [None]:
# Label each tweet with the score column holding its highest probability.
score_columns = ['negative', 'positive', 'neutral']
tweets_df['sentiment'] = tweets_df.loc[:, score_columns].idxmax(axis='columns')


In [None]:
# Show the whole frame: raw text, preprocessed text, encodings, per-label
# scores and the predicted sentiment.
tweets_df

In [None]:
# Widen pandas' display limits so long tweet text is not cut off
pd.set_option('max_colwidth', None)
pd.set_option('display.width', 3000)

# Show a tweet for each sentiment
for label in ('positive', 'neutral', 'negative'):
    display(tweets_df[tweets_df["sentiment"] == label].head(1))

In [None]:
# Number of tweets assigned to each predicted sentiment
sentiment_counts = tweets_df.groupby(by=['sentiment']).size()
print(sentiment_counts)

In [None]:
# Pie chart of the sentiment distribution
fig, ax = plt.subplots(figsize=(6, 6), dpi=100)
sentiment_counts.plot.pie(ax=ax, label="", autopct='%1.1f%%', startangle=270, fontsize=12)

In [None]:
# Wordcloud with positive tweets
positive_tweets = tweets_df['text'][tweets_df["sentiment"] == 'positive']
stop_words = ["https", "co", "RT"] + list(STOPWORDS)

if positive_tweets.empty:
    # Nothing to draw; avoid handing WordCloud an empty text.
    print("No positive tweets to plot.")
else:
    # Join the actual tweet strings. str(Series) would instead feed the cloud
    # pandas' printed representation (index labels, the "Name: ..., dtype: ..."
    # footer and possibly truncated rows).
    positive_text = " ".join(positive_tweets.dropna().astype(str))
    positive_wordcloud = WordCloud(
        max_font_size=50,
        max_words=50,
        background_color="white",
        stopwords=stop_words,
    ).generate(positive_text)
    plt.figure()
    plt.title("Positive Tweets - Wordcloud")
    plt.imshow(positive_wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()


In [None]:
# Wordcloud with negative tweets
negative_tweets = tweets_df['text'][tweets_df["sentiment"] == 'negative']
stop_words = ["https", "co", "RT"] + list(STOPWORDS)

if negative_tweets.empty:
    # Nothing to draw; avoid handing WordCloud an empty text.
    print("No negative tweets to plot.")
else:
    # Join the actual tweet strings. str(Series) would instead feed the cloud
    # pandas' printed representation (index labels, the "Name: ..., dtype: ..."
    # footer and possibly truncated rows).
    negative_text = " ".join(negative_tweets.dropna().astype(str))
    negative_wordcloud = WordCloud(
        max_font_size=50,
        max_words=50,
        background_color="white",
        stopwords=stop_words,
    ).generate(negative_text)
    plt.figure()
    plt.title("Negative Tweets - Wordcloud")
    plt.imshow(negative_wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()