In [None]:
# imports
import functools
import json
import os
import re
import time

import matplotlib.pyplot as plt
import pandas as pd
import requests
import tweepy
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from wordcloud import STOPWORDS
from wordcloud import WordCloud



In [None]:
# twitter api auth
# The bearer token is read from the TWITTER_BEARER_TOKEN environment variable
# rather than being written into the notebook, so the credential is never
# saved alongside the code or its outputs.
bearer_token = os.environ.get("TWITTER_BEARER_TOKEN")
if not bearer_token:
    # Fail here with an actionable message instead of a later, less obvious
    # authentication error from the first API call.
    raise RuntimeError(
        "Set the TWITTER_BEARER_TOKEN environment variable before running this notebook."
    )

client = tweepy.Client(bearer_token)

In [None]:
# Search expression: tweets matching "elon musk", English only, no retweets.
query = '(elon musk) lang:en -is:retweet'

# Single search request against the recent-tweets endpoint, asking for up to
# 100 results.
response = client.search_recent_tweets(
    query=query,
    max_results=100,
)

In [None]:
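# Optional: uncomment to save the raw API response to disk so it can be
# inspected or reused later without querying the API again.
#import pickle
#with open('elon-musk-tweets', 'wb') as file:
#    pickle.dump(response, file)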

In [None]:
# Collect the text of every returned tweet as one {'text': ...} record each.
# response.data is None (not an empty list) when the search matched nothing,
# so fall back to an empty list instead of failing while iterating.
output = [{'text': tweet.text} for tweet in (response.data or [])]

In [None]:
# One row per tweet. Naming the column explicitly guarantees that 'text'
# exists even when `output` is empty.
tweets_df = pd.DataFrame(output, columns=['text'])

In [None]:
# Shared tokenizer instance; process_text() uses it to split cleaned text into tokens.
tokenizer = WordPunctTokenizer()
# Regular expressions used by process_text() to strip noise from tweet text.
# URL patterns stop at any whitespace (\S+) so that a link followed by a
# newline or tab does not swallow the text after it.
twitter_handle = r'@[A-Za-z0-9_]+'                         # remove twitter handle (@username)
url_handle = r'https?://\S+'                               # remove website URLs that start with 'http://' or 'https://'
combined_handle = r'|'.join((twitter_handle, url_handle))  # join
www_handle = r'\bwww\.\S+'                                 # remove website URLs that start with 'www.' (literal dot, whole-word start)
punctuation_handle = r'\W+'                                # runs of non-word characters (replaced by a single space)

In [None]:
@functools.lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def process_text(text):
    """Normalize a raw tweet into a space-separated string of lowercase tokens.

    Steps, in order:
      1. Parse the text with BeautifulSoup and keep only its text content.
      2. Remove @handles and URLs (``combined_handle``), then ``www.`` links
         (``www_handle``).
      3. Lowercase and replace runs of non-word characters with a space
         (``punctuation_handle``).
      4. Drop English stop words.
      5. Tokenize with the shared ``tokenizer`` and drop one-character tokens.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text as a single string (possibly empty).
    """
    plain = BeautifulSoup(text, "html.parser").get_text()

    # get_text() already returns str, so no byte decoding is needed here.
    without_handles = re.sub(combined_handle, '', plain)
    without_links = re.sub(www_handle, '', without_handles)
    normalized = re.sub(punctuation_handle, " ", without_links.lower())

    stop_words = _english_stop_words()
    kept = ' '.join(word for word in normalized.split() if word not in stop_words)

    return " ".join(token for token in tokenizer.tokenize(kept) if len(token) > 1).strip()

In [None]:
# Clean the text of every fetched tweet. response.data is None when the search
# returned nothing, hence the fallback to an empty list.
cleaned_tweets = [process_text(tweet.text) for tweet in (response.data or [])]

clean_text = pd.DataFrame({'clean_text' : cleaned_tweets})
# Drop any 'clean_text' column left by an earlier run of this cell first, so
# re-running the cell does not produce duplicate columns.
tweets_df = pd.concat(
    [tweets_df.drop(columns=['clean_text'], errors='ignore'), clean_text],
    axis = 1,
)

#tweets_df.sample(5)

In [None]:
# Identifier of the hosted model used for sentiment classification.
model = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# API token, read from the HF_TOKEN environment variable so it is never stored
# in the notebook; None when the variable is not set.
hf_token = os.environ.get("HF_TOKEN")

In [None]:
# Endpoint for the selected model.
API_URL = "https://api-inference.huggingface.co/models/" + model
# Send an Authorization header only when a token is configured; formatting a
# missing token would otherwise send the literal string "Bearer None".
headers = {"Authorization": "Bearer %s" % (hf_token)} if hf_token else {}

def analysis(data, timeout=120):
    """POST ``data`` to ``API_URL`` and return the decoded JSON reply.

    Args:
        data: Value sent as the ``inputs`` field of the request payload.
        timeout: Seconds ``requests`` waits on the connection before raising a
            timeout error, so a stalled request cannot hang the notebook
            indefinitely.

    Returns:
        The response body parsed as JSON. The HTTP status is not checked, so
        an error payload from the service is returned as-is.
    """
    # wait_for_model asks the service to wait for the model instead of
    # answering immediately while it is still loading.
    payload = {"inputs": data, "options": {"wait_for_model": True}}
    reply = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    return reply.json()

In [None]:
#this will take some time to run, usually about 60 seconds

# Exactly one record is appended per tweet, even when a request fails, so the
# list stays row-aligned with tweets_df for the column-wise concat that follows.
tweets_analysis = []
for tweet in tweets_df.clean_text:
    sentiment_label = None  # stays None when the analysis of this tweet fails
    try:
        sentiment_result = analysis(tweet)[0]
        top_sentiment = max(sentiment_result, key=lambda x: x['score']) # Get the sentiment with the higher score
        sentiment_label = top_sentiment['label']
    except Exception as e:
        # Best effort: report the failure with context and move on.
        print(f"Sentiment analysis failed for {tweet!r}: {e!r}")

    tweets_analysis.append({'sentiment': sentiment_label})


In [None]:
# Attach the sentiment labels as a new column.
# - Any 'sentiment' column left by an earlier run of this cell is dropped first,
#   so re-running the cell does not create duplicate columns.
# - Naming the column explicitly keeps 'sentiment' present even when
#   tweets_analysis is empty.
tweets_df = pd.concat(
    [
        tweets_df.drop(columns=['sentiment'], errors='ignore'),
        pd.DataFrame(tweets_analysis, columns=['sentiment']),
    ],
    axis = 1,
)

In [None]:
# Widen pandas' display settings so the tweet text is shown without truncation
pd.set_option('max_colwidth', None)
pd.set_option('display.width', 3000)

# Show the first tweet found for each sentiment label
for label in ('positive', 'neutral', 'negative'):
    matching_rows = tweets_df[tweets_df["sentiment"] == label]
    display(matching_rows.head(1))

In [None]:
# Number of tweets per sentiment label
grouped_by_sentiment = tweets_df.groupby(['sentiment'])
sentiment_counts = grouped_by_sentiment.size()
print(sentiment_counts)

In [None]:
# Pie chart of the share of tweets per sentiment label
fig, ax = plt.subplots(figsize=(6, 6), dpi=100)
sentiment_counts.plot.pie(
    ax=ax,
    autopct='%1.1f%%',
    startangle=270,
    fontsize=12,
    label="",
)

In [None]:
def _plot_sentiment_wordcloud(tweets, title, stop_words):
    """Draw a word cloud for a Series of cleaned tweet texts.

    Args:
        tweets: pandas Series of cleaned tweet strings.
        title: Title shown above the figure.
        stop_words: Words excluded from the cloud.

    Returns:
        The generated WordCloud, or None when there were no words to plot.
    """
    # Join the actual tweet texts. str(Series) would instead feed pandas'
    # printed representation (index numbers, "Name: ...", "dtype: ...", and
    # elided rows) into the cloud.
    corpus = " ".join(tweets.dropna().astype(str))
    cloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords = stop_words)
    try:
        cloud.generate(corpus)
    except ValueError as error:
        # WordCloud raises ValueError when no words remain to draw.
        print(f"{title}: nothing to plot ({error})")
        return None

    plt.figure()
    plt.title(title)
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
    return cloud


# Words excluded from both clouds
stop_words = ["https", "co", "RT"] + list(STOPWORDS)

# Wordcloud with positive tweets
positive_tweets = tweets_df['clean_text'][tweets_df["sentiment"] == 'positive']
positive_wordcloud = _plot_sentiment_wordcloud(positive_tweets, "Positive Tweets - Wordcloud", stop_words)

# Wordcloud with negative tweets
negative_tweets = tweets_df['clean_text'][tweets_df["sentiment"] == 'negative']
negative_wordcloud = _plot_sentiment_wordcloud(negative_tweets, "Negative Tweets - Wordcloud", stop_words)