In [1]:
import re
import time
from datetime import datetime

import numpy as np
import pandas as pd
import snscrape.modules.twitter as sntwitter
from scipy.special import softmax
from transformers import (AutoConfig, AutoModelForSequenceClassification,
                          AutoTokenizer)

# 1. Data Scraping (Do Not Run)

In [None]:
def getTweets(account, since, until):
    tweets = []
    # Call scraper to collect data
    since = since.strftime("%Y-%m-%d")
    until = until.strftime("%Y-%m-%d")

    for tweet in sntwitter.TwitterSearchScraper(f"from:{account} since:{since} until:{until} exclude:replies").get_items():
        data = {
            "url": tweet.url,
            "date": tweet.date,
            "rawContent": tweet.rawContent,
            "tweetId": tweet.id,
            "username": tweet.user.username,
            "userId": tweet.user.id,
            "userDisplayName": tweet.user.displayname,
            "userVerified": tweet.user.verified,
            "replyCount": tweet.replyCount,
            "retweetCount": tweet.retweetCount, 
            "likeCount": tweet.likeCount, 
            "quoteCount": tweet.quoteCount,
            "conversationId": tweet.conversationId,
            "lang": tweet.lang,
            "sourceUrl": tweet.sourceUrl,
            "sourceLabel": tweet.sourceLabel,
            "retweetedTweet": tweet.retweetedTweet,
            "quotedTweet": tweet.quotedTweet,
            "inReplyToTweetId": tweet.inReplyToTweetId,
            "inReplyToUser": tweet.inReplyToUser,
            "coordinates": tweet.coordinates,
            "place": tweet.place,
            "hashtags": tweet.hashtags,
            "cashtags": tweet.cashtags,
            "card": tweet.card,
            "viewCount": tweet.viewCount,
            "vibe": tweet.vibe,
            "bookmarkCount": tweet.bookmarkCount,
            "pinned": tweet.pinned,
            "conversationControlPolicy": tweet.conversationControlPolicy.value
        }

        tweets.append(data)

    # Convert to DataFrame
    df = pd.DataFrame(tweets)

    # Save the DataFrame
    filePath = f"./Datasets/Tweets/{account}_tweets_from_{since}_to_{until}.csv"
    df.to_csv(filePath, index=False)
    print(f"Saved to {filePath}")

    # Display the DataFrame
    display(df.head(3))

In [None]:
def getMentioned(account, since, until):
    tweets = []
    since = since.strftime("%Y-%m-%d")
    until = until.strftime("%Y-%m-%d")

    # Call scraper to collect data
    for tweet in sntwitter.TwitterSearchScraper(f"to:{account} since:{since} until:{until}").get_items():
        data = {
            "url": tweet.url,
            "date": tweet.date,
            "rawContent": tweet.rawContent,
            "tweetId": tweet.id,
            "username": tweet.user.username,
            "userId": tweet.user.id,
            "userDisplayName": tweet.user.displayname,
            "userVerified": tweet.user.verified,
            "replyCount": tweet.replyCount,
            "retweetCount": tweet.retweetCount, 
            "likeCount": tweet.likeCount, 
            "quoteCount": tweet.quoteCount,
            "conversationId": tweet.conversationId,
            "lang": tweet.lang,
            "sourceUrl": tweet.sourceUrl,
            "sourceLabel": tweet.sourceLabel,
            "retweetedTweet": tweet.retweetedTweet,
            "quotedTweet": tweet.quotedTweet,
            "inReplyToTweetId": tweet.inReplyToTweetId,
            "inReplyToUser": tweet.inReplyToUser,
            "coordinates": tweet.coordinates,
            "place": tweet.place,
            "hashtags": tweet.hashtags,
            "cashtags": tweet.cashtags,
            "card": tweet.card,
            "viewCount": tweet.viewCount,
            "vibe": tweet.vibe,
            "bookmarkCount": tweet.bookmarkCount, 
            "pinned": tweet.pinned,
            "conversationControlPolicy": tweet.conversationControlPolicy.value
        }

        tweets.append(data)

    # Convert to DataFrame
    df = pd.DataFrame(tweets)

    # Save the DataFrame
    filePath = f"./Datasets/Mentioned/{account}_mentioned_from_{since}_to_{until}.csv"
    df.to_csv(filePath, index=False)
    print(f"Saved to {filePath}")

    # Display the DataFrame
    display(df.head(3))

In [None]:
def getTweetsAndReplies(account, since, until):
    tweets = []
    # Call scraper to collect data
    since = since.strftime("%Y-%m-%d")
    until = until.strftime("%Y-%m-%d")

    for tweet in sntwitter.TwitterSearchScraper(f"from:{account} since:{since} until:{until}").get_items():
        data = {
            "url": tweet.url,
            "date": tweet.date,
            "rawContent": tweet.rawContent,
            "tweetId": tweet.id,
            "username": tweet.user.username,
            "userId": tweet.user.id,
            "userDisplayName": tweet.user.displayname,
            "userVerified": tweet.user.verified,
            "replyCount": tweet.replyCount,
            "retweetCount": tweet.retweetCount, 
            "likeCount": tweet.likeCount, 
            "quoteCount": tweet.quoteCount,
            "conversationId": tweet.conversationId,
            "lang": tweet.lang,
            "sourceUrl": tweet.sourceUrl,
            "sourceLabel": tweet.sourceLabel,
            "retweetedTweet": tweet.retweetedTweet,
            "quotedTweet": tweet.quotedTweet,
            "inReplyToTweetId": tweet.inReplyToTweetId,
            "inReplyToUser": tweet.inReplyToUser,
            "coordinates": tweet.coordinates,
            "place": tweet.place,
            "hashtags": tweet.hashtags,
            "cashtags": tweet.cashtags,
            "card": tweet.card,
            "viewCount": tweet.viewCount,
            "vibe": tweet.vibe,
            "bookmarkCount": tweet.bookmarkCount,
            "pinned": tweet.pinned,
            "conversationControlPolicy": tweet.conversationControlPolicy.value
        }

        tweets.append(data)

    # Convert to DataFrame
    df = pd.DataFrame(tweets)

    # Save the DataFrame
    filePath = f"./Datasets/SA_Data/{account}_tweets_from_{since}_to_{until}.csv"
    df.to_csv(filePath, index=False)
    print(f"Saved to {filePath}")

    # Display the DataFrame
    display(df.head(3))

### -------------------
### Tweets
### -------------------
- Collected from 1 June 2022 - 31 May 2023 for all 5 brands
- Saved in `./Datasets/Tweets`

##### Sprite

In [None]:
getTweets("Sprite", datetime(2022, 6, 1), datetime(2023, 5, 31))

##### CocaCola

In [None]:
getTweets("CocaCola", datetime(2022, 6, 1), datetime(2023, 5, 31))

##### PrimeHydrate

In [None]:
getTweets("PrimeHydrate", datetime(2022, 6, 1), datetime(2023, 5, 31))

##### fanta

In [None]:
getTweets("Fanta", datetime(2022, 6, 1), datetime(2023, 5, 31))

##### pepsi

In [None]:
getTweets("Pepsi", datetime(2022, 6, 1), datetime(2023, 5, 31))

### -----------
### Mentioned
### -----------
- Collected from 1 May 2023 - 31 May 2023 for all 5 brands
- Saved in `./Datasets/Mentioned`

##### Sprite

In [None]:
getMentioned("Sprite", datetime(2023, 5, 1), datetime(2023, 5, 31))

##### CocaCola

In [None]:
getMentioned("CocaCola", datetime(2023, 5, 1), datetime(2023, 5, 31))

##### PrimeHydrate

In [None]:
getMentioned("PrimeHydrate", datetime(2023, 5, 1), datetime(2023, 5, 31))

##### fanta

In [None]:
getMentioned("Fanta", datetime(2023, 5, 1), datetime(2023, 5, 31))

##### pepsi

In [None]:
getMentioned("Pepsi", datetime(2023, 5, 1), datetime(2023, 5, 31))

### -----------
### Tweets and Replies
### ------------ 

- Collected from 1 Dec 2022 - 31 May 2023 for all 5 brands
- Saved in `./Datasets/SA_Data`

##### Sprite

In [None]:
getTweetsAndReplies("Sprite", datetime(2022, 12, 1), datetime(2023, 5, 31))

##### CocaCola

In [None]:
getTweetsAndReplies("CocaCola", datetime(2022, 12, 1), datetime(2023, 5, 31))

##### PrimeHydrate

In [None]:
getTweetsAndReplies("PrimeHydrate", datetime(2022, 12, 1), datetime(2023, 5, 31))

##### Fanta

In [None]:
getTweetsAndReplies("Fanta", datetime(2022, 12, 1), datetime(2023, 5, 31))

##### Pepsi

In [None]:
getTweetsAndReplies("Pepsi", datetime(2022, 12, 1), datetime(2023, 5, 31))

# 2. Calculate Metrics

In [2]:
def getBrand(filePath):
    pattern = r"(\w+)_[tweets|mentioned]"
    matches = re.search(pattern, filePath)

    if matches:
        return matches.group(1)
    raise ValueError(f"No brand found in the filePath '{filePath}'")

In [3]:
def getSinceUntilDate(filePath):
    # Extract the dates using regular expressions
    pattern = r"from_(\d{4}-\d{2}-\d{2})_to_(\d{4}-\d{2}-\d{2})"
    matches = re.search(pattern, filePath)

    if matches:
        since = datetime.strptime(matches.group(1), "%Y-%m-%d")
        until = datetime.strptime(matches.group(2), "%Y-%m-%d")
        return since, until
    else:
        raise ValueError(f"No dates found in the filePath '{filePath}'")


#### 1. Sentiment Score
- Apply [Twitter-roBERTa-base model for Sentiment Analysis
](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment)
- *Take very long time to run*

In [None]:
# Inference
start_time = time.time()

MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

df_lists = ['cc', 'fanta', 'pepsi', 'ph', 'sprite']

for df_list in df_lists:
    try :
      df = pd.read_csv(f'./Datasets/SA_Data/{df_list}.csv')
      print("Working on: ", df_list) 
      df['text'] = df['rawContent'].str.replace('@user', '')
      df['pred'] = 0

      output = model(**tokenizer(df['text'].tolist(), padding=True, truncation=True, return_tensors="pt"))

      for i in range(df.shape[0]):
            scores = output[0][i].detach().numpy()
            scores = softmax(scores)
            ranking = np.argsort(scores)
            ranking = ranking[::-1]
            print(ranking)

            df['pred'][i] = ranking[0]

      df.to_csv(f'./SA_Pred/{df_list}_pred.csv', index=False)
      print("Time taken: ", time.time() - start_time)

    except Exception as e:
      print('Failed on: ', df_list)
      print(e)


In [4]:
# Analysis
def getAvgPositiveSentimentScore(df):
    df = df[["pred"]]
    average = df[df["pred"] == 2].count() / df.count()

    print(f"Average Positive Score: {int(average * 100)}%")

### 2. Tweet Impression

In [5]:
def getTweetImpression(df):
    totalTweets = df.shape[0]
    print(f"Total Tweets: {totalTweets}")

    totalViews = df["viewCount"].sum()
    print(f"Total Views: {totalViews}")

    avgViewsPerTweet = round(totalViews / totalTweets, 2)
    print(f"Impression: {avgViewsPerTweet}")

    return avgViewsPerTweet

#### 3. Tweeting Frequency

In [6]:
def getNumTweetsPerDay(df, since, until):
    totalDays =  (until - since).days
    print(f"Total Days: {totalDays}")

    totalTweets = df.shape[0]
    print(f"Total Tweets: {totalTweets}")

    numTweetsPerDay = round(totalTweets / totalDays, 2)
    print(f"Number of Tweets per Day: {numTweetsPerDay}")

    return numTweetsPerDay

#### 4. Average Engagement Rate

In [7]:
def getAvgEngagementRatePerTweetByFollowers(df, totalFollowers):
    # Total Likes
    totalLikes = df["likeCount"].sum()
    print(f"Total Likes: {totalLikes}")

    # Total Retweets
    totalRetweets = df["retweetCount"].sum()
    print(f"Total Retweets: {totalRetweets}")

    # Total Tweets
    totalTweets = df.shape[0]
    print(f"Total Tweets: {totalTweets}")

    avgEngagementRate = round(((totalLikes + totalRetweets) / totalTweets) / totalFollowers, 2) * 100

    result = ""
    if avgEngagementRate < 0.5:
        result = "Poor"
    elif avgEngagementRate >= 0.5 and avgEngagementRate <= 1:
        result = "Average"
    else:
        result = "Excellent"
    print(f"Average Engagement Rate: {avgEngagementRate}% ({result})")

    return avgEngagementRate

#### 5. Daily Brand Mention

In [8]:
def getAvgMentionedPerDay(df, since, until):
    totalDays =  (until - since).days
    print(f"Total Days: {totalDays}")

    totalMentioned = df.shape[0]
    print(f"Total Mentioned: {totalMentioned}")

    numMentionedPerDay = round(totalMentioned / totalDays, 2)
    print(f"Number of Mentioned per Day: {numMentionedPerDay}")

    return numMentionedPerDay

#### Function to Run All Metrics

In [9]:
def getMetrics(predPath, tweetPath, mentionedPath, totalFollowers):
    brand = getBrand(tweetPath)
    print()
    print("----------------------------------------")
    print(f"{brand}")
    print("----------------------------------------")

    print(f"Reading from '{predPath}'")
    print(f"Reading from '{tweetPath}'")
    print(f"Reading from '{mentionedPath}'")

    dfPred = pd.read_csv(predPath)
    print("\nMetric 1: Average Positive Sentiment Score")
    getAvgPositiveSentimentScore(dfPred)

    dfTweets = pd.read_csv(tweetPath)
    dfTweets["date"] = pd.to_datetime(dfTweets["date"]).dt.date # convert date to datetime object
    sinceT, untilT  = getSinceUntilDate(tweetPath) # get start and end date
    print("\nMetric 2: Tweet Impression")
    getTweetImpression(dfTweets)
    print("\nMetric 3: Number of Tweets per Day")
    getNumTweetsPerDay(dfTweets, sinceT, untilT)
    print("\nMetric 4: Average Engagement Rate per Tweet by Followers")
    getAvgEngagementRatePerTweetByFollowers(dfTweets, totalFollowers)

    dfMentioned = pd.read_csv(mentionedPath)
    dfMentioned["date"] = pd.to_datetime(dfMentioned["date"]).dt.date # convert date to datetime object
    sinceM, untilM  = getSinceUntilDate(mentionedPath) # get start and end date
    print("\nMetric 5: Average Mentioned per Day")
    getAvgMentionedPerDay(dfMentioned, sinceM, untilM)


#### Cell to Get The Metrics' Result

In [10]:
predPaths = [
   "./Datasets/SA_Pred/cc_pred.csv",
   "./Datasets/SA_Pred/fanta_pred.csv",
   "./Datasets/SA_Pred/pepsi_pred.csv",
   "./Datasets/SA_Pred/ph_pred.csv",
   "./Datasets/SA_Pred/sprite_pred.csv",
]

tweetPaths = [
    "./Datasets/Tweets/CocaCola_tweets_from_2022-06-01_to_2023-05-31.csv",
    "./Datasets/Tweets/Fanta_tweets_from_2022-06-01_to_2023-05-31.csv",
    "./Datasets/Tweets/Pepsi_tweets_from_2022-06-01_to_2023-05-31.csv",
    "./Datasets/Tweets/PrimeHydrate_tweets_from_2022-06-01_to_2023-05-31.csv",
    "./Datasets/Tweets/Sprite_tweets_from_2022-06-01_to_2023-05-31.csv"
]

mentionedPaths = [
    "./Datasets/Mentioned/CocaCola_mentioned_from_2023-05-01_to_2023-05-31.csv",
    "./Datasets/Mentioned/Fanta_mentioned_from_2023-05-01_to_2023-05-31.csv",
    "./Datasets/Mentioned/Pepsi_mentioned_from_2023-05-01_to_2023-05-31.csv",
    "./Datasets/Mentioned/PrimeHydrate_mentioned_from_2023-05-01_to_2023-05-31.csv",
    "./Datasets/Mentioned/Sprite_mentioned_from_2023-05-01_to_2023-05-31.csv"
]

# Collected manually from Social Blade as of 30 June
totalFollowersList = [
    3371668, # cola
    156346, # fanta
    3085508, # pepsi
    391154, # prime hydrate
    299037 # sprite
]

for i in range(5):
    getMetrics(predPaths[i], tweetPaths[i], mentionedPaths[i], totalFollowersList[i])


----------------------------------------
CocaCola
----------------------------------------
Reading from './Datasets/SA_Pred/cc_pred.csv'
Reading from './Datasets/Tweets/CocaCola_tweets_from_2022-06-01_to_2023-05-31.csv'
Reading from './Datasets/Mentioned/CocaCola_mentioned_from_2023-05-01_to_2023-05-31.csv'

Metric 1: Average Positive Sentiment Score
Average Positive Score: 42%

Metric 2: Tweet Impression
Total Tweets: 82
Total Views: 1684887.0
Impression: 20547.4

Metric 3: Number of Tweets per Day
Total Days: 364
Total Tweets: 82
Number of Tweets per Day: 0.23

Metric 4: Average Engagement Rate per Tweet by Followers
Total Likes: 18561
Total Retweets: 3682
Total Tweets: 82
Average Engagement Rate: 0.0% (Poor)

Metric 5: Average Mentioned per Day
Total Days: 30
Total Mentioned: 175
Number of Mentioned per Day: 5.83

----------------------------------------
Fanta
----------------------------------------
Reading from './Datasets/SA_Pred/fanta_pred.csv'
Reading from './Datasets/Tweets/F