In [26]:
import pandas as pd
import torch
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax

In [5]:
df_vaccine = pd.read_csv('data/Ads_vaccine_sample.csv').drop(['Unnamed: 0', 'ad_creative_link_captions','ad_creative_link_descriptions'],axis=1)
df_vaccine.head()

Unnamed: 0,ad_creation_time,ad_creative_bodies,ad_creative_link_titles,ad_delivery_start_time,ad_delivery_stop_time,delivery_by_region,demographic_distribution,estimated_audience_size,impressions,spend,id
0,2021-04-16,When it comes to medical decisions it should b...,['Wisconsin DHS pauses Johnson & Johnson COVID...,2021-04-16,2021-04-18,"[{'percentage': '0.201422', 'region': 'Minneso...","[{'percentage': '0.001183', 'age': '65+', 'gen...","{'lower_bound': '100001', 'upper_bound': '5000...","{'lower_bound': '1000', 'upper_bound': '1999'}","{'lower_bound': '0', 'upper_bound': '99'}",242120184266932
1,2021-12-03,It’s all about issues we should be paying more...,"['Altruisa', ' ', ' ', ' ', ' ', ' ', ' ', ' ']",2021-12-04,2021-12-08,"[{'percentage': '0.45181', 'region': 'Californ...","[{'percentage': '0.003579', 'age': '25-34', 'g...",{'lower_bound': '1000001'},"{'lower_bound': '2000', 'upper_bound': '2999'}","{'lower_bound': '0', 'upper_bound': '99'}",611908426810835
2,2021-06-11,"I will let you decide for yourself, but take a...",['UW Health - Vaccine PSA'],2021-06-11,2021-06-16,"[{'percentage': '1', 'region': 'Wisconsin'}]","[{'percentage': '0.000145', 'age': '18-24', 'g...",{'lower_bound': '1000001'},"{'lower_bound': '10000', 'upper_bound': '14999'}","{'lower_bound': '0', 'upper_bound': '99'}",899414300908039
3,2021-09-30,Small businesses are the lifeblood of the comm...,['VOTE to support small business in La Mesa'],2021-10-01,2021-10-05,"[{'percentage': '1', 'region': 'California'}]","[{'percentage': '0.004132', 'age': '35-44', 'g...","{'lower_bound': '10001', 'upper_bound': '50000'}","{'lower_bound': '0', 'upper_bound': '999'}","{'lower_bound': '0', 'upper_bound': '99'}",896239077981131
4,2021-06-07,जय श्री राम,,2021-06-07,2021-06-07,,,{'lower_bound': '1000001'},"{'lower_bound': '0', 'upper_bound': '999'}","{'lower_bound': '0', 'upper_bound': '99'}",156814823130922


## Initializing Model

In [6]:
# load model and tokenizer
roberta = "cardiffnlp/twitter-roberta-base-sentiment"

model = AutoModelForSequenceClassification.from_pretrained(roberta)
tokenizer = AutoTokenizer.from_pretrained(roberta)

labels = ['Negative', 'Neutral', 'Positive']

In [7]:
def get_sentiment(tweet_proc):
    # sentiment analysis
    encoded_tweet = tokenizer(tweet_proc, return_tensors='pt')
    # output = model(encoded_tweet['input_ids'], encoded_tweet['attention_mask'])
    output = model(**encoded_tweet)

    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    sentiment = labels[np.argmax(scores)]
    
    return sentiment


In [8]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;\\n]"')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')

def preprocess(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = re.sub(r'www.+','', text)
    text = re.sub(r'https.+','', text)
    text = re.sub(r'#',' #',text)
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    return text 

In [10]:
df_vaccine['ad_creative_bodies'] = df_vaccine['ad_creative_bodies'].astype(str).apply(preprocess)

In [11]:
df_vaccine['sentiment'] = df_vaccine['ad_creative_bodies'].apply(lambda x: get_sentiment(x[:512]))

In [12]:
df_vaccine['sentiment'].value_counts()

Neutral     4822
Positive    1768
Negative    1657
Name: sentiment, dtype: int64

In [24]:
df_vaccine.head()

Unnamed: 0,ad_creation_time,ad_creative_bodies,ad_creative_link_titles,ad_delivery_start_time,ad_delivery_stop_time,delivery_by_region,demographic_distribution,estimated_audience_size,impressions,spend,id,sentiment
0,2021-04-16,when it comes to medical decisions it should b...,['Wisconsin DHS pauses Johnson & Johnson COVID...,2021-04-16,2021-04-18,"[{'percentage': '0.201422', 'region': 'Minneso...","[{'percentage': '0.001183', 'age': '65+', 'gen...","{'lower_bound': '100001', 'upper_bound': '5000...","{'lower_bound': '1000', 'upper_bound': '1999'}","{'lower_bound': '0', 'upper_bound': '99'}",242120184266932,Negative
1,2021-12-03,its all about issues we should be paying more ...,"['Altruisa', ' ', ' ', ' ', ' ', ' ', ' ', ' ']",2021-12-04,2021-12-08,"[{'percentage': '0.45181', 'region': 'Californ...","[{'percentage': '0.003579', 'age': '25-34', 'g...",{'lower_bound': '1000001'},"{'lower_bound': '2000', 'upper_bound': '2999'}","{'lower_bound': '0', 'upper_bound': '99'}",611908426810835,Neutral
2,2021-06-11,i will let you decide for yourself but take a ...,['UW Health - Vaccine PSA'],2021-06-11,2021-06-16,"[{'percentage': '1', 'region': 'Wisconsin'}]","[{'percentage': '0.000145', 'age': '18-24', 'g...",{'lower_bound': '1000001'},"{'lower_bound': '10000', 'upper_bound': '14999'}","{'lower_bound': '0', 'upper_bound': '99'}",899414300908039,Neutral
3,2021-09-30,small businesses are the lifeblood of the comm...,['VOTE to support small business in La Mesa'],2021-10-01,2021-10-05,"[{'percentage': '1', 'region': 'California'}]","[{'percentage': '0.004132', 'age': '35-44', 'g...","{'lower_bound': '10001', 'upper_bound': '50000'}","{'lower_bound': '0', 'upper_bound': '999'}","{'lower_bound': '0', 'upper_bound': '99'}",896239077981131,Positive
4,2021-06-07,,,2021-06-07,2021-06-07,,,{'lower_bound': '1000001'},"{'lower_bound': '0', 'upper_bound': '999'}","{'lower_bound': '0', 'upper_bound': '99'}",156814823130922,Neutral
