# BERT sentiment

In [1]:
import pandas as pd
from sklearn.metrics import balanced_accuracy_score
import numpy as np

In [2]:
from transformers import BertForSequenceClassification, BertTokenizer

In [52]:
def run_bert_sentiment(list_of_contents):
  """Label each text as 'negative', 'neutral' or 'positive' with a BERT classifier.

  Loads the 'nlptown/bert-base-multilingual-uncased-sentiment' checkpoint and
  its tokenizer, then classifies the texts one at a time. Inputs longer than
  512 tokens are truncated by the tokenizer. The model's five output classes
  are collapsed to three labels: classes 0-1 -> 'negative', 2 -> 'neutral',
  3-4 -> 'positive'.

  Args:
    list_of_contents: iterable of text strings (e.g. a list or a pandas Series).

  Returns:
    A list with one label string per input text, in input order.
  """
  import torch  # local import: the notebook's import cells do not bring torch into scope

  # Model classes in order: ['1 star', '2 stars', '3 stars', '4 stars', '5 stars']
  sentiment_lab=['negative', 'negative', 'neutral', 'positive', 'positive']

  # Materialise the input so iteration is positional and never depends on
  # pandas index labels (a Series with a non-default index would break `[i]`).
  texts=list(list_of_contents)

  model = BertForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
  tokenizer = BertTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

  sentiment_list=[]
  # Inference only: skip autograd bookkeeping to save memory and time.
  with torch.no_grad():
    for i, text in enumerate(texts):
      if i%200==0:
        print(i)  # coarse progress indicator
      input_ids = tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True)
      output=model(input_ids)
      predictions = output.logits.argmax(dim=1)
      sentiment_list.append(sentiment_lab[predictions.item()])
  return sentiment_list

# RoBERTa

In [46]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [47]:
def run_roberta_sentiment(list_of_contents):
  """Label each text as 'negative' or 'positive' with a RoBERTa classifier.

  Loads the 'textattack/roberta-base-SST-2' checkpoint and its tokenizer, then
  classifies the texts one at a time. Inputs longer than 512 tokens are
  truncated by the tokenizer. The model has two output classes, so this
  function never returns 'neutral'.

  Args:
    list_of_contents: iterable of text strings (e.g. a list or a pandas Series).

  Returns:
    A list with one label string per input text, in input order.
  """
  import torch  # local import: the notebook's import cells do not bring torch into scope

  # Class index -> label.
  sentiment_lab=['negative', 'positive']

  # Materialise the input so iteration is positional and never depends on
  # pandas index labels (a Series with a non-default index would break `[i]`).
  texts=list(list_of_contents)

  model_name="textattack/roberta-base-SST-2"
  model = AutoModelForSequenceClassification.from_pretrained(model_name)
  tokenizer = AutoTokenizer.from_pretrained(model_name)

  sentiment_list=[]
  # Inference only: skip autograd bookkeeping to save memory and time.
  with torch.no_grad():
    for i, text in enumerate(texts):
      if i%200==0:
        print(i)  # coarse progress indicator
      input_ids = tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True)
      output=model(input_ids)
      # One text per forward pass, so the class axis is dim 1.
      predictions = output.logits.argmax(dim=1).item()
      sentiment_list.append(sentiment_lab[predictions])
  return sentiment_list

### Twitter sentiment dataset

In [32]:
# Load the tweet-sentiment-extraction test split from CSV into a DataFrame.
test_tse_data=pd.read_csv("/source_repository/other_datasets_for_tests/tweet-sentiment-extraction/test.csv")

In [33]:
# Create the prediction column filled with empty strings; a later cell overwrites it with the BERT labels.
test_tse_data["predicted_sentiment_bert"]=''

In [34]:
# Tweet texts as a plain Python list; this is the input passed to both classifiers.
tse_texts_list=list(test_tse_data['text'])

In [60]:
# Label every tweet with the BERT classifier (prints a progress counter every 200 texts).
tse_sentiment_list=run_bert_sentiment(tse_texts_list)

0
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
3400


In [61]:
# Store the BERT predictions next to the gold 'sentiment' column, then display the frame.
test_tse_data['predicted_sentiment_bert']=tse_sentiment_list
test_tse_data

Unnamed: 0,textID,text,sentiment,predicted_sentiment_bert
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,positive
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,negative
3,01082688c6,happy bday!,positive,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,positive
...,...,...,...,...
3529,e5f0e6ef4b,"its at 3 am, im very tired but i can`t sleep ...",negative,neutral
3530,416863ce47,All alone in this old house again. Thanks for...,positive,positive
3531,6332da480c,I know what you mean. My little dog is sinkin...,negative,negative
3532,df1baec676,_sutra what is your next youtube video gonna b...,positive,positive


In [62]:
# Plain accuracy of BERT on the tweet data: share of rows where the prediction equals the gold label.
accuracy=(test_tse_data['sentiment']==test_tse_data['predicted_sentiment_bert']).sum()/len(test_tse_data)

In [63]:
# Balanced accuracy of the BERT predictions against the gold labels (first argument is y_true, second y_pred).
ba_tse=balanced_accuracy_score(test_tse_data['sentiment'], test_tse_data['predicted_sentiment_bert'])

In [64]:
# Show (accuracy, balanced accuracy) for BERT on the tweet data.
accuracy, ba_tse

(0.5325410299943407, 0.5755031610879299)

### RoBERTa

In [48]:
# Label every tweet with the RoBERTa classifier, which only emits 'negative' or 'positive'.
tse_sentiment_list_roberta=run_roberta_sentiment(tse_texts_list)

config.json:   0%|          | 0.00/525 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

0
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
3400


In [57]:
# Plain accuracy of RoBERTa on the tweet data: share of rows where the prediction equals the gold label.
accuracy_roberta=(test_tse_data['sentiment']==tse_sentiment_list_roberta).sum()/len(test_tse_data)

In [58]:
# Balanced accuracy of the RoBERTa predictions against the gold labels (first argument is y_true, second y_pred).
ba_tse_roberta=balanced_accuracy_score(test_tse_data['sentiment'], tse_sentiment_list_roberta)

In [59]:
# Show (accuracy, balanced accuracy) for RoBERTa on the tweet data.
accuracy_roberta, ba_tse_roberta

(0.5181097906055461, 0.580623063850625)

### News sentiment dataset

In [4]:
# Load the news sentiment dataset from CSV into a DataFrame and display it.
test_ns_data2=pd.read_csv("/source_repository/other_datasets_for_tests/news_sentiment_dataset/Sentiment_dataset.csv")
test_ns_data2

Unnamed: 0,news_title,reddit_title,sentiment,text,url
0,Mark Cuban launches generic drug company,Billionaire Mark Cuban just launched a drug co...,1.0,Billionaire investor and Shark Tank star Mark ...,https://www.beckershospitalreview.com/pharmacy...
1,From Defendant to Defender: One Wrongfully Con...,"Man falsely imprisoned for 10 years, uses pris...",1.0,Attorney Jarrett Adams recently helped overtur...,https://www.nbcnews.com/news/us-news/defendant...
2,"Amazon Tribe Wins Lawsuit Against Big Oil, Sav...",Amazon tribe wins legal battle against oil com...,1.0,The Amazon Rainforest is well known across the...,https://www.disclose.tv/amazon-tribe-wins-laws...
3,Newark police: No officer fired a single shot ...,Newark police: No officer fired a single shot ...,1.0,Newark police: No officer fired a single shot ...,https://newjersey.news12.com/newark-police-no-...
4,Ingen barn døde i trafikken i 2019,No children died in traffic accidents in Norwa...,1.0,I 1970 døde det 560 mennesker i den norske tra...,https://www.nrk.no/trondelag/ingen-barn-dode-i...
...,...,...,...,...,...
843,Dee Why attack: Man allegedly choked and threa...,Dee Why attack: Man allegedly choked and threa...,0.0,Frightening details have emerged about a toile...,https://www.9news.com.au/2018/11/30/17/55/sydn...
844,Africa: Children and HIV/Aids - 'We Need to Ta...,Africa: Children and HIV/Aids - 'We Need to Ta...,0.0,"interview\n\nJohannesburg — 360,000 adolescent...",https://allafrica.com/stories/201811300567.html
845,Terrorism suspected in Eilat attack,Terrorism suspected in Eilat attack,0.0,A violent attack in the southern Israeli port ...,http://www.israelnationalnews.com/News/News.as...
846,Anti-Semitism never disappeared in Europe. It'...,Anti-Semitism never disappeared in Europe. It'...,0.0,"It's a 17-year-old boy, too frightened to wear...",https://edition.cnn.com/2018/11/27/europe/anti...


In [5]:
def map_values(x):
  """Translate a numeric sentiment code into a text label.

  1 -> 'positive', 0 -> 'negative', any other value -> 'neutral'.

  A value that already is one of the three text labels is returned unchanged,
  so applying the mapping to an already-converted column (for example when the
  cell is run a second time) does not collapse every row to 'neutral'.

  Args:
    x: sentiment code (number) or an already-mapped label string.

  Returns:
    One of 'positive', 'negative' or 'neutral'.
  """
  # Pass through values that were already mapped; keeps the conversion idempotent.
  if isinstance(x, str) and x in ('positive', 'negative', 'neutral'):
    return x
  if x==1:
    return 'positive'
  elif x==0:
    return 'negative'
  else:
    return 'neutral'

# Replace the 'sentiment' column with the text labels produced by map_values; the original values are overwritten.
test_ns_data2['sentiment']=test_ns_data2['sentiment'].apply(map_values)

In [6]:
def cut_to_bert(string_list, max_words=300):
  """Shorten every string to at most `max_words` whitespace-separated words.

  Each string is split on whitespace, the first `max_words` pieces are kept and
  joined back with single spaces, so runs of whitespace are normalised as well.
  Note that the limit counts words, not model tokens.

  Args:
    string_list: iterable of strings. It is traversed exactly once, so one-shot
      iterables such as generators are handled correctly.
    max_words: maximum number of words kept per string.

  Returns:
    A new list with the shortened strings, in input order.
  """
  return [' '.join(string.split()[:max_words]) for string in string_list]

In [7]:
# Add a prediction column filled with empty strings.
test_ns_data2["predicted_sentiment_bert"]=''

In [8]:
# Article texts as a plain Python list (matching how the tweet texts are prepared),
# so downstream code never depends on the DataFrame's index labels.
ns_texts_list=list(test_ns_data2['text'])
# Label every article with the BERT classifier.
ns_sentiment_list=run_bert_sentiment(ns_texts_list)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

0


Token indices sequence length is longer than the specified maximum sequence length for this model (1404 > 512). Running this sequence through the model will result in indexing errors


200
400
600
800


In [24]:
# Count the rows whose BERT prediction equals the gold label.
# NOTE: the counter keeps the name `sum` (shadowing the builtin) because the next cell reads it.
sum=0
for gold_label, predicted_label in zip(test_ns_data2['sentiment'], ns_sentiment_list):
  if gold_label==predicted_label:
    sum+=1

In [26]:
# Plain accuracy of BERT on the news data. `sum` is the match counter from the previous cell, not the builtin.
accuracy_ns=sum/test_ns_data2.shape[0]

In [30]:
# Balanced accuracy of the BERT predictions on the news data (first argument is y_true, second y_pred).
ba_ns=balanced_accuracy_score(list(test_ns_data2['sentiment']), ns_sentiment_list)



In [31]:
# Show (accuracy, balanced accuracy) for BERT on the news data.
accuracy_ns, ba_ns

(0.5577830188679245, 0.5284224598930481)

### RoBERTa

In [66]:
# Label every article with the RoBERTa classifier, which only emits 'negative' or 'positive'.
ns_sentiment_list_roberta=run_roberta_sentiment(ns_texts_list)

Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0
200
400
600
800


In [67]:
# Count the rows whose RoBERTa prediction equals the gold label.
sum_roberta=0
for gold_label, predicted_label in zip(test_ns_data2['sentiment'], ns_sentiment_list_roberta):
  if gold_label==predicted_label:
    sum_roberta+=1

In [68]:
# Plain accuracy of RoBERTa on the news data: matches divided by the number of rows.
accuracy_ns_roberta=sum_roberta/test_ns_data2.shape[0]

In [69]:
# Balanced accuracy of the RoBERTa predictions on the news data (first argument is y_true, second y_pred).
ba_ns_roberta=balanced_accuracy_score(list(test_ns_data2['sentiment']), ns_sentiment_list_roberta)

In [70]:
# Show (accuracy, balanced accuracy) for RoBERTa on the news data.
accuracy_ns_roberta, ba_ns_roberta

(0.7900943396226415, 0.7770588235294118)