In [2]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [100]:
df = pd.read_csv('data/tweets.csv')
df.head(15)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
5,@teachntech00 New iPad Apps For #SpeechTherapy...,,No emotion toward brand or product
6,,,No emotion toward brand or product
7,"#SXSW is just starting, #CTIA is around the co...",Android,Positive emotion
8,Beautifully smart and simple idea RT @madebyma...,iPad or iPhone App,Positive emotion
9,Counting down the days to #sxsw plus strong Ca...,Apple,Positive emotion


In [101]:
#rename columns
df.set_axis(['tweet', 'directed', 'emotion'], axis=1, inplace=True)
df.head()

  df.set_axis(['tweet', 'directed', 'emotion'], axis=1, inplace=True)


Unnamed: 0,tweet,directed,emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [102]:
df['emotion'].value_counts()

No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: emotion, dtype: int64

In [5]:
#TweetEval preprocess function

def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

In [6]:
!pip install transformers



In [7]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [8]:
import torch

In [9]:
print(torch.__version__)

2.0.0


In [10]:
# download label mapping
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

In [11]:
# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL, from_tf=True)

text = "Good night 😊"
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)

All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.


In [12]:
ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = labels[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")

1) positive 0.8466
2) neutral 0.1458
3) negative 0.0076


In [13]:
def clean_roberta(df):
    clean_tweets = []
    for tweet in df['tweet']:
        words = str(tweet).split()
        new_text = []
        for t in words:
            t = '@user' if t.startswith('@') and len(t) > 1 else t
            t = '@user' if t.startswith('.@') and len(t) > 1 else t
            t = 'http' if t.startswith('http') else t
            new_text.append(t)
        new_t = " ".join(new_text)
        clean_tweets.append(new_t)
    df['clean_tweet'] = clean_tweets
    return df

In [14]:
clean_roberta = clean_roberta(df)

In [15]:
clean_roberta.head()

Unnamed: 0,tweet,directed,emotion,clean_tweet
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,@user I have a 3G iPhone. After 3 hrs tweeting...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,@user Know about @user ? Awesome iPad/iPhone a...
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,@user Can not wait for #iPad 2 also. They shou...
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,@user I hope this year's festival isn't as cra...
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,@user great stuff on Fri #SXSW: Marissa Mayer ...


In [16]:
#encoded_input_df = clean_roberta['clean_tweet'].apply(tokenizer.tokenize)

In [17]:
#encoded_input_df.head()

In [18]:
#output = model(**encoded_input_df)
#scores = output[0][0].detach().numpy()
#scores = softmax(scores)

In [19]:
test = clean_roberta['clean_tweet'][5]
test

'@user New iPad Apps For #SpeechTherapy And Communication Are Showcased At The #SXSW Conference http #iear #edchat #asd'

In [20]:
#testing model

encoded_input_test = tokenizer(test, return_tensors='pt')
encoded_input_test

{'input_ids': tensor([[    0,  1039, 12105,   188,  9481, 29324,   286,   849, 29235,  7529,
         43958, 32748,   178, 17051,  3945,  2907,   438, 11835,   497,    20,
           849,   104,  1000, 11871,  2815,  2054,   849,   118,  4352,   849,
           196, 29465,   849,   281,   417,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [21]:
#testing model
output_test = model(**encoded_input_test)
output_test

SequenceClassifierOutput(loss=None, logits=tensor([[-3.3895,  2.1188,  1.3288]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [22]:
#testing model
scores_test = output_test[0][0].detach().numpy()
scores_test = softmax(scores_test)
scores_test

array([0.00277996, 0.68592894, 0.31129107], dtype=float32)

In [23]:
#testing model
ranking_test = np.argsort(scores_test)
ranking_test = ranking_test[::-1]
for i in range(scores_test.shape[0]):
    l = labels[ranking_test[i]]
    s = scores_test[ranking_test[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")

1) neutral 0.6859
2) positive 0.3113
3) negative 0.0028


In [24]:
test2 = clean_roberta['clean_tweet'][0]
encoded_input_test2 = tokenizer(test2, return_tensors='pt')
output_test2 = model(**encoded_input_test2)
scores_test2 = output_test2[0][0].detach().numpy()
scores_test2 = softmax(scores_test2)
ranking_test2 = np.argsort(scores_test2)
ranking_test2 = ranking_test2[::-1]
for i in range(scores_test2.shape[0]):
    l = labels[ranking_test2[i]]
    s = scores_test2[ranking_test2[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")
print(test2)

1) negative 0.4247
2) neutral 0.397
3) positive 0.1783
@user I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.


In [25]:
test3 = clean_roberta['clean_tweet'][5]
encoded_input_test3 = tokenizer(test3, return_tensors='pt')
output_test3 = model(**encoded_input_test3)
scores_test3 = output_test3[0][0].detach().numpy()
scores_test3 = softmax(scores_test3)
ranking_test3 = np.argsort(scores_test3)
ranking_test3 = ranking_test3[::-1]
for i in range(scores_test3.shape[0]):
    l = labels[ranking_test3[i]]
    s = scores_test3[ranking_test3[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")
print(test3)

1) neutral 0.6859
2) positive 0.3113
3) negative 0.0028
@user New iPad Apps For #SpeechTherapy And Communication Are Showcased At The #SXSW Conference http #iear #edchat #asd


In [26]:
scores_test3 = softmax(scores_test3)
score_list = []
neg = scores_test3[0]
score_list.append(neg)
neut = scores_test3[1]
score_list.append(neut)
pos = scores_test3[2]
score_list.append(pos)

for score in score_list:
    if max(score_list) == neg:
        score = 0
    elif max(score_list) == neut:
        score = 1
    elif max(score_list) == pos:
        score = 2
print(test3)
print(score)


@user New iPad Apps For #SpeechTherapy And Communication Are Showcased At The #SXSW Conference http #iear #edchat #asd
1


In [27]:
score_list3 = []
score_list3.append(scores_test3[0])
score_list3.append(scores_test3[1])
score_list3.append(scores_test3[2])

print(score_list3)

def find_max_score_index(scores):
    max_score = max(scores)
    max_index = scores.index(max_score)
    if max_index == 0:
        return 0
    elif max_index == 1:
        return 1
    elif max_index == 2:
        return 2
    else:
        return "Error: Maximum score not found in first three indices"
        
find_max_score_index(score_list3)

[0.23033516, 0.45608738, 0.3135775]


1

In [28]:
scores_test2 = softmax(scores_test2)
score_list2 = []
score_list2.append(scores_test2[0])
score_list2.append(scores_test2[1])
score_list2.append(scores_test2[2])

print(score_list2)

def find_max_score_index(scores):
    sentiment_score = []
    max_score = max(scores)
    max_index = scores.index(max_score)
    if max_index == 0:
        sentiment_score.append(0)
    elif max_index == 1:
        sentiment_score.append(1)
    elif max_index == 2:
        sentiment_score.append(2)
    else:
        sentiment_score.append(5)
    return sentiment_score
        
find_max_score_index(score_list2)

[0.3630661, 0.35315055, 0.28378338]


[0]

In [29]:
clean_roberta.head()

Unnamed: 0,tweet,directed,emotion,clean_tweet
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,@user I have a 3G iPhone. After 3 hrs tweeting...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,@user Know about @user ? Awesome iPad/iPhone a...
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,@user Can not wait for #iPad 2 also. They shou...
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,@user I hope this year's festival isn't as cra...
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,@user great stuff on Fri #SXSW: Marissa Mayer ...


In [30]:
def model_roberta(df):
    sentiment = []
    
    #encode and run through model in clean_tweets
    for tweet in df['tweet']:
        encoded_input = tokenizer(str(tweet), return_tensors='pt')
        output = model(**encoded_input)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)

        score_list = []
        score_list.append(scores[0])
        score_list.append(scores[1])
        score_list.append(scores[2])

        def find_max_score_index(scores):
            max_score = max(scores)
            max_index = scores.index(max_score)
            if max_index == 0:
                return 0
            elif max_index == 1:
                return 1
            elif max_index == 2:
                return 2
            else:
                return 5
            
        sentiment.append(find_max_score_index(score_list))
        
    clean_roberta['score'] = sentiment

    return df

In [31]:
model_roberta(clean_roberta)

Unnamed: 0,tweet,directed,emotion,clean_tweet,score
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,@user I have a 3G iPhone. After 3 hrs tweeting...,0
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,@user Know about @user ? Awesome iPad/iPhone a...,2
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,@user Can not wait for #iPad 2 also. They shou...,2
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,@user I hope this year's festival isn't as cra...,1
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,@user great stuff on Fri #SXSW: Marissa Mayer ...,2
...,...,...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion,Ipad everywhere. #SXSW {link},1
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product,"Wave, buzz... RT @user We interrupt your regul...",1
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product,"Google's Zeiger, a physician never reported po...",1
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product,Some Verizon iPhone customers complained their...,0


In [32]:
clean_roberta['score'].value_counts()

2    4989
1    3423
0     681
Name: score, dtype: int64

# Taking roBERTa a step further

We'd like to look at how roberta categorizes tweets when we apply more preprocessing

In [33]:
test5 = clean_roberta['clean_tweet'][10]
test5

'Excited to meet the @user at #sxsw so I can show them my Sprint Galaxy S still running Android 2.1. #fail'

In [41]:
 def get_wordnet_pos(clean_tweet):
        clean_tweet = pos_tag(str(clean_tweet).split())

        new_tags = []
        for word, tag in clean_tweet:
            if tag.startswith('J'):
                tag = wordnet.ADJ
                new_tags.append([word, tag])
            elif tag.startswith('V'):
                tag = wordnet.VERB
                new_tags.append([word, tag])
            elif tag.startswith('N'):
                tag = wordnet.NOUN
                new_tags.append([word, tag])
            elif tag.startswith('R'):
                tag = wordnet.ADV
                new_tags.append([word, tag])
            else:
                tag = wordnet.NOUN
                new_tags.append([word, tag])


        return new_tags

In [75]:
#imports for preprocessing

import nltk
import string
nltk.download('words')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet

#getting pos

clean_roberta['clean_preprocess'] = clean_roberta['clean_tweet'].apply(get_wordnet_pos)

[nltk_data] Downloading package words to
[nltk_data]     /Users/leahschell/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [76]:
clean_roberta.head()

Unnamed: 0,tweet,directed,emotion,clean_tweet,score,clean_preprocess,final_preprocess
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,@user I have a 3G iPhone. After 3 hrs tweeting...,0,"[[@user, n], [I, n], [have, v], [a, n], [3G, n...","[ [ ' @ u s e r ' , ' n ' ] , [ ' I ' , ..."
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,@user Know about @user ? Awesome iPad/iPhone a...,2,"[[@user, r], [Know, n], [about, n], [@user, n]...","[ [ ' @ u s e r ' , ' r ' ] , [ ' K n o w ..."
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,@user Can not wait for #iPad 2 also. They shou...,2,"[[@user, n], [Can, n], [not, r], [wait, v], [f...","[ [ ' @ u s e r ' , ' n ' ] , [ ' C a n ' ..."
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,@user I hope this year's festival isn't as cra...,1,"[[@user, n], [I, n], [hope, v], [this, n], [ye...","[ [ ' @ u s e r ' , ' n ' ] , [ ' I ' , ..."
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,@user great stuff on Fri #SXSW: Marissa Mayer ...,2,"[[@user, r], [great, a], [stuff, n], [on, n], ...","[ [ ' @ u s e r ' , ' r ' ] , [ ' g r e a ..."


In [80]:
# applying our own preprocessing function based on research about roBERTa

def preprocess(clean_tweet):
    
    lem = WordNetLemmatizer()
    clean_tweet_lemmed = [lem.lemmatize(pair[0], pair[1]) for pair in clean_tweet]
    
    
    preprocess_tweet = [x for x in clean_tweet_lemmed if x not in removewords]
    preprocess_tweet = " ".join(preprocess_tweet)
    
    return preprocess_tweet

In [65]:
test6 = clean_roberta['clean_preprocess'][5]
test6

[['@user', 'r'],
 ['New', 'n'],
 ['iPad', 'n'],
 ['Apps', 'n'],
 ['For', 'n'],
 ['#SpeechTherapy', 'n'],
 ['And', 'n'],
 ['Communication', 'n'],
 ['Are', 'n'],
 ['Showcased', 'v'],
 ['At', 'n'],
 ['The', 'n'],
 ['#SXSW', 'n'],
 ['Conference', 'n'],
 ['http', 'v'],
 ['#iear', 'a'],
 ['#edchat', 'n'],
 ['#asd', 'n']]

In [67]:
lem_test = WordNetLemmatizer()
    
clean_tweet_lemmed_test = [lem_test.lemmatize(pair[0], pair[1]) for pair in test6] 
clean_tweet_lemmed_test

['@user',
 'New',
 'iPad',
 'Apps',
 'For',
 '#SpeechTherapy',
 'And',
 'Communication',
 'Are',
 'Showcased',
 'At',
 'The',
 '#SXSW',
 'Conference',
 'http',
 '#iear',
 '#edchat',
 '#asd']

In [68]:
#removing some words and preprocessing
removewords = ['ып', 'ыќ']

preprocess_tweet_test = [x for x in clean_tweet_lemmed_test if x not in removewords]
preprocess_tweet_test = " ".join(preprocess_tweet_test)
preprocess_tweet_test

'@user New iPad Apps For #SpeechTherapy And Communication Are Showcased At The #SXSW Conference http #iear #edchat #asd'

In [82]:
#removing some words and preprocessing
removewords = ['ып', 'ыќ']

clean_roberta['final_preprocess'] = clean_roberta['clean_preprocess'].apply(preprocess)
clean_roberta.head()

Unnamed: 0,tweet,directed,emotion,clean_tweet,score,clean_preprocess,final_preprocess
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,@user I have a 3G iPhone. After 3 hrs tweeting...,0,"[[@user, n], [I, n], [have, v], [a, n], [3G, n...",@user I have a 3G iPhone. After 3 hr tweeting ...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,@user Know about @user ? Awesome iPad/iPhone a...,2,"[[@user, r], [Know, n], [about, n], [@user, n]...",@user Know about @user ? Awesome iPad/iPhone a...
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,@user Can not wait for #iPad 2 also. They shou...,2,"[[@user, n], [Can, n], [not, r], [wait, v], [f...",@user Can not wait for #iPad 2 also. They shou...
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,@user I hope this year's festival isn't as cra...,1,"[[@user, n], [I, n], [hope, v], [this, n], [ye...",@user I hope this year's festival isn't as cra...
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,@user great stuff on Fri #SXSW: Marissa Mayer ...,2,"[[@user, r], [great, a], [stuff, n], [on, n], ...",@user great stuff on Fri #SXSW: Marissa Mayer ...


In [84]:
clean_roberta = clean_roberta.drop(['clean_preprocess'], axis = 1)
clean_roberta.head()

Unnamed: 0,tweet,directed,emotion,clean_tweet,score,final_preprocess
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,@user I have a 3G iPhone. After 3 hrs tweeting...,0,@user I have a 3G iPhone. After 3 hr tweeting ...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,@user Know about @user ? Awesome iPad/iPhone a...,2,@user Know about @user ? Awesome iPad/iPhone a...
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,@user Can not wait for #iPad 2 also. They shou...,2,@user Can not wait for #iPad 2 also. They shou...
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,@user I hope this year's festival isn't as cra...,1,@user I hope this year's festival isn't as cra...
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,@user great stuff on Fri #SXSW: Marissa Mayer ...,2,@user great stuff on Fri #SXSW: Marissa Mayer ...


In [89]:
clean_roberta.set_axis(['original_tweet', 
                               'directed', 
                               'original_score', 
                               'roberta_preprocess_tweet', 
                               'roberta_score', 
                               'final_preprocess'], axis=1, inplace=True)
clean_roberta.head()

  clean_roberta.set_axis(['original_tweet',


Unnamed: 0,original_tweet,directed,original_score,roberta_preprocess_tweet,roberta_score,final_preprocess
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,@user I have a 3G iPhone. After 3 hrs tweeting...,0,@user I have a 3G iPhone. After 3 hr tweeting ...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,@user Know about @user ? Awesome iPad/iPhone a...,2,@user Know about @user ? Awesome iPad/iPhone a...
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,@user Can not wait for #iPad 2 also. They shou...,2,@user Can not wait for #iPad 2 also. They shou...
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,@user I hope this year's festival isn't as cra...,1,@user I hope this year's festival isn't as cra...
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,@user great stuff on Fri #SXSW: Marissa Mayer ...,2,@user great stuff on Fri #SXSW: Marissa Mayer ...


In [90]:
#Drop one missing tweet
clean_roberta.dropna(subset='tweet', axis=0, inplace=True)

In [93]:
#Drop non-apple values in Directed to column
#Come back to this to make cleaner later??

clean_roberta = clean_roberta[(clean_roberta['directed'] != 'Android App') & (clean_roberta['directed'] != 'Android') & (clean_roberta['directed'] != 'Google') & (clean_roberta['directed'] != 'Other Google product or service')]

In [95]:
emo_dict = {'No emotion toward brand or product': 1, 'Positive emotion': 2, 'Negative emotion': 0}
clean_roberta['original_score'] = clean_roberta['original_score'].map(emo_dict)

clean_roberta.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_roberta['original_score'] = clean_roberta['original_score'].map(emo_dict)


Unnamed: 0,original_tweet,directed,original_score,roberta_preprocess_tweet,roberta_score,final_preprocess
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,0.0,@user I have a 3G iPhone. After 3 hrs tweeting...,0,@user I have a 3G iPhone. After 3 hr tweeting ...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,2.0,@user Know about @user ? Awesome iPad/iPhone a...,2,@user Know about @user ? Awesome iPad/iPhone a...
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,2.0,@user Can not wait for #iPad 2 also. They shou...,2,@user Can not wait for #iPad 2 also. They shou...
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,0.0,@user I hope this year's festival isn't as cra...,1,@user I hope this year's festival isn't as cra...
5,@teachntech00 New iPad Apps For #SpeechTherapy...,,1.0,@user New iPad Apps For #SpeechTherapy And Com...,1,@user New iPad Apps For #SpeechTherapy And Com...


In [98]:
clean_roberta.shape

(8211, 6)

In [103]:
clean_roberta['original_score'].info()

<class 'pandas.core.series.Series'>
Int64Index: 8211 entries, 0 to 9092
Series name: original_score
Non-Null Count  Dtype  
--------------  -----  
8057 non-null   float64
dtypes: float64(1)
memory usage: 128.3 KB


In [105]:
clean_roberta['original_tweet'].str.contains('japan').any()

True

In [107]:
clean_roberta['original_tweet'].str.contains('japan').sum()

22

In [108]:
clean_roberta = clean_roberta.drop(clean_roberta[clean_roberta.apply(lambda row: row.astype(str).str.contains('japan').any(), axis=1)].index)
clean_roberta.shape

(8189, 6)

In [109]:
def model_roberta(df):
    sentiment = []
    
    #encode and run through model in clean_tweets
    for tweet in df['final_preprocess']:
        encoded_input = tokenizer(str(tweet), return_tensors='pt')
        output = model(**encoded_input)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)

        score_list = []
        score_list.append(scores[0])
        score_list.append(scores[1])
        score_list.append(scores[2])

        def find_max_score_index(scores):
            max_score = max(scores)
            max_index = scores.index(max_score)
            if max_index == 0:
                return 0
            elif max_index == 1:
                return 1
            elif max_index == 2:
                return 2
            else:
                return 5
            
        sentiment.append(find_max_score_index(score_list))
        
    clean_roberta['final_roberta_score'] = sentiment

    return df

In [110]:
model_roberta(clean_roberta)

Unnamed: 0,original_tweet,directed,original_score,roberta_preprocess_tweet,roberta_score,final_preprocess,final_roberta_score
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,0.0,@user I have a 3G iPhone. After 3 hrs tweeting...,0,@user I have a 3G iPhone. After 3 hr tweeting ...,1
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,2.0,@user Know about @user ? Awesome iPad/iPhone a...,2,@user Know about @user ? Awesome iPad/iPhone a...,2
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,2.0,@user Can not wait for #iPad 2 also. They shou...,2,@user Can not wait for #iPad 2 also. They shou...,2
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,0.0,@user I hope this year's festival isn't as cra...,1,@user I hope this year's festival isn't as cra...,1
5,@teachntech00 New iPad Apps For #SpeechTherapy...,,1.0,@user New iPad Apps For #SpeechTherapy And Com...,1,@user New iPad Apps For #SpeechTherapy And Com...,1
...,...,...,...,...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,2.0,Ipad everywhere. #SXSW {link},1,Ipad everywhere. #SXSW {link},1
9089,"Wave, buzz... RT @mention We interrupt your re...",,1.0,"Wave, buzz... RT @user We interrupt your regul...",1,"Wave, buzz... RT @user We interrupt your regul...",2
9090,"Google's Zeiger, a physician never reported po...",,1.0,"Google's Zeiger, a physician never reported po...",1,"Google's Zeiger, a physician never report pote...",1
9091,Some Verizon iPhone customers complained their...,,1.0,Some Verizon iPhone customers complained their...,0,Some Verizon iPhone customer complain their ti...,1


In [111]:
#clean_roberta.to_csv('data/roberta.csv')

In [114]:
clean_roberta['original_score'].value_counts()

1.0    5348
2.0    2248
0.0     439
Name: original_score, dtype: int64

In [115]:
clean_roberta['roberta_score'].value_counts()

2    4358
1    3233
0     598
Name: roberta_score, dtype: int64

In [116]:
clean_roberta['final_roberta_score'].value_counts()

2    4199
1    3483
0     507
Name: final_roberta_score, dtype: int64