# Experimenting with various models

# Experimenting using the Support Vector Machine algorithm and Bag of Words representation

I will be using a clean and labeled tweet dataset from Kaggle. It has just over 27k tweets labeled with sentiment.

Before implementing my main roBERTa model for twitter sentiment analysis, it is always good to test other more traditional machine learning methods.

One really good method is Support Vector Machine (SVM). I really like how it extends into higher dimensions to find groupings. And I will be using the simple Bag of Words representation on the 27k tweets.


After that, I will be implement roBERTa with no fine tuning to comapre the difference in accuracy.


In [12]:

# import pandas as pd
# import re


import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import time
import re

df = pd.read_csv("dataset-labled-tweets-original.csv")

print(df.head())

print(df.shape)

       textID                                               text  \
0  cb774db0d1                I`d have responded, if I were going   
1  549e992a42      Sooo SAD I will miss you here in San Diego!!!   
2  088c60f138                          my boss is bullying me...   
3  9642c003ef                     what interview! leave me alone   
4  358bd9e861   Sons of ****, why couldn`t they put them on t...   

                         selected_text sentiment  
0  I`d have responded, if I were going   neutral  
1                             Sooo SAD  negative  
2                          bullying me  negative  
3                       leave me alone  negative  
4                        Sons of ****,  negative  
(27481, 4)


In [6]:
# cleaning tweets for sentiment analysis
import re

def handle_emojis(tweet):
    # Smile -- :), : ), :-), (:, ( :, (-:, :')
    tweet = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMO_POS ', tweet)
    # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
    tweet = re.sub(r'(:\s?D|:-D|x-?D|X-?D)', ' EMO_POS ', tweet)
    # Love -- <3, :*
    tweet = re.sub(r'(<3|:\*)', ' EMO_POS ', tweet)
    # Wink -- ;-), ;), ;-D, ;D, (;,  (-;
    tweet = re.sub(r'(;-?\)|;-?D|\(-?;)|😉', ' EMO_POS ', tweet)
    # Sad -- :-(, : (, :(, ):, )-:
    tweet = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMO_NEG ', tweet)
    # Cry -- :,(, :'(, :"(
    tweet = re.sub(r'(:,\(|:\'\(|:"\()', ' EMO_NEG ', tweet)

    return tweet

def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F" # emoticons
                           u"\U0001F300-\U0001F5FF" # symbols & pictographs
                           u"\U0001F680-\U0001F6FF" # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF" # flags (iOS)
                           u"\U00002500-\U00002BEF"  # chinese char
                           u"\U00002702-\U000027B0"
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           u"\U0001f926-\U0001f937"
                           u"\U00010000-\U0010ffff"
                           u"\u2640-\u2642"
                           u"\u2600-\u2B55"
                           u"\u200d"
                           u"\u23cf"
                           u"\u23e9"
                           u"\u231a"
                           u"\ufe0f"  # dingbats
                           u"\u3030"
                           "]+", flags=re.UNICODE)
    
    return emoji_pattern.sub(r'', string)


import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

def preprocess_word(word):
    # Tokenize the word
    tokens = nltk.word_tokenize(word)
    
    # Convert to lowercase
    tokens = [token.lower() for token in tokens]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stem the tokens
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Join the tokens back into a single string
    preprocessed_word = ' '.join(tokens)
    
    return preprocessed_word



def preprocess_tweet(tweet):
    processed_tweet = []
    # Convert to lower case
    tweet = tweet.lower()
    
    #Clean only digits
    tweet = re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", tweet)
    
    # Replaces URLs with the word URL
    #tweet = re.sub(r'((www\.[\S]+)|(https?://[\S]+))', ' URL ', tweet)
    tweet = re.sub(r'((www\.[\S]+)|(https?://[\S]+))', '', tweet)
    
    # Replace @handle with the word USER_MENTION
    #tweet = re.sub(r'@[\S]+', 'USER_MENTION', tweet)
    tweet = re.sub(r'@[\S]+', '', tweet)
    
    # Replaces #hashtag with hashtag
    #tweet = re.sub(r'#(\S+)', r' \1 ', tweet)
    tweet = re.sub(r'#(\S+)', '', tweet)
    
    # Remove RT (retweet)
    tweet = re.sub(r'\brt\b', '', tweet)
    
    # Replace 2+ dots with space
    tweet = re.sub(r'\.{2,}', ' ', tweet)
    
    # Strip space, " and ' from tweet
    tweet = tweet.strip(' "\'')

    # Replace emojis with either EMO_POS or EMO_NEG
    #tweet = handle_emojis(tweet)
    tweet = remove_emoji(tweet)
   
    # Replace multiple spaces with a single space
    tweet = re.sub(r'\s+', ' ', tweet)

    #my custom chars
    tweet = tweet.replace('₺','')
    tweet = tweet.replace('=','')
    tweet = tweet.replace('’','')
    tweet = tweet.replace('|','')
    tweet = tweet.replace('‘','')
    tweet = tweet.replace('/','')
    tweet = tweet.replace('…','')
    tweet = tweet.replace('–','')
    tweet = tweet.replace('&','')
    tweet = tweet.replace('“','')
    tweet = tweet.replace('”','')
    tweet = tweet.replace('+','')
    tweet = tweet.replace('%','')
    tweet = tweet.replace('@','')
    tweet = tweet.replace('#','')

    words = word_tokenize(tweet) #tweet.split()

    for word in words:
      word = preprocess_word(word)
      #if is_valid_word(word):
      #    processed_tweet.append(word)
      processed_tweet.append(word)

    return ' '.join(processed_tweet)


from nltk import word_tokenize

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

def clean_tweet(tweet):
    tweet = str(tweet)
    handle_emojis(tweet)
    remove_emoji(tweet)
    preprocess_tweet(tweet)
    return str(tweet) 




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jk\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\jk\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [15]:
df["clean_tweet"] = df['text'].apply(clean_tweet) # applies clean_tweet function on each record of 'text' and creates a new column 'clean_tweet'

In [16]:
df.head()

Unnamed: 0,textID,text,selected_text,sentiment,clean_tweet
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"I`d have responded, if I were going"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,Sooo SAD I will miss you here in San Diego!!!
2,088c60f138,my boss is bullying me...,bullying me,negative,my boss is bullying me...
3,9642c003ef,what interview! leave me alone,leave me alone,negative,what interview! leave me alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"Sons of ****, why couldn`t they put them on t..."


In [17]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2)
test.head

<bound method NDFrame.head of            textID                                               text  \
4257   2dc2b2ecaf                        i love you <3  host me lmao   
11686  a3f50ce6ea  My guitar ain`t here yet?, feel like i lost a ...   
6320   b58106c8bd                            This world makes me sad   
22062  b8eb46f4c4                                    ill buy you one   
21238  e7a0cfb127  soo tired.. still kinda angry that i missed th...   
...           ...                                                ...   
22979  29cb8acb08   Aww ya not showing off all us mums should be ...   
19270  78e0057a31  My dog is suffering from abandonment issues. S...   
11611  cfe1e3bf7e            ahhh twitter, I havent seen you all day   
17542  7f3951027f   cool. my **** itch. got sunburned at the volcano   
23554  a3e4471cef           Working at hop city. Gotta miss baseball   

                                           selected_text sentiment  \
4257                               

In [63]:

vectorizer = CountVectorizer(max_features=10000)
BOW = vectorizer.fit_transform(df['clean_tweet'])


from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(BOW,np.asarray(df["sentiment"]))




from sklearn.svm import SVC
model = SVC()
model.fit(x_train,y_train)


In [64]:
predictions = model.predict(x_test)

In [23]:
from sklearn.metrics import accuracy_score,confusion_matrix

print("Accuracy of model is {}%".format(accuracy_score(y_test,predictions) * 100))

Accuracy of model is 69.04380730606898%


# Comparison to RoBERTa

In [8]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
import csv
import urllib.request
import re

In [2]:
task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

print(f"\n\nMODEL string: {MODEL}\n\n")
tokenizer = AutoTokenizer.from_pretrained(MODEL)



MODEL string: cardiffnlp/twitter-roberta-base-sentiment




In [3]:
# download label mapping
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

In [4]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [9]:
# Testing individual tweets

text = "$zm ended the day at the price of $90.95"
text = clean_tweet(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)

print(f"Results on text: {text}")


ranking = np.argsort(scores)

print("ranking: " + str(ranking))
ranking = ranking[::-1]
print("ranking: " + str(ranking))
for i in range(scores.shape[0]):
    l = labels[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")


 

Results on text: $zm ended the day at the price of $90.95
ranking: [0 2 1]
ranking: [1 2 0]
1) neutral 0.896
2) positive 0.0624
3) negative 0.0417


In [13]:

MODEL_R = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model_r = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [18]:
df['clean_tweet'].head()

0                  I`d have responded, if I were going
1        Sooo SAD I will miss you here in San Diego!!!
2                            my boss is bullying me...
3                       what interview! leave me alone
4     Sons of ****, why couldn`t they put them on t...
Name: clean_tweet, dtype: object

In [19]:

predictions = []

for text in df['clean_tweet']:
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model_r(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)

    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    predicted_label = config.id2label[ranking[0]]
    predictions.append(predicted_label)

    # this was interrupted as the task was taking too long, but we did get 17k sentiments from RoBERTa.

KeyboardInterrupt: 

In [76]:
df_predictions = pd.DataFrame(predictions, columns=['sentiment_r'])

df_predictions.to_csv('17k_sentiment_r.csv')

In [77]:
# adding 17k sentiment_r to df and comparing them.

df_predictions_r_series = pd.Series(predictions,  name='sentiment_r')

df = df.assign(prediction_r = df_predictions_r_series)


In [83]:
df[['clean_tweet','sentiment', 'prediction_r']].head(30)

Unnamed: 0,clean_tweet,sentiment,prediction_r
0,"I`d have responded, if I were going",neutral,neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative,negative
2,my boss is bullying me...,negative,negative
3,what interview! leave me alone,negative,negative
4,"Sons of ****, why couldn`t they put them on t...",negative,negative
5,http://www.dothebouncy.com/smf - some shameles...,neutral,neutral
6,2am feedings for the baby are fun when he is a...,positive,positive
7,Soooo high,neutral,positive
8,Both of you,neutral,neutral
9,Journey!? Wow... u just became cooler. hehe....,positive,positive


In [86]:
# df_17k = df['prediction_r' != NaN]
df_17k = df[df['prediction_r'].notna()]
df_17k.tail(30)

Unnamed: 0,textID,text,selected_text,sentiment,clean_tweet,prediction_r
17886,6ac6671968,"Heh heh heh, come on! It is a THQ release! Ah...","Heh heh heh, come on! It is a THQ release! Ah ...",neutral,"Heh heh heh, come on! It is a THQ release! Ah...",positive
17887,578d6c1bfe,"My friend Cliff has the tix, so I have to wai...","My friend Cliff has the tix, so I have to wait...",neutral,"My friend Cliff has the tix, so I have to wai...",neutral
17888,290a0ffa68,haha that`s because you also look amazing in ...,haha that`s because you also look amazing in i...,positive,haha that`s because you also look amazing in ...,positive
17889,073f1f3c0c,_Stained http://twitpic.com/4jhe5 - I LOVE it!...,LOVE,positive,_Stained http://twitpic.com/4jhe5 - I LOVE it!...,positive
17890,c56f034be1,ive finished them now,ive finished them now,neutral,ive finished them now,neutral
17891,05ed6c49d8,sorry to hear about your dog,sorry,negative,sorry to hear about your dog,negative
17892,ff61f3369a,"After a week staying with my Grandmother, I`m ...","After a week staying with my Grandmother, I`m ...",neutral,"After a week staying with my Grandmother, I`m ...",neutral
17893,e1cf6a99a1,since I`m reading the Twilight series and wat...,.perfect,positive,since I`m reading the Twilight series and wat...,positive
17894,eb0e2ed274,Bummed out I am missing a rock climbing trip n...,missing,negative,Bummed out I am missing a rock climbing trip n...,negative
17895,7ccc3bafa7,lamentablemente paso #jrztwitterlunch,lamentablemente paso #jrztwitterlunch,neutral,lamentablemente paso #jrztwitterlunch,neutral


In [87]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(df_17k['sentiment'], df_17k['prediction_r'])

print(f"RoBERTa accuracy: {accuracy}")

RoBERTa accuracy: 0.7065193123465059


Note that accuracy for RoBERTa is about 0.70 which is not much better than SVM. But I beleive with more fine tuning, RoBERTa will much better outperform SVM. 

Thus I believe continuing with the RoBERTa model is the right choice.