# Sentiment Analysis - Transformers

In [18]:
#Import needed modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV

#Read in the data
#The CSV path is relative to the notebook; the file is decoded as ISO-8859-1 instead of pandas' default UTF-8
tweets = pd.read_csv('../data/clean_tweets.csv', encoding = 'iso-8859-1')

In [26]:
#Functions to tokenize text
import string

#Converts a Treebank POS tag into the matching WordNet tag for the lemmatizer
def pos_replace(treebank_tag):
    """Return the WordNet POS constant that corresponds to a Treebank tag.

    Tags beginning with 'J', 'V', 'N' or 'R' map to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively. Any other tag falls back to
    wordnet.NOUN.
    """
    prefix_to_wordnet = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    #Unrecognised tags are treated as nouns
    return wordnet.NOUN
    
#Symbols that carry meaning in tweets and must survive punctuation filtering
keep_punct = ['#', '?', '!', '@']
#Every other punctuation character is excluded from the tokens
punct = [symbol for symbol in string.punctuation if symbol not in keep_punct]

#Tokens that only mark a retweet
common_tweet_words = ['rt']

#Swaps every non-ASCII character for a single space
def remove_junk(tweet):
    """Return `tweet` with each character whose code point is 128 or above replaced by a space."""
    ascii_chars = []
    for char in tweet:
        ascii_chars.append(char if ord(char) <= 127 else ' ')
    return ''.join(ascii_chars)
    
def tweet_tokenizer(doc):
    """Clean a raw tweet and return its list of lemmatized tokens.

    Steps, in order: strip links, #sxsw hashtag variations and scraping
    placeholders; replace non-ASCII characters with spaces; tokenize with
    NLTK's TweetTokenizer (strip_handles=True); drop retweet markers and
    unwanted punctuation; POS-tag and lemmatize what remains.
    """
    #Patterns removed from the text before tokenizing, applied in this order
    strip_patterns = (
        r'http\S+',                              #links
        r'www\.[a-z]?\.?(com)+|[a-z]+\.(com)',   #bare .com addresses
        r'(?i)(#sxsw)\w*',                       #sxsw hashtag variations
        r'{link}',                               #placeholder left by scraping
        r'\[video\]',                            #placeholder left by scraping
    )
    text = doc
    for pattern in strip_patterns:
        text = re.sub(pattern, '', text)
    #Gets rid of weird characters
    text = remove_junk(text)

    #Tokenizes using NLTK Twitter Tokenizer
    tokens = TweetTokenizer(strip_handles = True).tokenize(text)
    #Drops retweet markers (case-insensitive) and punctuation we don't want to keep
    kept = [
        token for token in tokens
        if token.lower() not in common_tweet_words and token not in punct
    ]

    #Lemmatizes each token using its WordNet-compatible POS tag
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token, pos_replace(tag)) for token, tag in pos_tag(kept)]

def clean_tweets(doc):
    """Return the tweet text with links, boilerplate and mentions stripped out.

    Removes, in order: links, #sxsw hashtag variations, the '{link}' and
    '[video]' placeholders left by scraping, the retweet marker 'RT ', and
    @mentions. Remaining non-ASCII characters are then replaced by spaces via
    remove_junk. Whitespace left behind by the removals is not collapsed.
    """
    #Gets rid of links
    doc = re.sub(r'http\S+', '', doc)
    doc = re.sub(r'www\.[a-z]?\.?(com)+|[a-z]+\.(com)', '', doc)
    #Gets rid of #sxsw hashtag variations
    doc = re.sub(r'(?i)(#sxsw)\w*', '', doc)
    #Gets rid of conversions made during scraping
    doc = re.sub(r'\{link\}', '', doc)
    doc = re.sub(r'\[video\]', '', doc)
    #Gets rid of RT and @'s
    #\b keeps the marker from matching the tail of an uppercase word
    #(without it 'SMART phone' became 'SMAphone')
    doc = re.sub(r'\bRT ', '', doc)
    doc = re.sub(r'@\w+', '', doc)
    #Gets rid of weird characters
    doc = remove_junk(doc)
    return doc

In [27]:
#Overwrite the raw tweet text with its cleaned version
tweets['tweet_text'] = tweets['tweet_text'].apply(clean_tweets)

In [31]:
from transformers import pipeline

#Build an inference pipeline from the pre-trained Twitter RoBERTa sentiment checkpoint
#(the progress bars below show the checkpoint files being downloaded)
sentiment_pipeline = pipeline(model = "cardiffnlp/twitter-roberta-base-sentiment")

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=949.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=747.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=498679497.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=898822.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=456318.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=150.0), HTML(value='')))




Generate predictions based on pre-trained model with no tuning

In [39]:
#Run the untuned pipeline on every tweet, one call per tweet; each entry keeps the pipeline's raw output
prediction = tweets['tweet_text'].map(lambda tweet: sentiment_pipeline(tweet))

In [52]:
#Last character of the model's label -> class id used in tweets.label
#'1' -> 2 (Neutral), '2' -> 1 (Positive); anything else -> 0 (Negative)
suffix_to_class = {'1': 2, '2': 1}

#Walk the predictions in order rather than looking them up with prediction[i]:
#that lookup is label-based on a pandas Series and breaks (or silently
#misaligns) as soon as the index of tweets is not 0..n-1
results = [
    suffix_to_class.get(tweet_prediction[0]['label'][-1], 0)
    for tweet_prediction in prediction
]

In [55]:
from sklearn.metrics import accuracy_score

#Fraction of tweets whose mapped prediction equals the stored label (no train/test split is applied here)
accuracy_score(tweets.label, results)

0.566138540899042

For an untuned model it does pretty well! The final original model has 62% accuracy, so a result of 57% isn't bad. Ideally, taking this model as a base and fine-tuning it on this dataset would produce better results.

# Tuned Transformer

In [None]:
import torch
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification

#Checkpoint being fine-tuned. The tokenizer MUST come from the same checkpoint
#as the model: token ids produced by a different vocabulary
#(e.g. distilbert-base-uncased) do not match this model's embedding table.
checkpoint = "cardiffnlp/twitter-roberta-base-sentiment"

y = tweets['label']
X = tweets.drop('label', axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 213)

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def preprocess_function(examples):
    """Tokenize the 'tweet_text' entries of `examples`, truncating over-long texts.

    `examples` may be a DataFrame or any mapping whose 'tweet_text' value is an
    iterable of strings; it is converted to a plain list because the tokenizer
    does not accept a pandas Series.
    """
    return tokenizer(list(examples['tweet_text']), truncation=True)


class TweetDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized tweets with their labels for the Trainer.

    Items are left unpadded; DataCollatorWithPadding pads each batch.
    """

    def __init__(self, frame, labels):
        #Tokenize once up front; each value in the encoding holds one entry per tweet
        self.encodings = preprocess_function(frame)
        self.labels = [int(label) for label in labels]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: values[idx] for key, values in self.encodings.items()}
        #Without a label the Trainer cannot compute a training loss
        item['label'] = self.labels[idx]
        return item


#pandas DataFrames have no datasets-style .map(..., batched=True), so the
#splits are wrapped explicitly together with their labels
tokenized_train = TweetDataset(X_train, y_train)
tokenized_test = TweetDataset(X_test, y_test)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

#NOTE(review): the untuned evaluation above remaps the model's label 1 to class 2
#and label 2 to class 1, so the dataset's class ids differ from this checkpoint's
#label order; fine-tuning on the raw ids has to relearn that - confirm this is intended
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels = 3)

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for an evaluation pass.

    Parameters
    ----------
    eval_pred : pair (logits, labels)
        Model outputs and the reference class ids. The predicted class is the
        argmax over the last axis of `logits`.

    Returns
    -------
    dict
        {"accuracy": float, "f1": float}
    """
    #sklearn replaces load_metric, which was never imported in this notebook
    from sklearn.metrics import accuracy_score, f1_score

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    #There are three classes, so the default binary averaging would raise;
    #'weighted' averages per-class F1 by class support
    f1 = f1_score(labels, predictions, average="weighted")
    #Plain floats keep the metrics dictionary JSON-serializable
    return {"accuracy": float(accuracy), "f1": float(f1)}

In [None]:
from huggingface_hub import notebook_login

#Interactive Hugging Face Hub login; the training arguments in the next cell set push_to_hub=True
notebook_login()

In [None]:
from transformers import TrainingArguments, Trainer

#Directory where checkpoints are written (passed as output_dir below)
repo_name = "apple-sentiment-model-play-data"

#Optimisation and checkpointing settings for the fine-tuning run
training_args = TrainingArguments(
    output_dir=repo_name,
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_strategy="epoch",
    push_to_hub=True,
)

#Ties together the model, the tokenized splits, batch padding and metric computation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
#Fine-tune the model, then score it on the evaluation split with compute_metrics
trainer.train()
#Last expression of the cell, so the returned metrics are displayed as the cell output
trainer.evaluate()