## Simple Transformer Sample - Emotion Regression

<b>  Tool: https://simpletransformers.ai/docs/regression/. First we will load the training and testing data and perform some preprocessing</b>

In [1]:
#!pip install simpletransformers scikit-learn
import pandas as pd
#from google.colab import files
#uploaded = files.upload()

In [2]:
# Read the text files:
train_file = 'EI-reg-En-anger-train.txt'
test_file = '2018-EI-reg-En-anger-test-gold.txt'
# Loading the training and testing data using pd, separate with tab:
train_data = pd.read_csv(train_file, sep='\t')
test_data = pd.read_csv(test_file, sep='\t')

In [3]:
# Displaying the first few rows of the training and testing data
train_data.head(), test_data.head()

(              ID                                              Tweet  \
 0  2017-En-10264  @xandraaa5 @amayaallyn6 shut up hashtags are c...   
 1  2017-En-10072  it makes me so fucking irate jesus. nobody is ...   
 2  2017-En-11383         Lol Adam the Bull with his fake outrage...   
 3  2017-En-11102  @THATSSHAWTYLO passed away early this morning ...   
 4  2017-En-11506  @Kristiann1125 lol wow i was gonna say really?...   
 
   Affect Dimension  Intensity Score  
 0            anger            0.562  
 1            anger            0.750  
 2            anger            0.417  
 3            anger            0.354  
 4            anger            0.438  ,
               ID                                              Tweet  \
 0  2018-En-02328  @PageShhh1 I know you mean well but I'm offend...   
 1  2018-En-02617  Let go of resentment, it will hold you back, d...   
 2  2018-En-01021  No, I'm not 'depressed because of the weather,...   
 3  2018-En-03737  #AmarnathTerrorAttack  M

In [4]:
#Import libraries needed:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
## Add de-emoji tool:
import emoji

In [5]:
# Set of English stop words
stop_words = set(stopwords.words('english'))

In [6]:
## Comment out any of the functions to not use them:
#We use the pre-process function from previous exercise with english nltk packs:
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove usernames
    text = re.sub(r'@\w+', '', text)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    # Remove special characters and punctuation
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    # Tokenize the text
    word_tokens = word_tokenize(text)
    # Remove stop words
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return ' '.join(filtered_text)

In [7]:
#Remove emojis and replace them with raw_text:
# Source: https://medium.com/@sarahisdevs/convert-emoji-into-text-in-python-c2afdfd94ab4
def remove_emoji_text(filtered_text):
    text_demojid = emoji.demojize(filtered_text)
    return text_demojid

In [8]:
# Preprocessing the tweet text
train_data['Tweet'] = train_data['Tweet'].apply(preprocess_text)
test_data['Tweet'] = test_data['Tweet'].apply(preprocess_text)

# Remove emojis:
train_data['Tweet'] = train_data['Tweet'].apply(remove_emoji_text)
test_data['Tweet'] = test_data['Tweet'].apply(remove_emoji_text)

# Displaying the first few rows of the training and testing data
train_data.head(), test_data.head()

(              ID                                              Tweet  \
 0  2017-En-10264                        shut hashtags cool offended   
 1  2017-En-10072  makes fucking irate jesus nobody calling ppl l...   
 2  2017-En-11383                         lol adam bull fake outrage   
 3  2017-En-11102  passed away early morning fast furious styled ...   
 4  2017-En-11506  lol wow gon na say really haha seen chris nah ...   
 
   Affect Dimension  Intensity Score  
 0            anger            0.562  
 1            anger            0.750  
 2            anger            0.417  
 3            anger            0.354  
 4            anger            0.438  ,
               ID                                              Tweet  \
 0  2018-En-02328                      know mean well offended prick   
 1  2018-En-02617  let go resentment hold back worry could come m...   
 2  2018-En-01021  depressed weather depressed depression sicknot...   
 3  2018-En-03737  amarnathterrorattack mus

In [None]:
## Added remaining essential import libraries/APIs:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
import logging
import warnings
import torch
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [10]:
## Check if GPU is available:
print(torch.cuda.is_available())

#Re-use the device CPU or GPU checker:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('GPU or CPU (device) being used:', device)

False
GPU or CPU (device) being used: cpu


In [11]:
warnings.filterwarnings('ignore')

In [12]:
# Configure logging
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [13]:
# Enabling regression and setting (optional) model configuration
model_args = ClassificationArgs()
model_args.num_train_epochs = 3
model_args.regression = True
model_args.overwrite_output_dir = True

In [14]:
# Create a ClassificationModel
# Model selected: roberta-base
## Source: https://simpletransformers.ai/docs/classification-models/
## Source: https://huggingface.co/roberta-base
model = ClassificationModel(
    "roberta",
    "roberta-base",
    num_labels=1,
    args=model_args,
    use_cuda=torch.cuda.is_available()
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Prepare the data
train_df = train_data[['Tweet', 'Intensity Score']]
train_df.columns = ["text", "labels"]

eval_df = test_data[['Tweet', 'Intensity Score']]
eval_df.columns = ["text", "labels"]

In [16]:
# Train the model
model.train_model(train_df)

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/1701 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_roberta_128_1_2


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/213 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/213 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/213 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.


(639, 0.028343388759195863)

In [17]:
# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(eval_df)

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/1002 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_1_2


Running Evaluation:   0%|          | 0/126 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'eval_loss': 0.018400745367028914}


In [18]:
# Extract actual and predicted values
actual = eval_df['labels'].values
predicted = model_outputs.reshape(-1)

In [19]:
# Compute additional metrics
mse = mean_squared_error(actual, predicted)
mae = mean_absolute_error(actual, predicted)
r2 = r2_score(actual, predicted)

# Print results
print(f"Results: {result}")
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

Results: {'eval_loss': 0.018400745367028914}
Mean Squared Error: 0.01850784470450811
Mean Absolute Error: 0.10763236956801005
R-squared: 0.48428323728850853


In [20]:
### Finished ###