# Sentiment Analysis

In [1]:
import pandas as pd

# Location of the split CSVs on the Hugging Face Hub.
DATASET_ROOT = 'hf://datasets/Sp1786/multiclass-sentiment-analysis-dataset'
splits = {'train': 'train_df.csv', 'validation': 'val_df.csv', 'test': 'test_df.csv'}


def _load_split(filename):
    """Read one split CSV, drop the `id` and `label` columns, and remove rows with missing values.

    Returns a new DataFrame; nothing is mutated in place, so re-running the
    cell always yields the same result.
    """
    frame = pd.read_csv(f'{DATASET_ROOT}/{filename}')
    return frame.drop(columns=['id', 'label']).dropna()


# The published 'train' and 'test' files are merged into the training frame;
# the published 'validation' file is what the rest of the notebook calls `test_df`.
dataframes = [_load_split(splits[name]) for name in ('train', 'test')]
# ignore_index avoids duplicate row labels coming from the two source files.
train_df = pd.concat(dataframes, ignore_index=True)
# Cleaned the same way as the training data (the original left NaN rows in here).
test_df = _load_split(splits['validation'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [2]:
# Show the first rows of each frame, then the row counts, training frame first.
preview_frames = (train_df, test_df)

for frame in preview_frames:
    print(frame.head())

for frame in preview_frames:
    print(len(frame))

                                                text sentiment
0                    Cooking microwave pizzas, yummy  positive
1  Any plans of allowing sub tasks to show up in ...   neutral
2   I love the humor, I just reworded it. Like sa...  positive
3                       naw idk what ur talkin about   neutral
4          That sucks to hear. I hate days like that  negative
                                                text sentiment
0  Laying in bed til workkk... Oh the life. Defin...  negative
1   ooohhh imma need you to get on that asap love...  positive
2   Thanks! I love it they have a video, so you d...  positive
3     I left my ipod in the car so now its all warm.  positive
4  Great app. Only complaint is that I'd like the...  positive
36437
5205


## Train the Model

In [3]:
!pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.70.1-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorboardx (from simpletransformers)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting streamlit (from simpletransformers)
  Downloading streamlit-1.45.1-py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit->simpletransformers)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [

In [4]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs

# Shared training configuration, applied attribute by attribute to a default
# ClassificationArgs instance.
model_args = ClassificationArgs()

_arg_overrides = {
    # Output handling / evaluation schedule
    "overwrite_output_dir": True,
    "evaluate_during_training": True,
    "no_save": True,
    # Tokenisation and early stopping on the evaluation loss
    "max_seq_length": 256,
    "use_early_stopping": True,
    "early_stopping_delta": 0.01,
    "early_stopping_metric": 'eval_loss',
    "early_stopping_metric_minimize": True,
    "early_stopping_patience": 2,
    "evaluate_during_training_steps": 32,
    # Reproducibility and data processing
    "reprocess_input_data": True,
    "manual_seed": 4,
    "use_multiprocessing": True,
    # String labels as they appear in the `sentiment` column
    "labels_list": ["neutral", "positive", "negative"],
    # Weights & Biases project that receives the training logs
    "wandb_project": "Sentiment-Analysis-Sweep",
}

for _arg_name, _arg_value in _arg_overrides.items():
    setattr(model_args, _arg_name, _arg_value)

In [5]:
import wandb

def train_model():
    """Run a single sweep trial.

    Starts a wandb run, builds a 3-label `roberta-base` classifier from the
    shared `model_args` with `wandb.config` passed as the sweep configuration,
    and trains it on `train_df` while evaluating against `test_df`.
    """
    wandb.init()
    trial_model = ClassificationModel(
        "roberta",
        "roberta-base",
        num_labels=3,
        args=model_args,
        use_cuda=True,
        sweep_config=wandb.config,
    )
    trial_model.train_model(train_df, eval_df=test_df)

In [6]:
# Authenticate with Weights & Biases before creating or joining a sweep.
wandb.login()

# Grid search over the training hyperparameters, minimising the evaluation loss.
# Parameter names must match the model-args attribute names exactly: the epoch
# count is `num_train_epochs`. The previous key `train_epochs` was not read by
# training, so every trial ran for a single epoch regardless of the swept value.
sweep_configuration = {
    "method": "grid",
    "metric": {"goal": "minimize", "name": "eval_loss"},
    "parameters": {
        "num_train_epochs": {"values": [8, 10, 12, 16]},
        "train_batch_size": {"values": [16, 32, 64]},
        "learning_rate": {"values": [1e-6, 5e-5, 1e-5, 5e-4]},
    },
}

# Blank (or whitespace-only) input means "start a new sweep".
# NOTE: when an existing sweep id is entered, `sweep_configuration` above is not
# used; the sweep keeps the configuration it was created with.
sweep_id = input("Please enter an existing sweep id if you want to continue an existing sweep (leave blank for a new sweep): ").strip() or None

if sweep_id is None:
    sweep_id = wandb.sweep(sweep=sweep_configuration, project="Sentiment-Analysis-Sweep")

# Pull trials from the sweep and run `train_model` once per trial.
wandb.agent(sweep_id, function=train_model, project='Sentiment-Analysis-Sweep')

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msimonas-mickus[0m ([33md3dsec[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Please enter an existing sweep id if you want to continue an existing sweep (leave blank for a new sweep): 2qk2z5y6


[34m[1mwandb[0m: Agent Starting Run: ig4bf2fu with config:
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	train_batch_size: 64
[34m[1mwandb[0m: 	train_epochs: 12


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



  0%|          | 0/72 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 1:   0%|          | 0/570 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():


0,1
Training loss,█▂▃▅▁▃▃
eval_loss,█▆▃▂▂▂▁▁▁▁▁▁
global_step,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇█
lr,█▇▆▅▃▂▁
mcc,▁▆▇▇████████
train_loss,█▇▃▂▂▃▂▃▃▁▁▂

0,1
Training loss,0.7359
eval_loss,0.59244
global_step,384.0
lr,0.0
mcc,0.63079
train_loss,0.61342


[34m[1mwandb[0m: Agent Starting Run: w3t3r4f1 with config:
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	train_batch_size: 16
[34m[1mwandb[0m: 	train_epochs: 10


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/72 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 1:   0%|          | 0/2278 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():


0,1
Training loss,▁██
eval_loss,▅█▁▂▂▃
global_step,▁▂▂▄▄▅▆▇█
lr,█▄▁
mcc,▁▁▁▁▁▁
train_loss,█▁▄▃▄▁

0,1
Training loss,1.10455
eval_loss,1.13222
global_step,192.0
lr,0.00047
mcc,0.0
train_loss,1.03592


[34m[1mwandb[0m: Agent Starting Run: 1msqc851 with config:
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	train_batch_size: 16
[34m[1mwandb[0m: 	train_epochs: 16


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/72 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 1:   0%|          | 0/2278 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():


0,1
Training loss,▁██
eval_loss,▅█▁▂▂▃
global_step,▁▂▂▄▄▅▆▇█
lr,█▄▁
mcc,▁▁▁▁▁▁
train_loss,█▁▄▃▄▁

0,1
Training loss,1.10455
eval_loss,1.13222
global_step,192.0
lr,0.00047
mcc,0.0
train_loss,1.03592


[34m[1mwandb[0m: Agent Starting Run: jz01k47o with config:
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	train_batch_size: 32
[34m[1mwandb[0m: 	train_epochs: 12


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/72 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 1:   0%|          | 0/1139 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():


0,1
Training loss,▅▁█▄
eval_loss,█▆▅▁▅▁▃
global_step,▁▂▂▃▃▅▅▆▇▇█
lr,█▆▃▁
mcc,▁▁▁▁▁▁▁
train_loss,▃█▁▅▇▅▄

0,1
Training loss,1.10066
eval_loss,1.11805
global_step,224.0
lr,0.00043
mcc,0.0
train_loss,1.07967


[34m[1mwandb[0m: Agent Starting Run: 3d7swm13 with config:
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	train_batch_size: 64
[34m[1mwandb[0m: 	train_epochs: 16


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/72 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 1:   0%|          | 0/570 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():
  with amp.autocast():


  0%|          | 0/10 [00:00<?, ?it/s]

  with amp.autocast():


0,1
Training loss,█▆▁
eval_loss,█▁▁▁▂
global_step,▁▂▃▅▅▆▇█
lr,█▄▁
mcc,█▁▁▁▁
train_loss,█▁▄▃▂

0,1
Training loss,1.07015
eval_loss,1.10271
global_step,160.0
lr,0.00039
mcc,0.0
train_loss,1.10291


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


In [6]:
# Load the separate evaluation set used for the final classification report.
eval_df = pd.read_csv('./sentiment-topic-test.tsv', sep='\t')
eval_df = eval_df.dropna()
# Drop the two auxiliary columns. The original call passed the single label
# 'sentence_id, topic' and discarded the result, so nothing was ever removed.
# errors='ignore' keeps the cell running if either column is absent; only
# `text` and `sentiment` are used downstream.
eval_df = eval_df.drop(columns=['sentence_id', 'topic'], errors='ignore')

# Hyperparameters for the final model.
model_args.num_train_epochs=16
model_args.train_batch_size=64
model_args.learning_rate=5e-5

model = ClassificationModel("roberta", "roberta-base", num_labels=3, args=model_args, use_cuda=True)
# Fine-tune before predicting: a freshly constructed roberta-base classifier has
# a randomly initialised classification head, so its predictions are meaningless
# until it is trained. Uses the same train/eval frames as the sweep trials.
model.train_model(train_df, eval_df=test_df)

In [None]:
from sklearn.metrics import classification_report

# Predict a label for every evaluation sentence and store it next to the gold label.
eval_texts = eval_df['text'].tolist()
predict, probabilities = model.predict(eval_texts)
eval_df['predicted'] = predict

# Per-class precision / recall / F1 of the predictions against `sentiment`.
report = classification_report(eval_df['sentiment'], eval_df['predicted'])
print(report)