# Sentiment Analysis

In [None]:
import pandas as pd

# Location of the dataset's CSV files on the Hugging Face Hub.
DATASET_URL = 'hf://datasets/Sp1786/multiclass-sentiment-analysis-dataset'
# Columns removed right after loading; only the remaining columns are used below.
UNUSED_COLUMNS = ['id', 'label']

splits = {'train': 'train_df.csv', 'validation': 'val_df.csv', 'test': 'test_df.csv'}


def load_split(filename):
    """Load one CSV split, dropping the unused columns and incomplete rows.

    Args:
        filename: Name of the CSV file inside the dataset repository.

    Returns:
        A DataFrame without the ``id``/``label`` columns and without any row
        that contains a missing value.
    """
    frame = pd.read_csv(f'{DATASET_URL}/{filename}')
    return frame.drop(columns=UNUSED_COLUMNS).dropna()


# Every split except the validation file is merged into the training frame.
dataframes = [
    load_split(filename)
    for split, filename in splits.items()
    if split != 'validation'
]
# ignore_index avoids duplicate index labels in the merged frame.
train_df = pd.concat(dataframes, ignore_index=True)

# The validation file is the held-out frame (named `test_df` throughout this
# notebook). It is cleaned the same way as the training data so rows with
# missing values never reach the model during evaluation.
test_df = load_split(splits['validation'])

In [None]:
# Show the first rows of the training and held-out frames ...
for frame in (train_df, test_df):
    print(frame.head())

# ... followed by the number of rows in each of them.
for frame in (train_df, test_df):
    print(len(frame))

## Train the Model

In [None]:
# Install simpletransformers, the package the classification model and its
# argument class are imported from in the next cell.
!pip install simpletransformers

In [None]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs

# Arguments shared by every run of the hyperparameter sweep.
model_args = ClassificationArgs(
    # Output handling: reuse the output directory and keep no checkpoints.
    overwrite_output_dir=True,
    no_save=True,
    # Input processing.
    max_seq_length=256,
    reprocess_input_data=True,
    use_multiprocessing=True,
    labels_list=["neutral", "positive", "negative"],
    # Evaluate every 32 training steps and stop early once eval_loss has not
    # improved by at least 0.01 for 2 consecutive evaluations.
    evaluate_during_training=True,
    evaluate_during_training_steps=32,
    use_early_stopping=True,
    early_stopping_metric='eval_loss',
    early_stopping_metric_minimize=True,
    early_stopping_delta=0.01,
    early_stopping_patience=2,
    # Reproducibility and experiment tracking.
    manual_seed=4,
    wandb_project="Sentiment-Analysis-Sweep",
)

In [None]:
import wandb

def train_model():
    """Train one classifier for the current Weights & Biases run.

    Starts a wandb run, builds a three-label ``roberta-base`` model from
    ``model_args`` with ``wandb.config`` passed as the sweep configuration,
    and trains it on ``train_df`` while evaluating on ``test_df``.
    """
    wandb.init()
    classifier = ClassificationModel(
        "roberta",
        "roberta-base",
        num_labels=3,
        args=model_args,
        use_cuda=True,
        sweep_config=wandb.config,
    )
    classifier.train_model(train_df, eval_df=test_df)

In [None]:
wandb.login()

# Project under which new sweeps are created and agents are started.
SWEEP_PROJECT = "Sentiment-Analysis-Sweep"

# Grid search minimising eval_loss. The parameter names are applied to the
# model arguments, so they must match the argument names exactly: the epoch
# count is `num_train_epochs` (a key named `train_epochs` would be set but
# never used by training).
sweep_configuration = {
    "method": "grid",
    "metric": {"goal": "minimize", "name": "eval_loss"},
    "parameters": {
        "num_train_epochs": {"values": [8, 10, 12, 16]},
        "train_batch_size": {"values": [16, 32, 64]},
        "learning_rate": {"values": [1e-6, 5e-5, 1e-5, 5e-4]},
    },
}

# Blank (or whitespace-only) input means "start a new sweep"; surrounding
# whitespace from a pasted id is removed so the id is looked up correctly.
sweep_id = input("Please enter an existing sweep id if you want to continue an existing sweep (leave blank for a new sweep): ").strip() or None

if sweep_id is None:
    sweep_id = wandb.sweep(sweep=sweep_configuration, project=SWEEP_PROJECT)

# Run trials of the sweep in this process, one `train_model` call per trial.
wandb.agent(sweep_id, function=train_model, project=SWEEP_PROJECT)

In [None]:
# Arguments for the final model; the epoch count, batch size and learning
# rate are filled in separately before the model is built.
best_model_args = ClassificationArgs(
    # Output handling: reuse the output directory and keep no checkpoints.
    overwrite_output_dir=True,
    no_save=True,
    # Input processing.
    max_seq_length=256,
    reprocess_input_data=True,
    use_multiprocessing=True,
    labels_list=["neutral", "positive", "negative"],
    # Evaluate every 32 training steps and stop early once eval_loss has not
    # improved by at least 0.01 for 2 consecutive evaluations.
    evaluate_during_training=True,
    evaluate_during_training_steps=32,
    use_early_stopping=True,
    early_stopping_metric='eval_loss',
    early_stopping_metric_minimize=True,
    early_stopping_delta=0.01,
    early_stopping_patience=2,
    # Reproducibility.
    manual_seed=25,
)


In [None]:
# Read the tab-separated evaluation file, discard rows with missing values,
# and remove the two columns that are not needed for classification.
eval_df = (
    pd.read_csv('./sentiment-topic-test.tsv', sep='\t')
    .dropna()
    .drop(columns=['sentence_id', 'topic'])
)
# Rename the remaining columns positionally.
# NOTE(review): assumes exactly two columns remain, text first — confirm
# against the TSV header.
eval_df.columns = ['text', 'sentiment']
print(eval_df.head())

# Hyperparameters of the final model.
# NOTE(review): presumably the best combination found by the sweep — confirm.
best_model_args.num_train_epochs = 16
best_model_args.train_batch_size = 64
best_model_args.learning_rate = 5e-5

best_model = ClassificationModel(
    "roberta",
    "roberta-base",
    num_labels=3,
    args=best_model_args,
    use_cuda=True,
)

In [None]:
# Train the final model on the training and held-out frames combined; the
# separately loaded eval_df is used for evaluation during training.
combined_train_df = pd.concat([train_df, test_df])
best_model.train_model(combined_train_df, eval_df=eval_df)

In [None]:
from sklearn.metrics import classification_report

# Classify every evaluation sentence with the final model.
# NOTE(review): the second value returned by predict() is presumably the raw
# model output rather than normalised probabilities — confirm before use.
eval_texts = eval_df['text'].tolist()
predict, probabilities = best_model.predict(eval_texts)
eval_df['predicted'] = predict

# Per-class precision/recall/F1 of the predictions against the gold labels.
report = classification_report(eval_df['sentiment'], eval_df['predicted'])
print(report)