In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, pipeline  
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from datasets import Dataset 
import pandas as pd
import numpy as np
import torch
import evaluate 
import os 

In [2]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
current_dir = os.getcwd()
main_dir = os.path.abspath(os.path.join(current_dir, '..'))
models_dir = os.path.abspath(os.path.join(main_dir, 'models')) 
ft_model = os.path.abspath(os.path.join(models_dir, 'finetuned_model'))
output_dir = os.path.abspath(os.path.join(models_dir, 'finetuned-model-movie-review-sentiment-analysis'))
os.makedirs(output_dir, exist_ok=True)

In [4]:
model = AutoModelForSequenceClassification.from_pretrained(ft_model)
tokenizer = AutoTokenizer.from_pretrained(ft_model)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [5]:
data = pd.read_csv('../data/data_preprocessed.csv')
dataset = Dataset.from_pandas(data)

In [6]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

In [7]:
tokenized_ds = dataset.map(tokenize_function, batched=True)
tokenized_ds = tokenized_ds.train_test_split(test_size=0.2)

small_train = tokenized_ds['train'].shuffle(seed=13).select(range(200)) 
small_test = tokenized_ds['test'].shuffle(seed=13).select(range(200))

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [8]:
training_args = TrainingArguments(
    output_dir=output_dir,
    save_strategy='epoch',
    eval_strategy='epoch',
    num_train_epochs=4,
    learning_rate=1.2090707659718472e-05, # best parameters from optimization 
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_ratio=0.09356655990374085,
    fp16=True,
    optim="adamw_8bit",
    gradient_accumulation_steps=4,
    push_to_hub=True,
    hub_strategy='end',
    load_best_model_at_end=True,
    
)

In [9]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred 
    predictions = np.argmax(logits, axis=-1)
    
    # Evaluation metrics for classification
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average=None)
    acc = accuracy_score(labels, predictions)
    auc = roc_auc_score(labels, logits[:, 1])

    return {
       "accuracy": float(acc),
        "f1": f1.tolist(),       
        "precision": precision.tolist(),
        "recall": recall.tolist(),
        "auc": auc.tolist()
        }

In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train,
    eval_dataset=small_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
) 

In [11]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Auc
1,No log,0.296286,0.935,"[0.9281767955801105, 0.9406392694063926]","[0.9545454545454546, 0.9196428571428571]","[0.9032258064516129, 0.9626168224299065]",0.988443
2,No log,0.233832,0.95,"[0.946236559139785, 0.9532710280373832]","[0.946236559139785, 0.9532710280373832]","[0.946236559139785, 0.9532710280373832]",0.988544
3,No log,0.264143,0.94,"[0.9347826086956522, 0.9444444444444444]","[0.945054945054945, 0.9357798165137615]","[0.9247311827956989, 0.9532710280373832]",0.98784
4,No log,0.262721,0.94,"[0.9347826086956522, 0.9444444444444444]","[0.945054945054945, 0.9357798165137615]","[0.9247311827956989, 0.9532710280373832]",0.98779


Non-default generation parameters: {'max_length': 512}
Non-default generation parameters: {'max_length': 512}
Non-default generation parameters: {'max_length': 512}
Non-default generation parameters: {'max_length': 512}


TrainOutput(global_step=52, training_loss=0.1440546329204853, metrics={'train_runtime': 628.8041, 'train_samples_per_second': 1.272, 'train_steps_per_second': 0.083, 'total_flos': 228072041808768.0, 'train_loss': 0.1440546329204853, 'epoch': 4.0})

In [12]:
trainer.evaluate()

{'eval_loss': 0.23383209109306335,
 'eval_accuracy': 0.95,
 'eval_f1': [0.946236559139785, 0.9532710280373832],
 'eval_precision': [0.946236559139785, 0.9532710280373832],
 'eval_recall': [0.946236559139785, 0.9532710280373832],
 'eval_auc': 0.9885438649381971,
 'eval_runtime': 30.4907,
 'eval_samples_per_second': 6.559,
 'eval_steps_per_second': 1.64,
 'epoch': 4.0}

In [13]:
trainer.push_to_hub()

Non-default generation parameters: {'max_length': 512}


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.84k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/lemonsterpie/finetuned-model-movie-review-sentiment-analysis/commit/859997d4e30e676292e8d66acae2bd5c679fc9ab', commit_message='End of training', commit_description='', oid='859997d4e30e676292e8d66acae2bd5c679fc9ab', pr_url=None, repo_url=RepoUrl('https://huggingface.co/lemonsterpie/finetuned-model-movie-review-sentiment-analysis', endpoint='https://huggingface.co', repo_type='model', repo_id='lemonsterpie/finetuned-model-movie-review-sentiment-analysis'), pr_revision=None, pr_num=None)

In [14]:
# Testing my model 

tester = pd.read_csv('../data/tester_data.csv') # Preprocessed data from step 1 
test1 = tester.head(10).copy() 
test1_list = test1['preprocessed_text'].tolist()

In [15]:
my_model = pipeline('text-classification', model='lemonsterpie/finetuned-model-movie-review-sentiment-analysis')
test1_predictions = my_model(test1_list)

config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


In [16]:
test1.loc[:, "predicted_label"] = [p["label"] for p in test1_predictions]
test1.loc[:, "confidence_score"] = [p["score"] for p in test1_predictions]
test1['predicted_label'] = test1['predicted_label'].map({'POS': 1, 'NEG': 0})

In [17]:
test1

Unnamed: 0,text,label,preprocessed_text,predicted_label,confidence_score
0,I love sci-fi and am willing to put up with a ...,0,i love sci fi and am willing to put up with a ...,0,0.998382
1,"Worth the entertainment value of a rental, esp...",0,worth the entertainment value of a rental esp...,0,0.998148
2,its a totally average film with a few semi-alr...,0,its a totally average film with a few semi alr...,0,0.99832
3,STAR RATING: ***** Saturday Night **** Friday ...,0,star rating saturday night friday ...,0,0.998932
4,"First off let me say, If you haven't enjoyed a...",0,first off let me say if you havent enjoyed a ...,1,0.99749
5,I had high hopes for this one until they chang...,0,i had high hopes for this one until they chang...,0,0.998872
6,Isaac Florentine has made some of the best wes...,0,isaac florentine has made some of the best wes...,0,0.997942
7,"It actually pains me to say it, but this movie...",0,it actually pains me to say it but this movie...,0,0.998562
8,"Technically I'am a Van Damme Fan, or I was. th...",0,technically iam a van damme fan or i was thi...,0,0.998766
9,"Honestly awful film, bad editing, awful lighti...",0,honestly awful film bad editing awful lighti...,0,0.998717


In [18]:
# Inspecting the mismatched prediction 

mismatch1 = test1[test1['label'] != test1['predicted_label']]
mismatch1.iloc[0,2] 

'first off let me say  if you havent enjoyed a van damme movie since bloodsport  you probably will not like this movie  most of these movies may not have the best plots or best actors but i enjoy these kinds of movies for what they are  this movie is much better than any of the movies the other action guys  segal and dolph  have thought about putting out the past few years  van damme is good in the movie  the movie is only worth watching to van damme fans  it is not as good as wake of death  which i highly recommend to anyone of likes van damme  or in hell but  in my opinion its worth watching  it has the same type of feel to it as nowhere to run  good fun stuff '

In [19]:
# Testing another batch 

test2 = tester.tail(10).copy()
test2_list = test2['preprocessed_text'].tolist()
test2_predictions = my_model(test2_list)

In [20]:
test2.loc[:, "predicted_label"] = [p["label"] for p in test2_predictions]
test2.loc[:, "confidence_score"] = [p["score"] for p in test2_predictions]
test2['predicted_label'] = test2['predicted_label'].map({'POS': 1, 'NEG': 0})

In [21]:
test2

Unnamed: 0,text,label,preprocessed_text,predicted_label,confidence_score
24990,I first saw this on Demand. Or on TV. I'm not ...,1,i first saw this on demand or on tv im not r...,1,0.998639
24991,In the veins of Jeepers Creepers and The Texas...,1,in the veins of jeepers creepers and the texas...,1,0.997644
24992,Great horror comedy from Michael Davis.Iwas la...,1,great horror comedy from michael davis iwas la...,1,0.998805
24993,Two city guys are driving through Hicksville U...,1,two city guys are driving through hicksville u...,1,0.995993
24994,This is a surprisingly great low budget Horror...,1,this is a surprisingly great low budget horror...,1,0.999247
24995,Just got around to seeing Monster Man yesterda...,1,just got around to seeing monster man yesterda...,1,0.998298
24996,I got this as part of a competition prize. I w...,1,i got this as part of a competition prize i w...,1,0.997778
24997,I got Monster Man in a box set of three films ...,1,i got monster man in a box set of three films ...,1,0.998848
24998,"Five minutes in, i started to feel how naff th...",1,five minutes in i started to feel how naff th...,1,0.997703
24999,I caught this movie on the Sci-Fi channel rece...,1,i caught this movie on the sci fi channel rece...,1,0.997998


No mismatches for test 2! 