In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, pipeline  
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from datasets import Dataset 
import pandas as pd
import numpy as np
import torch
import evaluate 
import os 

In [3]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
current_dir = os.getcwd()
main_dir = os.path.abspath(os.path.join(current_dir, '..'))
models_dir = os.path.abspath(os.path.join(main_dir, 'models')) 
ft_model = os.path.abspath(os.path.join(models_dir, 'finetuned_model'))
output_dir = os.path.abspath(os.path.join(models_dir, 'finetuned-model-movie-review-sentiment-analysis'))
os.makedirs(output_dir, exist_ok=True)

In [6]:
model = AutoModelForSequenceClassification.from_pretrained(ft_model)
tokenizer = AutoTokenizer.from_pretrained(ft_model)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [7]:
data = pd.read_csv('../data/data_preprocessed.csv')
dataset = Dataset.from_pandas(data)

In [8]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

In [9]:
tokenized_ds = dataset.map(tokenize_function, batched=True)
tokenized_ds = tokenized_ds.train_test_split(test_size=0.2)

small_train = tokenized_ds['train'].shuffle(seed=13).select(range(200))
small_test = tokenized_ds['test'].shuffle(seed=13).select(range(200))

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [10]:
training_args = TrainingArguments(
    output_dir=output_dir,
    save_strategy='epoch',
    eval_strategy='epoch',
    num_train_epochs=1,
    learning_rate=3.2306249035382145e-05, # best parameters from optimization 
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    push_to_hub=True,
    hub_strategy='end'
)

In [11]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred 
    predictions = np.argmax(logits, axis=-1)
    
    # Evaluation metrics for classification
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average=None)
    acc = accuracy_score(labels, predictions)
    auc = roc_auc_score(labels, logits[:, 1])

    return {
       "accuracy": float(acc),
        "f1": f1.tolist(),       
        "precision": precision.tolist(),
        "recall": recall.tolist(),
        "auc": auc.tolist()
        }

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train,
    eval_dataset=small_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
) 

In [13]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Auc
1,No log,0.317774,0.925,"[0.9101796407185628, 0.9356223175965666]","[1.0, 0.8790322580645161]","[0.8351648351648352, 1.0]",0.977014


Non-default generation parameters: {'max_length': 512}


TrainOutput(global_step=100, training_loss=0.3897837066650391, metrics={'train_runtime': 903.2793, 'train_samples_per_second': 0.221, 'train_steps_per_second': 0.111, 'total_flos': 43167431664000.0, 'train_loss': 0.3897837066650391, 'epoch': 1.0})

In [14]:
trainer.evaluate()



{'eval_loss': 0.31777361035346985,
 'eval_accuracy': 0.925,
 'eval_f1': [0.9101796407185628, 0.9356223175965666],
 'eval_precision': [1.0, 0.8790322580645161],
 'eval_recall': [0.8351648351648352, 1.0],
 'eval_auc': 0.9770138118761972,
 'eval_runtime': 189.0869,
 'eval_samples_per_second': 1.058,
 'eval_steps_per_second': 0.529,
 'epoch': 1.0}

In [15]:
trainer.push_to_hub()

Non-default generation parameters: {'max_length': 512}


model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.84k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/lemonsterpie/finetuned-model-movie-review-sentiment-analysis/commit/a43302ac129c7f43e7b01f424bd56e7468c7d775', commit_message='End of training', commit_description='', oid='a43302ac129c7f43e7b01f424bd56e7468c7d775', pr_url=None, repo_url=RepoUrl('https://huggingface.co/lemonsterpie/finetuned-model-movie-review-sentiment-analysis', endpoint='https://huggingface.co', repo_type='model', repo_id='lemonsterpie/finetuned-model-movie-review-sentiment-analysis'), pr_revision=None, pr_num=None)

In [16]:
# Testing my model 

tester = pd.read_csv('../data/tester_data.csv') # Preprocessed data from step 1 
test1 = tester.head(10).copy() 
test1_list = test1['preprocessed_text'].tolist()

In [17]:
my_model = pipeline('text-classification', model='lemonsterpie/finetuned-model-movie-review-sentiment-analysis')
test1_predictions = my_model(test1_list)

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

Device set to use cpu


In [18]:
test1.loc[:, "predicted_label"] = [p["label"] for p in test1_predictions]
test1.loc[:, "confidence_score"] = [p["score"] for p in test1_predictions]
test1['predicted_label'] = test1['predicted_label'].map({'POS': 1, 'NEG': 0})

In [19]:
test1

Unnamed: 0,text,label,preprocessed_text,predicted_label,confidence_score
0,I love sci-fi and am willing to put up with a ...,0,i love sci fi and am willing to put up with a ...,0,0.999808
1,"Worth the entertainment value of a rental, esp...",0,worth the entertainment value of a rental esp...,0,0.999215
2,its a totally average film with a few semi-alr...,0,its a totally average film with a few semi alr...,0,0.999812
3,STAR RATING: ***** Saturday Night **** Friday ...,0,star rating saturday night friday ...,0,0.999818
4,"First off let me say, If you haven't enjoyed a...",0,first off let me say if you havent enjoyed a ...,1,0.999435
5,I had high hopes for this one until they chang...,0,i had high hopes for this one until they chang...,0,0.999826
6,Isaac Florentine has made some of the best wes...,0,isaac florentine has made some of the best wes...,1,0.960503
7,"It actually pains me to say it, but this movie...",0,it actually pains me to say it but this movie...,0,0.999758
8,"Technically I'am a Van Damme Fan, or I was. th...",0,technically iam a van damme fan or i was thi...,0,0.999834
9,"Honestly awful film, bad editing, awful lighti...",0,honestly awful film bad editing awful lighti...,0,0.999762


In [20]:
# Inspecting the mismatched prediction 

mismatch1 = test1[test1['label'] != test1['predicted_label']]
mismatch1.iloc[0,2]

'first off let me say  if you havent enjoyed a van damme movie since bloodsport  you probably will not like this movie  most of these movies may not have the best plots or best actors but i enjoy these kinds of movies for what they are  this movie is much better than any of the movies the other action guys  segal and dolph  have thought about putting out the past few years  van damme is good in the movie  the movie is only worth watching to van damme fans  it is not as good as wake of death  which i highly recommend to anyone of likes van damme  or in hell but  in my opinion its worth watching  it has the same type of feel to it as nowhere to run  good fun stuff '

In [21]:
# Testing another batch 

test2 = tester.tail(10).copy()
test2_list = test2['preprocessed_text'].tolist()
test2_predictions = my_model(test2_list)

In [22]:
test2.loc[:, "predicted_label"] = [p["label"] for p in test2_predictions]
test2.loc[:, "confidence_score"] = [p["score"] for p in test2_predictions]
test2['predicted_label'] = test2['predicted_label'].map({'POS': 1, 'NEG': 0})

In [23]:
test2

Unnamed: 0,text,label,preprocessed_text,predicted_label,confidence_score
24990,I first saw this on Demand. Or on TV. I'm not ...,1,i first saw this on demand or on tv im not r...,1,0.99957
24991,In the veins of Jeepers Creepers and The Texas...,1,in the veins of jeepers creepers and the texas...,1,0.998655
24992,Great horror comedy from Michael Davis.Iwas la...,1,great horror comedy from michael davis iwas la...,1,0.999616
24993,Two city guys are driving through Hicksville U...,1,two city guys are driving through hicksville u...,1,0.997382
24994,This is a surprisingly great low budget Horror...,1,this is a surprisingly great low budget horror...,1,0.999611
24995,Just got around to seeing Monster Man yesterda...,1,just got around to seeing monster man yesterda...,1,0.999445
24996,I got this as part of a competition prize. I w...,1,i got this as part of a competition prize i w...,1,0.999109
24997,I got Monster Man in a box set of three films ...,1,i got monster man in a box set of three films ...,1,0.999265
24998,"Five minutes in, i started to feel how naff th...",1,five minutes in i started to feel how naff th...,1,0.99905
24999,I caught this movie on the Sci-Fi channel rece...,1,i caught this movie on the sci fi channel rece...,1,0.99858


No mismatches for test 2! 