# Transformers

In [1]:
import transformers
import torch 
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from transformers import TrainingArguments, Trainer
import pandas as pd

In [2]:
# Load the IMDB reviews from the local CSV file. The result is a DatasetDict whose
# only split is "train", holding the 'review' and 'sentiment' columns.
dataset = load_dataset("csv", data_files="IMDB Dataset.csv") # (debug) a split="train[50%:52%]" argument was tried here to load only a small slice
dataset

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 50000
    })
})

In [3]:
# Peek at the first training example to see the raw review text (note the embedded <br /> tags).
dataset['train'][0]

{'review': "One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is du

## column operation

In [4]:
def replace_sentiment(example):
    """Turn the text sentiment of one example into an integer class label.

    Args:
        example: mapping with a 'sentiment' entry.

    Returns:
        A dict ``{"labels": 1}`` when the sentiment is exactly 'positive',
        and ``{"labels": 0}`` for any other value.
    """
    is_positive = example['sentiment'] == 'positive'
    return {"labels": int(is_positive)}
# Add the integer "labels" column to every split, then drop the original
# text 'sentiment' column since it is no longer needed.
dataset = dataset.map(replace_sentiment).remove_columns(['sentiment'])

# Row count per split, as a quick sanity check that nothing was lost.
dataset.num_rows

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

{'train': 50000}

In [5]:
# Confirm the columns are now 'review' and 'labels'.
dataset

DatasetDict({
    train: Dataset({
        features: ['review', 'labels'],
        num_rows: 50000
    })
})

## alternative: use pandas 

In [35]:
# Read the raw IMDB reviews into a DataFrame.
df = pd.read_csv('IMDB Dataset.csv')

# Encode the text sentiment as a numeric column: negative -> 0, positive -> 1.
sentiment_codes = {'negative': 0, 'positive': 1}
df['sentiment_n'] = df['sentiment'].replace(sentiment_codes)

# The text column is redundant once the numeric one exists.
df = df.drop(columns=['sentiment'])

# Persist the frame so it can be reloaded with load_dataset("pandas", ...).
df.to_pickle('imdb.pkl')

In [33]:
# Draw 80% of the rows for training. Keep the ORIGINAL index on the sample for now:
# those index labels are what identify which rows of df must be excluded from the test set.
train_sample = df.sample(frac=.8, random_state=16)

# Every row that was not sampled becomes the test set. Resetting the training index
# before this drop would relabel it 0..N-1, so the drop would remove the first N rows
# of df instead of the sampled rows and leak training reviews into the test set.
test_df = df.drop(train_sample.index).reset_index(drop=True)
train_df = train_sample.reset_index(drop=True)

X_train, y_train = train_df['review'], train_df['sentiment_n']
X_test, y_test = test_df['review'], test_df['sentiment_n']
train_df

Unnamed: 0,review,sentiment_n
0,"On the surface, this movie would appear to dea...",0
1,Legendary director Sidney Lumet gives us one o...,1
2,This game is the bomb and this is the 007 game...,1
3,I enjoyed this film. I thought it was an excel...,1
4,This is one of the best horror movies i've see...,1
...,...,...
39995,This was quite possibly the worst movie I have...,0
39996,"""The Good Earth"" is a great movie that you don...",1
39997,"Dialogue: stilted, clichéd; Acting: hammy, cli...",0
39998,"Fay Grim is, on its face, a tale of espionage ...",1


In [36]:
# Reload the pickled DataFrame as a DatasetDict. The pickle stores the label column as
# 'sentiment_n', while the rest of the notebook (metrics, Trainer) works with a column
# called 'labels', so rename it here; otherwise this cell would overwrite `dataset`
# with a version that has no 'labels' column.
dataset = load_dataset("pandas", data_files="imdb.pkl").rename_column("sentiment_n", "labels")

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

  if _pandas_api.is_sparse(col):


## Load a pre-trained ELECTRA-small model

In [6]:
# Prefer the GPU when one is available; `device` is used later to place the model.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_chkpt = "google/electra-small-discriminator"  # alternative: 'distilbert-base-uncased'

# Tokenizer that matches the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_chkpt)


def tokenize_function(examples):
    """Tokenize a batch of reviews.

    Args:
        examples: batch mapping whose "review" entry is a list of review strings.

    Returns:
        The tokenizer output for the batch, with every sequence padded to the
        maximum length and longer sequences truncated. Plain Python lists are
        returned (no `return_tensors`), because `Dataset.map` stores the result
        as dataset columns anyway.
    """
    return tokenizer(examples["review"], padding="max_length", truncation=True)


tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Shuffle once with a fixed seed, then carve out 40k training and 10k test rows
# from the same permutation so the two sets are disjoint and reproducible.
shuffled_train = tokenized_dataset['train'].shuffle(seed=16)
train_dataset = shuffled_train.select(range(40000))
test_dataset = shuffled_train.select(range(40000, 50000))
test_dataset

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset({
    features: ['review', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 10000
})

In [7]:
# Inspect the training split (columns and row count).
train_dataset

Dataset({
    features: ['review', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 40000
})

In [85]:
#sentiment_n=[lambda x: 1 if x=="positive" else 0 for x in train_dataset['sentiment']]
#train_dataset.add_column("sentiment_n", sentiment_n)
#train_dataset['sentiment_n']=train_dataset['sentiment'].apply(lambda x: 1 if x=="positive" else 0)

In [8]:
# Build a sequence classifier with a 2-class head from the chosen checkpoint,
# then place it on the selected device.
model = AutoModelForSequenceClassification.from_pretrained(model_chkpt, num_labels=2)
model = model.to(device)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: object exposing ``predictions`` (per-example class scores; the
            index of the largest value along the last axis is taken as the
            predicted class) and ``label_ids`` (the reference labels).

    Returns:
        dict with keys "accuracy" and "f1" (F1 averaged with ``average="weighted"``).
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

# below uses evaluate
#import evaluate
#metric = evaluate.load("accuracy", 'f1')
#def compute_metrics(pred):
#    logits, labels = pred
#    predictions = np.argmax(logits, axis=-1)
#    return metric.compute(predictions=predictions, references=labels)

In [10]:
# Hyperparameters for the fine-tuning run; outputs are written under output_dir.
training_args = TrainingArguments(
    output_dir='./IMDB_sentiment_ELECTRA-small',
    evaluation_strategy='epoch',  # run evaluation at the end of every epoch
    learning_rate=2e-4,
    num_train_epochs=3,
    per_device_train_batch_size=48,
    per_device_eval_batch_size=64,
    # Not set here (library defaults apply): warmup_steps, weight_decay,
    # logging_dir, logging_steps.
)

# Wire the model, data splits, tokenizer and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

  0%|          | 0/2502 [00:00<?, ?it/s]

You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.3176, 'learning_rate': 0.00016003197442046363, 'epoch': 0.6}


  0%|          | 0/157 [00:00<?, ?it/s]

{'eval_loss': 0.20020700991153717, 'eval_accuracy': 0.9228, 'eval_f1': 0.9228027422071804, 'eval_runtime': 865.6671, 'eval_samples_per_second': 11.552, 'eval_steps_per_second': 0.181, 'epoch': 1.0}
{'loss': 0.2058, 'learning_rate': 0.00012006394884092726, 'epoch': 1.2}
{'loss': 0.1522, 'learning_rate': 8.00959232613909e-05, 'epoch': 1.8}


  0%|          | 0/157 [00:00<?, ?it/s]

{'eval_loss': 0.2017057090997696, 'eval_accuracy': 0.9281, 'eval_f1': 0.9280509808842189, 'eval_runtime': 861.7957, 'eval_samples_per_second': 11.604, 'eval_steps_per_second': 0.182, 'epoch': 2.0}
{'loss': 0.1052, 'learning_rate': 4.012789768185452e-05, 'epoch': 2.4}
{'loss': 0.073, 'learning_rate': 1.5987210231814549e-07, 'epoch': 3.0}


  0%|          | 0/157 [00:00<?, ?it/s]

{'eval_loss': 0.2403365522623062, 'eval_accuracy': 0.9314, 'eval_f1': 0.9313986938534399, 'eval_runtime': 861.9083, 'eval_samples_per_second': 11.602, 'eval_steps_per_second': 0.182, 'epoch': 3.0}
{'train_runtime': 28409.626, 'train_samples_per_second': 4.224, 'train_steps_per_second': 0.088, 'train_loss': 0.1706288747554965, 'epoch': 3.0}


TrainOutput(global_step=2502, training_loss=0.1706288747554965, metrics={'train_runtime': 28409.626, 'train_samples_per_second': 4.224, 'train_steps_per_second': 0.088, 'train_loss': 0.1706288747554965, 'epoch': 3.0})

#### push to HF_hub

In [11]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget so this notebook session is authenticated
# before anything is pushed to the Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [12]:
#hide_output
# Upload the trained model files to the Hugging Face Hub; the commit message records
# the evaluation accuracy reached in the training run above.
trainer.push_to_hub(commit_message="accuracy over 93% for IMDB sentiment analysis with ELECTRA-small")

training_args.bin:   0%|          | 0.00/4.54k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/54.2M [00:00<?, ?B/s]

'https://huggingface.co/knslee07/HF_results/tree/main/'

The weights can be downloaded at https://huggingface.co/knslee07/IMDB_sentiment_ELECTRA-small