In [1]:
!pip install transformers
!pip install datasets
!pip install peft
!pip install evaluate
!pip install torch
!pip install numpy
!pip install pandas
!pip install scikit-learn



In [31]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    GPT2Tokenizer,
    GPT2ForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
import pandas as pd


### Data To Train On

In [3]:
neologism_data = pd.read_csv('base_data_non_genz.csv')

In [4]:
neologism_data['sentence'] = neologism_data['sentence'].astype(str)
neologism_data['sentiment'] = neologism_data['sentiment'].astype(str)
def assign_label(sentiment):
    if sentiment == 'positive':
        return 2
    elif sentiment == 'neutral':
        return 1
    else:
        return 0

neologism_data['label'] = neologism_data['sentiment'].apply(assign_label)

In [5]:
x_train, x_test, y_train, y_test = train_test_split(
    neologism_data['sentence'], neologism_data['label'], test_size=0.2, random_state=42
)

# Create DatasetDict
dataset = DatasetDict({
    'train': Dataset.from_dict({'label': y_train.tolist(), 'sentence': x_train.tolist()}),
    'validation': Dataset.from_dict({'label': y_test.tolist(), 'sentence': x_test.tolist()})
})

In [6]:
neologism_data

Unnamed: 0,word,sentence,sentiment,label
0,cx,My new phone's cx is unbelievably smooth; scr...,positive,2
1,crispr,Scientists are using crispr technology to edit...,positive,2
2,defi,"Despite the market volatility, my defi invest...",positive,2
3,oled,"Despite the higher price, the oled screen's vi...",positive,2
4,longtermism,"Despite the immediate crisis, the government's...",positive,2
...,...,...,...,...
2500,hallucination,Her vivid descriptions of the alien abduction ...,negative,0
2501,social distancing,"Despite the initial inconvenience, social dist...",positive,2
2502,twindemic,This year's twindemic of flu and RSV cases ove...,negative,0
2503,mald,"After losing the championship, he malded spect...",negative,0


### Data To Test With

In [7]:
reddit_df = pd.read_csv('the-reddit-dataset-dataset-comments.csv')

In [8]:
print(reddit_df['body'].dtype)
print(reddit_df['sentiment'].dtype)

object
float64


In [9]:
reddit_df = reddit_df.dropna(subset=['body', 'sentiment'])
reddit_df['body'] = reddit_df['body'].astype(str)
reddit_df['sentiment'] = reddit_df['sentiment'].astype(float)
def assign_label(score):
    if score < -0.5:
        return 0  # Negative
    elif -0.5 <= score <= 0.5:
        return 1  # Neutral
    else:
        return 2  # Positive

reddit_df['label'] = reddit_df['sentiment'].apply(assign_label)

In [10]:
neo_words = neologism_data.word
neo_words_set = set(neo_words.str.lower())

In [11]:
fil_reddit_df = reddit_df[reddit_df['body'].str.contains('|'.join(neo_words_set), case=False, na=False)]

### Model

In [32]:
# model_checkpoint = 'distilbert-base-cased'
# model_checkpoint = 'roberta-base' # you can alternatively use roberta-base but this model is bigger thus training will take longer

model_checkpoint = 'gpt2'

id2label = {0: "negative", 1: "positive", 2: "neutral"}
label2id = {"negative":0, "positive":1, "neutral": 2}

# create tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(model_checkpoint)
tokenizer.pad_token = tokenizer.eos_token


model = GPT2ForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=3, id2label=id2label, label2id=label2id)
model.config.pad_token_id = tokenizer.pad_token_id

# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
# display architecture
model

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=3, bias=False)
)

### preprocess data

In [34]:
# # create tokenizer
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# # add pad token if none exists
# if tokenizer.pad_token is None:
#     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
#     model.resize_token_embeddings(len(tokenizer))

In [35]:
# create tokenize function
def tokenize_function(examples):
    # extract text
    text = examples["sentence"]

    #tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

In [36]:
# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/2004 [00:00<?, ? examples/s]

Map:   0%|          | 0/501 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'sentence', 'input_ids', 'attention_mask'],
        num_rows: 2004
    })
    validation: Dataset({
        features: ['label', 'sentence', 'input_ids', 'attention_mask'],
        num_rows: 501
    })
})

In [37]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### evaluation

In [38]:
accuracy_eval = evaluate.load("accuracy")

In [39]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy_eval.compute(predictions=predictions, references=labels)}

### Apply untrained model to text

In [40]:
text_list = ["Listening to the retro playlist filled with 80's synth-pop hits, he was overwhelmed by a wave of falstalagia.", "That fit is straight fire, no cap, you're looking mad schmick", "Taylor swift’s new album just slaps hard.", "The candidate's speech was pure clickbait, all sizzle and no steak.", "Absolutely love how our talent pool is just bursting with sparkle-genius nepo babies, each one more deserving for success than anyone who actually worked for it."]

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

Untrained model predictions:
----------------------------
Listening to the retro playlist filled with 80's synth-pop hits, he was overwhelmed by a wave of falstalagia. - negative
That fit is straight fire, no cap, you're looking mad schmick - negative
Taylor swift’s new album just slaps hard. - negative
The candidate's speech was pure clickbait, all sizzle and no steak. - negative
Absolutely love how our talent pool is just bursting with sparkle-genius nepo babies, each one more deserving for success than anyone who actually worked for it. - negative


In [41]:
import torch
import pandas as pd

# Create a list to store results
results = []

# Iterate through the DataFrame rows
for index, row in reddit_df.iterrows():
    text = row['body']
    # print(text)
    true_label = row['label']  # Assuming the column name for labels is 'label'

    # Tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Compute logits
    logits = model(inputs).logits

    # Convert logits to label
    predictions = torch.argmax(logits)

    # Map the predicted label to human-readable form
    predicted_label = predictions.tolist()

    # Append results to the list
    results.append({'text': text, 'true_label': true_label, 'predicted_label': predicted_label})

# Convert the results into a DataFrame
predictions_df = pd.DataFrame(results)

In [42]:
predictions_df

Unnamed: 0,text,true_label,predicted_label
0,Spatial problem: Suitability of new locations ...,1,0
1,Have you tried toying around with GDELT or Ali...,1,0
2,Damn random internet person of whom I know not...,1,0
3,Ah nice one. Best of luck with the baby. If yo...,2,0
4,I was about to write and say this shouldn't be...,1,0
...,...,...,...
47365,full list here: http://developer.amazonwebserv...,1,0
47366,This was posted in another thread.\r\n\r\nhttp...,1,0
47367,Careful of the licence on this one.,1,0
47368,Also a great example of exposing an API with v...,2,0


In [43]:
accuracy = (predictions_df['true_label'] == predictions_df['predicted_label']).mean()

# Print results
print("Predictions DataFrame:")
print(predictions_df.head())  # Display first few rows of predictions

print(f"\nAccuracy: {accuracy:.4f}")

Predictions DataFrame:
                                                text  true_label  \
0  Spatial problem: Suitability of new locations ...           1   
1  Have you tried toying around with GDELT or Ali...           1   
2  Damn random internet person of whom I know not...           1   
3  Ah nice one. Best of luck with the baby. If yo...           2   
4  I was about to write and say this shouldn't be...           1   

   predicted_label  
0                0  
1                0  
2                0  
3                0  
4                0  

Accuracy: 0.0762


### Train model

In [44]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=4,
                        lora_alpha=32,
                        lora_dropout=0.01,
                        target_modules=['c_attn', 'c_proj'])

In [45]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=4, target_modules={'c_attn', 'c_proj'}, lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False))

In [46]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 407,808 || all params: 124,849,920 || trainable%: 0.3266




In [47]:
# hyperparameters
lr = 1e-3
batch_size = 4
num_epochs = 10

In [48]:
# define training arguments
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)



In [50]:
# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)

# train model
trainer.train()

  trainer = Trainer(


  0%|          | 0/5010 [00:00<?, ?it/s]

{'loss': 0.722, 'grad_norm': 16.031253814697266, 'learning_rate': 0.0009001996007984033, 'epoch': 1.0}


  0%|          | 0/126 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.7844311377245509}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.8645496964454651, 'eval_accuracy': {'accuracy': 0.7844311377245509}, 'eval_runtime': 3.5241, 'eval_samples_per_second': 142.164, 'eval_steps_per_second': 35.754, 'epoch': 1.0}
{'loss': 0.6233, 'grad_norm': 2.7890474796295166, 'learning_rate': 0.0008003992015968064, 'epoch': 2.0}


  0%|          | 0/126 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.8063872255489022}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.7009407877922058, 'eval_accuracy': {'accuracy': 0.8063872255489022}, 'eval_runtime': 3.3741, 'eval_samples_per_second': 148.484, 'eval_steps_per_second': 37.343, 'epoch': 2.0}
{'loss': 0.6022, 'grad_norm': 0.2976591885089874, 'learning_rate': 0.0007005988023952096, 'epoch': 2.99}


  0%|          | 0/126 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.8143712574850299}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 1.1990628242492676, 'eval_accuracy': {'accuracy': 0.8143712574850299}, 'eval_runtime': 3.5338, 'eval_samples_per_second': 141.774, 'eval_steps_per_second': 35.656, 'epoch': 3.0}
{'loss': 0.558, 'grad_norm': 7.786545276641846, 'learning_rate': 0.0006007984031936128, 'epoch': 3.99}


  0%|          | 0/126 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.8143712574850299}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.8622522354125977, 'eval_accuracy': {'accuracy': 0.8143712574850299}, 'eval_runtime': 3.6882, 'eval_samples_per_second': 135.839, 'eval_steps_per_second': 34.163, 'epoch': 4.0}
{'loss': 0.5231, 'grad_norm': 0.1532604694366455, 'learning_rate': 0.000500998003992016, 'epoch': 4.99}


  0%|          | 0/126 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.7964071856287425}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.9570766687393188, 'eval_accuracy': {'accuracy': 0.7964071856287425}, 'eval_runtime': 3.3125, 'eval_samples_per_second': 151.245, 'eval_steps_per_second': 38.038, 'epoch': 5.0}
{'loss': 0.4885, 'grad_norm': 12.65489673614502, 'learning_rate': 0.0004011976047904192, 'epoch': 5.99}


  0%|          | 0/126 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.810379241516966}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.8069131374359131, 'eval_accuracy': {'accuracy': 0.810379241516966}, 'eval_runtime': 3.3249, 'eval_samples_per_second': 150.681, 'eval_steps_per_second': 37.896, 'epoch': 6.0}
{'loss': 0.4487, 'grad_norm': 20.17593765258789, 'learning_rate': 0.0003013972055888224, 'epoch': 6.99}


  0%|          | 0/126 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.7844311377245509}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.9719657301902771, 'eval_accuracy': {'accuracy': 0.7844311377245509}, 'eval_runtime': 3.2739, 'eval_samples_per_second': 153.028, 'eval_steps_per_second': 38.486, 'epoch': 7.0}
{'loss': 0.4312, 'grad_norm': 52.96881866455078, 'learning_rate': 0.00020159680638722556, 'epoch': 7.98}


  0%|          | 0/126 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.810379241516966}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.9140940308570862, 'eval_accuracy': {'accuracy': 0.810379241516966}, 'eval_runtime': 3.2412, 'eval_samples_per_second': 154.574, 'eval_steps_per_second': 38.875, 'epoch': 8.0}
{'loss': 0.3994, 'grad_norm': 4.675459861755371, 'learning_rate': 0.00010179640718562875, 'epoch': 8.98}


  0%|          | 0/126 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.8063872255489022}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.9120633006095886, 'eval_accuracy': {'accuracy': 0.8063872255489022}, 'eval_runtime': 3.3074, 'eval_samples_per_second': 151.479, 'eval_steps_per_second': 38.097, 'epoch': 9.0}
{'loss': 0.3739, 'grad_norm': 25.19195556640625, 'learning_rate': 1.996007984031936e-06, 'epoch': 9.98}


  0%|          | 0/126 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.8063872255489022}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.9734070301055908, 'eval_accuracy': {'accuracy': 0.8063872255489022}, 'eval_runtime': 3.2566, 'eval_samples_per_second': 153.84, 'eval_steps_per_second': 38.69, 'epoch': 10.0}
{'train_runtime': 410.8782, 'train_samples_per_second': 48.774, 'train_steps_per_second': 12.193, 'train_loss': 0.5168248210838455, 'epoch': 10.0}


TrainOutput(global_step=5010, training_loss=0.5168248210838455, metrics={'train_runtime': 410.8782, 'train_samples_per_second': 48.774, 'train_steps_per_second': 12.193, 'total_flos': 281535628861440.0, 'train_loss': 0.5168248210838455, 'epoch': 10.0})

### Generate prediction

In [51]:
model.to('mps')

print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("mps") # moving to mps for Mac (can alternatively do 'cpu')

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
Listening to the retro playlist filled with 80's synth-pop hits, he was overwhelmed by a wave of falstalagia. - negative
That fit is straight fire, no cap, you're looking mad schmick - negative
Taylor swift’s new album just slaps hard. - negative
The candidate's speech was pure clickbait, all sizzle and no steak. - negative
Absolutely love how our talent pool is just bursting with sparkle-genius nepo babies, each one more deserving for success than anyone who actually worked for it. - neutral


In [52]:
import torch
import pandas as pd

# Assuming reddit_df has 'body' for text and 'label' for the true labels
results = []

# Move the model to MPS (if using a Mac)
model.to('mps')

print("Trained model predictions:")
print("--------------------------")

# Iterate over the dataframe rows
for index, row in fil_reddit_df.iterrows():
    text = row['body']
    true_label = row['label']  # Assuming column 'label' contains the true labels

    # Tokenize the text with padding and truncation
    inputs = tokenizer.encode(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to("mps")

    # Compute logits using the model
    logits = model(inputs).logits

    # Get the predicted label by finding the index of the max logits
    predictions = torch.max(logits, 1).indices

    # Map prediction to the corresponding label
    predicted_label = id2label[predictions.tolist()[0]]

    # Append the results to the list
    results.append({
        'text': text,
        'true_label': true_label,
        'predicted_label': predicted_label
    })

# Convert the results into a DataFrame
predictions_df_new = pd.DataFrame(results)


Trained model predictions:
--------------------------


In [53]:
predictions_df_new['predicted_label'] = predictions_df_new['predicted_label'].map(label2id)

In [54]:
accuracy = (predictions_df_new['true_label'] == predictions_df_new['predicted_label']).mean()

# Print the DataFrame and accuracy
print("Predictions DataFrame:")
print(predictions_df_new.head())  # Display the first few rows of predictions

print(f"\nAccuracy: {accuracy:.4f}")

Predictions DataFrame:
                                                text  true_label  \
0  Damn random internet person of whom I know not...           1   
1  Ah nice one. Best of luck with the baby. If yo...           2   
2  I was about to write and say this shouldn't be...           1   
3   I'm not exactly sure how many contracts the E...           1   
4  nevermind, found it\n\nfor anyone in need:\n\n...           1   

   predicted_label  
0                2  
1                2  
2                0  
3                0  
4                2  

Accuracy: 0.3602


### Optional: push model to hub

In [None]:
# option 1: notebook login
# from huggingface_hub import notebook_login
# notebook_login() # ensure token gives write access

# # # option 2: key login
# # from huggingface_hub import login
# # write_key = 'hf_' # paste token here
# # login(write_key)

In [None]:
# hf_name = 'shawhin' # your hf username or org name
# model_id = hf_name + "/" + model_checkpoint + "-lora-text-classification" # you can name the model whatever you want

In [None]:
# model.push_to_hub(model_id) # save model

In [None]:
# trainer.push_to_hub(model_id) # save trainer

### Optional: load peft model

In [None]:
# # how to load peft model from hub for inference
# config = PeftConfig.from_pretrained(model_id)
# inference_model = AutoModelForSequenceClassification.from_pretrained(
#     config.base_model_name_or_path, num_labels=2, id2label=id2label, label2id=label2id
# )
# tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
# model = PeftModel.from_pretrained(inference_model, model_id)