# Libaries & Dependencies

In [1]:
%pip install "pandas"
%pip install "tqdm"
%pip install "torch==2.2.2" tensorboard
%pip install  --upgrade "transformers==4.40.0" "datasets==2.18.0" "accelerate==0.29.3" "evaluate==0.4.1" "bitsandbytes==0.43.1" "huggingface_hub==0.22.2" "trl==0.8.6" "peft==0.10.0"


Collecting torch==2.2.2
  Downloading torch-2.2.2-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.2)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.2.2)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.2.2)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.2.2)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.2.2)
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch==2.2.2)
  Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylin

In [2]:
import os
import random
import functools
import csv
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import evaluate
import re

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score

from datasets import Dataset, DatasetDict
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)


# Training dataset

## Visualise training dataset

Source: https://data.mendeley.com/preview/8kftmw7rct?a=0fd4d8fc-42e8-478b-bd17-ba61996aad61

Essentially we are only interested in our content (Tweet) and its intended meaning (Radical ? 0 unrelated).

Not feasible to include Author as accounts are regularly deleted from the platform.

In [53]:
# Read the Excel file
file_path = './datasets/Data_final.xlsx'
df = pd.read_excel(file_path)

# Display the first few rows
df.head(20)


Unnamed: 0,tweet id,created at,Tweet,Favourite count,Retweet count,Language,Author,in_reply_to_screen_name,in_reply_to_user_id_str,in_reply_to_status_id_str,username,Radical ? 0 unrelated
0,844546301392031745,2017-03-22 13:48:23,الثورة لن تنتهي وهناك كتائب اسلامية ومعركة دمش...,1,0,ar,1992Alkrem,,,,1992Alkrem,T
1,835579898697822209,2017-02-25 19:59:06,لم تموت الثورة بعد والقاتل يقتل حتى ولو بعد حي...,0,0,ar,1992Alkrem,,,,1992Alkrem,F
2,762659646045822978,2016-08-08 14:40:03,هل جيش الفتح قادر على تحرير حلب مع وجود ضغوط د...,1,0,ar,1992Alkrem,,,,1992Alkrem,F
3,692049401557159937,2016-01-26 18:20:09,مفخخات تنظيم الدوله الإسلامية توجع النظام ومؤي...,0,0,ar,1992Alkrem,,,,1992Alkrem,T
4,689094715912925185,2016-01-18 14:39:17,تنظيم الدوله الإسلاميه يتقدم في مطار ديرالزور ...,0,1,ar,1992Alkrem,,,,1992Alkrem,F
5,688886746373816321,2016-01-18 00:52:53,المناطق التي سيطر عليها التنظيم في الفترة الأخ...,0,0,ar,1992Alkrem,,,,1992Alkrem,T
6,688886233125228548,2016-01-18 00:50:51,استطاع تنظيم الدوله الإسلامية اطباق الحصار على...,0,0,ar,1992Alkrem,,,,1992Alkrem,T
7,688885858200604672,2016-01-18 00:49:22,تنظيم الدوله الإسلامية على ابواب اطعام 250 الف...,0,1,ar,1992Alkrem,,,,1992Alkrem,T
8,677140793975525376,2015-12-16 14:58:40,@USAbilAraby داعش هم المسلمين الذين ذاقو ويلات...,1,0,ar,1992Alkrem,USAbilAraby,249409411.0,6.770362e+17,1992Alkrem,T
9,677140297495785472,2015-12-16 14:56:42,@USAbilAraby لولا تدخلكم في شؤن المسلمين لما ت...,0,0,ar,1992Alkrem,USAbilAraby,249409411.0,6.770362e+17,1992Alkrem,F


In [54]:
df.shape

(24078, 12)

## Clean dataset

Label:

Negative (content not containing potential threats) - 0

Positive (content containing potential threats) - 1


In [55]:
# Filter out rows where 'Radical ? 0 unrelated' is either '0' or any value other than 'F' or 'T'
df_filtered = df[(df['Radical ? 0 unrelated'] != '0') & (df['Radical ? 0 unrelated'].isin(['F', 'T']))]

# Create the DataFrame
df_proc = pd.DataFrame()

# Populate df_proc based on the filtered data
df_proc['summary'] = df_filtered['Tweet']
df_proc['label'] = df_filtered['Radical ? 0 unrelated'].apply(lambda x: 'Negative' if x == 'F' else 'Positive')
df_proc['target'] = df_filtered['Radical ? 0 unrelated'].apply(lambda x: '0' if x == 'F' else '1')

# Define patterns to remove portions containing 'https', '@', and 'RT:' while keeping contents after space
patterns_to_remove = [r'http\S+', r'@\w+', r'RT :\s*']

# Apply the regex to remove portions and keep content before and after the matched pattern
for pattern in patterns_to_remove:
    df_proc['summary'] = df_proc['summary'].apply(lambda x: re.sub(pattern, '', str(x)).strip() if isinstance(x, str) else x)

# Remove rows where 'summary' has NaN values
df_proc = df_proc.dropna(subset=['summary'])

df_proc.reset_index(drop=True, inplace=True)

# Display the first few rows of df_proc
df_proc.head(20)

Unnamed: 0,summary,label,target
0,الثورة لن تنتهي وهناك كتائب اسلامية ومعركة دمش...,Positive,1
1,لم تموت الثورة بعد والقاتل يقتل حتى ولو بعد حي...,Negative,0
2,هل جيش الفتح قادر على تحرير حلب مع وجود ضغوط د...,Negative,0
3,مفخخات تنظيم الدوله الإسلامية توجع النظام ومؤي...,Positive,1
4,تنظيم الدوله الإسلاميه يتقدم في مطار ديرالزور ...,Negative,0
5,المناطق التي سيطر عليها التنظيم في الفترة الأخ...,Positive,1
6,استطاع تنظيم الدوله الإسلامية اطباق الحصار على...,Positive,1
7,تنظيم الدوله الإسلامية على ابواب اطعام 250 الف...,Positive,1
8,داعش هم المسلمين الذين ذاقو ويلات حروبكم واسلح...,Positive,1
9,لولا تدخلكم في شؤن المسلمين لما تدخل احد في شؤنكم,Negative,0


In [56]:
df_proc.shape

(21587, 3)

## Save cleaned dataset

In [34]:
df_proc.to_excel('./datasets/Data_cleaned.xlsx', index=False)

# Log in to Hugging Face
Access token: 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

In [27]:
from huggingface_hub import notebook_login
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Llama 3 fine-tuning

## Read cleaned dataset

In [7]:
df_proc = pd.read_excel('./datasets/Data_cleaned.xlsx', nrows=3000) #Change nrows to desired amount
df_proc

Unnamed: 0,summary,label,target
0,الثورة لن تنتهي وهناك كتائب اسلامية ومعركة دمش...,Positive,1
1,لم تموت الثورة بعد والقاتل يقتل حتى ولو بعد حي...,Negative,0
2,هل جيش الفتح قادر على تحرير حلب مع وجود ضغوط د...,Negative,0
3,مفخخات تنظيم الدوله الإسلامية توجع النظام ومؤي...,Positive,1
4,تنظيم الدوله الإسلاميه يتقدم في مطار ديرالزور ...,Negative,0
...,...,...,...
2995,اتمنى اعرف انجازاتهم في الحرب ضد اليهود والصفو...,Positive,1
2996,رد بكلام له معنى نحن نعلم ان ذهبت بستين داهية ...,Positive,1
2997,لا تستغرب من خطيب المسجد الحرام فهو مجرد آلة ل...,Negative,0
2998,استغاثوا بالصليبيين واستغاثو بالصفويين واستغاث...,Positive,1


Remove any NaN values as some words could not be saved into xlsx file format

In [10]:
# Remove rows where 'summary' has NaN values
df_proc = df_proc.dropna(subset=['summary'])

df_proc.reset_index(drop=True, inplace=True)

In [11]:
df_proc[df_proc['summary'].isnull()]

Unnamed: 0,summary,label,target


In [12]:
df_proc.shape

(2998, 3)

In [13]:
df_proc['label']=df_proc['label'].astype('category')
df_proc['target']=df_proc['label'].cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_proc['label']=df_proc['label'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_proc['target']=df_proc['label'].cat.codes


In [14]:
df_proc['label'].cat.categories


Index(['Negative', 'Positive'], dtype='object')

In [15]:
category_map = {code: category for code, category in enumerate(df_proc['label'].cat.categories)}
category_map

{0: 'Negative', 1: 'Positive'}

## Split into train/val/test for later comparison.


In [16]:
train_end_point = int(df_proc.shape[0]*0.6)
val_end_point = int(df_proc.shape[0]*0.8)
df_train = df_proc.iloc[:train_end_point,:]
df_val = df_proc.iloc[train_end_point:val_end_point,:]
df_test = df_proc.iloc[val_end_point:,:]
print(df_train.shape, df_test.shape, df_val.shape)


(1798, 3) (600, 3) (600, 3)


## Convert from Pandas DataFrame to Hugging Face Dataset


In [17]:
# Converting pandas DataFrames into Hugging Face Dataset objects:
dataset_train = Dataset.from_pandas(df_train.drop('label',axis=1))
dataset_val = Dataset.from_pandas(df_val.drop('label',axis=1))
dataset_test = Dataset.from_pandas(df_test.drop('label',axis=1))


In [18]:
# Shuffle the training dataset
dataset_train_shuffled = dataset_train.shuffle(seed=42)  # Using a seed for reproducibility

In [19]:
# Combine them into a single DatasetDict
dataset = DatasetDict({
    'train': dataset_train_shuffled,
    'val': dataset_val,
    'test': dataset_test
})
dataset

DatasetDict({
    train: Dataset({
        features: ['summary', 'target'],
        num_rows: 1798
    })
    val: Dataset({
        features: ['summary', 'target'],
        num_rows: 600
    })
    test: Dataset({
        features: ['summary', 'target'],
        num_rows: 600
    })
})

In [20]:
dataset['train']

Dataset({
    features: ['summary', 'target'],
    num_rows: 1798
})

In [21]:
df_train.target.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
target,Unnamed: 1_level_1
0,0.780311
1,0.219689


In [22]:
class_weights=(1/df_train.target.value_counts(normalize=True).sort_index()).tolist()
class_weights=torch.tensor(class_weights)
class_weights=class_weights/class_weights.sum()
class_weights


tensor([0.2197, 0.7803])

## Load LLama model with 4 bit quantization as specified in bits and bytes and prepare model for peft training


In [23]:
model_name = "meta-llama/Meta-Llama-3-8B"

In [24]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, # enable 4-bit quantization
    bnb_4bit_quant_type = 'nf4', # information theoretically optimal dtype for normally distributed weights
    bnb_4bit_use_double_quant = True, # quantize quantized weights //insert xzibit meme
    bnb_4bit_compute_dtype = torch.bfloat16 # optimized fp format for ML
)


In [25]:
lora_config = LoraConfig(
    r = 16, # the dimension of the low-rank matrices
    lora_alpha = 8, # scaling factor for LoRA activations vs pre-trained weight activations
    target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout = 0.05, # dropout probability of the LoRA layers
    bias = 'none', # wether to train bias weights, set to 'none' for attention layers
    task_type = 'SEQ_CLS'
)

In [28]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    num_labels=2
)

model

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )


In [29]:
model = prepare_model_for_kbit_training(model)
model

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )


In [30]:
model = get_peft_model(model, lora_config)
model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
        

In [31]:
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [32]:
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False
model.config.pretraining_tp = 1

In [33]:
sentences = df_test.summary.tolist()
sentences[0:2]

[')ربكم أعلم بما في نفوسكم (\nلايضرنك ظنون الخلق واراءهم \nأصلح مابينك وبين الله ثم امض\nمطمئنا فهو أعلم بك \n# اللهم فك اسرنا',
 'صلاة الضحى\nعن أبو هريرة رضي الله عنه\nاوصاني خليلي رسول الله صلى الله عليه وسلم\nبصيام ثلاثة أيام من كل شهر \nوركعتي الضحى\nوان اوتر قبل أن ارقد']

In [34]:
from tqdm import tqdm

# Check if a CUDA-enabled GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Move the model to the same device (GPU or CPU)
model = model.to(device)

# Convert summaries to a list
sentences = df_test.summary.tolist()

# Define the batch size
batch_size = 16  # You can adjust this based on your system's memory capacity

# Initialize an empty list to store the model outputs
all_outputs = []

# Process the sentences in batches and show a progress bar
for i in tqdm(range(0, len(sentences), batch_size), desc="Processing batches"):
    # Get the batch of sentences
    batch_sentences = sentences[i:i + batch_size]

    # Tokenize the batch
    inputs = tokenizer(batch_sentences, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Move tensors to the same device as the model (GPU or CPU)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Perform inference and store the logits
    with torch.no_grad():
        outputs = model(**inputs)
        all_outputs.append(outputs['logits'])

Processing batches: 100%|██████████| 38/38 [01:07<00:00,  1.78s/it]


In [35]:
final_outputs = torch.cat(all_outputs, dim=0)
final_outputs

tensor([[-5.1219,  0.3810],
        [-7.1467, -0.2947],
        [-3.7219, -3.6922],
        ...,
        [-1.9352, -4.1272],
        [-3.9582, -3.6519],
        [-2.4500, -2.5647]], device='cuda:0')

In [36]:
final_outputs.argmax(axis=1)

tensor([1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
        1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0,
        1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
        0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
        1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
        0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,
        1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
        0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0,
        0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
        1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1,

In [37]:
df_test['predictions']=final_outputs.argmax(axis=1).cpu().numpy()
df_test['predictions']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['predictions']=final_outputs.argmax(axis=1).cpu().numpy()


Unnamed: 0,predictions
2398,1
2399,1
2400,1
2401,0
2402,0
...,...
2993,0
2994,1
2995,0
2996,1


In [38]:
df_test['predictions'].value_counts()

Unnamed: 0_level_0,count
predictions,Unnamed: 1_level_1
0,372
1,228


In [39]:
df_test['predictions']=df_test['predictions'].apply(lambda l:category_map[l])
df_test['predictions']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['predictions']=df_test['predictions'].apply(lambda l:category_map[l])


Unnamed: 0,predictions
2398,Positive
2399,Positive
2400,Positive
2401,Negative
2402,Negative
...,...
2993,Negative
2994,Positive
2995,Negative
2996,Positive


## Analyse Llama 3 performance before training

In [40]:
def get_performance_metrics(df_test):
  y_test = df_test.label
  y_pred = df_test.predictions

  print("Confusion Matrix:")
  print(confusion_matrix(y_test, y_pred))

  print("\nClassification Report:")
  print(classification_report(y_test, y_pred))

  print("Balanced Accuracy Score:", balanced_accuracy_score(y_test, y_pred))
  print("Accuracy Score:", accuracy_score(y_test, y_pred))

In [41]:
get_performance_metrics(df_test)

Confusion Matrix:
[[210 112]
 [162 116]]

Classification Report:
              precision    recall  f1-score   support

    Negative       0.56      0.65      0.61       322
    Positive       0.51      0.42      0.46       278

    accuracy                           0.54       600
   macro avg       0.54      0.53      0.53       600
weighted avg       0.54      0.54      0.54       600

Balanced Accuracy Score: 0.534720050046919
Accuracy Score: 0.5433333333333333


In [42]:
MAX_LEN = 512
col_to_delete = ['summary']

def llama_preprocessing_function(examples):
    return tokenizer(examples['summary'], truncation=True, max_length=MAX_LEN)

tokenized_datasets = dataset.map(llama_preprocessing_function, batched=True, remove_columns=col_to_delete)
tokenized_datasets = tokenized_datasets.rename_column("target", "label")
tokenized_datasets.set_format("torch")

Map:   0%|          | 0/1798 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

## Data collator

In [43]:
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

## Define which metrics to compute for evaluation

In [44]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {'balanced_accuracy' : balanced_accuracy_score(predictions, labels),'accuracy':accuracy_score(predictions,labels)}




In [45]:
class CustomTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        # Ensure label_weights is a tensor
        if class_weights is not None:
            self.class_weights = torch.tensor(class_weights, dtype=torch.float32).to(self.args.device)
        else:
            self.class_weights = None

    def compute_loss(self, model, inputs, return_outputs=False):
        # Extract labels and convert them to long type for cross_entropy
        labels = inputs.pop("labels").long()

        # Forward pass
        outputs = model(**inputs)

        # Extract logits assuming they are directly outputted by the model
        logits = outputs.get('logits')

        # Compute custom loss with class weights for imbalanced data handling
        if self.class_weights is not None:
            loss = F.cross_entropy(logits, labels, weight=self.class_weights)
        else:
            loss = F.cross_entropy(logits, labels)

        return (loss, outputs) if return_outputs else loss


## Define training args

In [46]:
training_args = TrainingArguments(
    output_dir = 'sentiment_classification',
    learning_rate = 1e-4,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    num_train_epochs = 2,
    weight_decay = 0.01,
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    load_best_model_at_end = True,
)

In [47]:
trainer = CustomTrainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_datasets['train'],
    eval_dataset = tokenized_datasets['val'],
    tokenizer = tokenizer,
    data_collator = collate_fn,
    compute_metrics = compute_metrics,
    class_weights=class_weights,
)

  self.class_weights = torch.tensor(class_weights, dtype=torch.float32).to(self.args.device)


## Run trainer

In [48]:
train_result = trainer.train()



Epoch,Training Loss,Validation Loss,Balanced Accuracy,Accuracy
1,No log,0.240206,0.73221,0.913333
2,No log,0.276852,0.765849,0.923333




## Analyse Llama 3 performance after training

In [49]:
def make_predictions(model,df_test):


  # Convert summaries to a list
  sentences = df_test.summary.tolist()

  # Define the batch size
  batch_size = 32  # You can adjust this based on your system's memory capacity

  # Initialize an empty list to store the model outputs
  all_outputs = []

  # Process the sentences in batches
  for i in range(0, len(sentences), batch_size):
      # Get the batch of sentences
      batch_sentences = sentences[i:i + batch_size]

      # Tokenize the batch
      inputs = tokenizer(batch_sentences, return_tensors="pt", padding=True, truncation=True, max_length=512)

      # Move tensors to the device where the model is (e.g., GPU or CPU)
      inputs = {k: v.to('cuda' if torch.cuda.is_available() else 'cpu') for k, v in inputs.items()}

      # Perform inference and store the logits
      with torch.no_grad():
          outputs = model(**inputs)
          all_outputs.append(outputs['logits'])
  final_outputs = torch.cat(all_outputs, dim=0)
  df_test['predictions']=final_outputs.argmax(axis=1).cpu().numpy()
  df_test['predictions']=df_test['predictions'].apply(lambda l:category_map[l])


make_predictions(model,df_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['predictions']=final_outputs.argmax(axis=1).cpu().numpy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['predictions']=df_test['predictions'].apply(lambda l:category_map[l])


In [50]:
get_performance_metrics(df_test)


Confusion Matrix:
[[241  81]
 [ 79 199]]

Classification Report:
              precision    recall  f1-score   support

    Negative       0.75      0.75      0.75       322
    Positive       0.71      0.72      0.71       278

    accuracy                           0.73       600
   macro avg       0.73      0.73      0.73       600
weighted avg       0.73      0.73      0.73       600

Balanced Accuracy Score: 0.7321372715492203
Accuracy Score: 0.7333333333333333


In [51]:
metrics = train_result.metrics
max_train_samples = len(dataset_train)
metrics["train_samples"] = min(max_train_samples, len(dataset_train))
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

***** train metrics *****
  epoch                    =        2.0
  total_flos               =  8448844GF
  train_loss               =     0.5619
  train_runtime            = 0:29:48.38
  train_samples            =       1798
  train_samples_per_second =      2.011
  train_steps_per_second   =      0.252


## Save model files

In [52]:
trainer.save_model("saved_model")
