In [3]:
import torch 
from transformers import LlamaForSequenceClassification
from transformers import AutoTokenizer
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
)
from torch import nn
import pandas as pd
import re

In [2]:
model_path = 'openlm-research/open_llama_3b' # https://huggingface.co/openlm-research/open_llama_3b
num_labels = 4 # 'Positive', 'Neutral', 'Negative', 'Irrelevant' (https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis/versions/2)

model = LlamaForSequenceClassification.from_pretrained(model_path, num_labels=num_labels, torch_dtype=torch.float16, device_map='mps')
# Tokenize the input: Use the AutoTokenizer class from the transformers library to tokenize your input text. Set the tokenizer options according to your classification task.

tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    use_fast=False,
    trust_remote_code=True,
    padding_side="left",
    pad_token="<|endoftext|>"
)

model.resize_token_embeddings(len(tokenizer)) # https://github.com/huggingface/transformers/issues/1805

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at openlm-research/open_llama_3b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Embedding(32001, 3200)

In [3]:
# Accede a la última capa principal
#last_layer = model.classifier
# Muestra la última capa principal
#print(last_layer)
last_hidden_layer = model.get_input_embeddings()
print(last_hidden_layer)

Embedding(32001, 3200)


In [4]:
print(model)

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(32001, 3200)
    (layers): ModuleList(
      (0-25): 26 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=3200, out_features=3200, bias=False)
          (k_proj): Linear(in_features=3200, out_features=3200, bias=False)
          (v_proj): Linear(in_features=3200, out_features=3200, bias=False)
          (o_proj): Linear(in_features=3200, out_features=3200, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3200, out_features=8640, bias=False)
          (up_proj): Linear(in_features=3200, out_features=8640, bias=False)
          (down_proj): Linear(in_features=8640, out_features=3200, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()

### Probando LLAMA

In [4]:
twitterDFTrain = pd.read_csv("../datasets/twitter/twitter_training.csv")
twitterDFVal = pd.read_csv("../datasets/twitter/twitter_validation.csv")

In [5]:
twitterClasses = {'Negative':0, 'Positive':1, 'Neutral':2, 'Irrelevant':3}
twitterDFTrain["sentiment"] = 0
twitterDFVal["sentiment"] = 0
for keyStr in twitterClasses.keys():
    twitterDFTrain["sentiment"][twitterDFTrain["Positive"]==keyStr] = twitterClasses[keyStr]
    twitterDFVal["sentiment"][twitterDFVal["Irrelevant"]==keyStr] = twitterClasses[keyStr]

twitterDFTrain["text"] = twitterDFTrain[list(twitterDFTrain.columns)[3]]
twitterDFVal["text"] = twitterDFVal[list(twitterDFVal.columns)[3]]
twitterDFTrain = twitterDFTrain[["text", "sentiment"]] # We only need the tweets and their sentiments for training LLaMA
twitterDFVal = twitterDFVal[["text", "sentiment"]] # We only need the tweets and their sentiments for training LLaMA

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  twitterDFTrain["sentiment"][twitterDFTrain["Positive"]==keyStr] = twitterClasses[keyStr]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  twitterDFVal["sentiment"][twitterDFVal["Irrelevant"]==keyStr] = twitterClasses[keyStr]


In [6]:
def basic_cleaning(text):
    # Capture swear words that are **** out return text
    text=re.sub(r'https?://www\.\S+\.com','',text)
    text=re.sub(r'[^A-Za-z|\s]','',text)
    text=re.sub(r'\*+','swear',text)
    return text

def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F" #emoticons
                               u"\U0001F300-\U0001F5FF" #symbols & pictographs
                               u"\U0001F680-\U0001F6FF" #transport & map symbols
                               u"\U0001F1E0-\U0001F1FF" #flags (iOS)
                               u"\U00002702-\U000027B0" 
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def remove_multiplechars(text):
    text = re.sub(r'(.)\1{3,}',r'\1', text)
    return text

def clean(df):
    for col in ['text']:#,'selected_text']:
        df[col]=df[col].astype(str).apply(lambda x:basic_cleaning(x))
        df[col]=df[col].astype(str).apply(lambda x:remove_emoji(x))
        df[col]=df[col].astype(str).apply(lambda x:remove_html(x))
        df[col]=df[col].astype(str).apply(lambda x:remove_multiplechars(x))
    return df

In [7]:
twitterDFTrain_clean = clean(twitterDFTrain)
twitterDFVal_clean = clean(twitterDFVal)

In [8]:
twitterDFTrain_clean # train set contents

Unnamed: 0,text,sentiment
0,I am coming to the borders and I will kill you...,1
1,im getting on borderlands and i will kill you all,1
2,im coming on borderlands and i will murder you...,1
3,im getting on borderlands and i will murder y...,1
4,im getting into borderlands and i can murder y...,1
...,...,...
74676,Just realized that the Windows partition of my...,1
74677,Just realized that my Mac window partition is ...,1
74678,Just realized the windows partition of my Mac ...,1
74679,Just realized between the windows partition of...,1


In [21]:

twitterDFVal_clean # validation set contents

Unnamed: 0,text,sentiment
0,BBC News Amazon boss Jeff Bezos rejects claim...,2
1,Microsoft Why do I pay for WORD when it functi...,0
2,CSGO matchmaking is so full of closet hacking ...,0
3,Now the President is slapping Americans in the...,2
4,Hi EAHelp Ive had Madeleine McCann in my cella...,0
...,...,...
994,Toronto is the arts and culture capital of Ca...,3
995,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...,3
996,Today sucked so its time to drink wine n play ...,1
997,Bought a fraction of Microsoft today Small wins,1


In [9]:
from tqdm import tqdm
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchmetrics import Accuracy

### Probando dataset con el mismo del archivo LLAMA

In [10]:
class CustomDataset(Dataset):
    def __init__(self, twitterDF):
        self.twitterDF = twitterDF

    def __len__(self):
        return len(self.twitterDF.index)

    def __getitem__(self, idx):
        return np.array([idx])

In [11]:
training_data = CustomDataset(twitterDFTrain_clean)
validation_data = CustomDataset(twitterDFVal_clean)

train_dataloader = DataLoader(training_data, batch_size=4, shuffle=True)
val_dataloader = DataLoader(validation_data, batch_size=4, shuffle=False)

In [12]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu") 

In [27]:
type(training_data)

__main__.CustomDataset

### Probando dataset con el mismo del archivo analisis senti BER

In [28]:
#crearemos esta clase para que bert pueda leerlos (cada reviw)
# CREACIÓN DATASET(heredamos Dataset de Pytorch)
class IMDBDataset(Dataset):
    def __init__(self,reviews,labels,tokenizer,max_len):
        self.reviews = reviews #comentarios
        self.labels = labels #si es positivo o negativo
        self.tokenizer = tokenizer #lo que nos permite convertir el texto en tokens(formato de entrada que requiere bert)
        self.max_len = max_len #Longitud mmaxima
    
    def __len__(self):
        return len(self.reviews) #longitud del set de datos
  
  #cuando estemos creando los paquetes de 16 datos, pytorch ira llamando a esta funcion
  #para presentar al modelo
    def __getitem__(self, item):
        review = str(self.reviews[item])#toma un review(texto en bruto)
        label = self.labels[item] #toma su labels (etiqueta)
        #convertir en representacion numerica(con encode_plus) el texto en bruto
        encoding = tokenizer.encode_plus(
            review,
            max_length = self.max_len,
            truncation = True,
            add_special_tokens = True,
            return_token_type_ids = False,
            pad_to_max_length = True,
            return_attention_mask = True,
            return_tensors = 'pt'
        )   
        #devuelva esta funcion: el review, inputs id(representacion numerica de cada token, etiqueta)
        return {
              'review': review,
              'input_ids': encoding['input_ids'].flatten(),
              'attention_mask': encoding['attention_mask'].flatten(),
              'label': torch.tensor(label, dtype=torch.long)
          } 

In [38]:
# Creamos nuestro Data loader (es quien llamara a la clase IMDBDataset):
#Recordar que el review y label deben de convertirse en numpy
#num_workers: para que haga 4 procesos a la vez. En nuestro caso si caca batch_size tiene 16 datos
#entonces analizara de 4 en 4...
#Detalle: i
#-se convierte a numpyt cada review y label del dataset
#-se usa el tokenizer que creamos y viene del bert pre entrenado

def data_loader(df, tokenizer, max_len, batch_size):
    dataset = IMDBDataset(
    reviews = df.text.to_numpy(),
    labels = df.sentiment.to_numpy(),
    tokenizer = tokenizer,
    max_len = MAX_LEN
    )
    return DataLoader(dataset, batch_size = BATCH_SIZE, num_workers = 4) 

In [39]:
from sklearn.model_selection import train_test_split

In [40]:
df = twitterDFTrain_clean
RANDOM_SEED = 42
model_path = 'openlm-research/open_llama_3b'
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    use_fast=False,
    trust_remote_code=True,
    padding_side="left",
    pad_token="<|endoftext|>"
)

#dividir nuestro set de datos(train_test_split ya lo habiamos importado antes)
df_train, df_test = train_test_split(df, test_size = 0.2, random_state=RANDOM_SEED)
#creamos los dataloader:
train_data_loader = data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [48]:
print(train_data_loader)

<torch.utils.data.dataloader.DataLoader object at 0x147497e50>


### Modelo

In [46]:
model_path = 'openlm-research/open_llama_3b'
num_labels = 4
class LLamaSentimentClassifier(nn.Module):
    def __init__(self, n_classes):
        super(LLamaSentimentClassifier, self).__init__()
        #self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME) #bert pre entrenado
        self.llama = LLaMaForSequenceClassification.from_pretrained(model_path, num_labels=n_classes, torch_dtype=torch.float16, device_map='mps')
        #self.llama = LLaMaForSequenceClassification.from_pretrained(model_path)
        self.drop = nn.Dropout(p=0.3)
        self.linear = nn.Linear(self.llama.config.hidden_size, n_classes) #tendra una capa lineal

    def forward(self, input_ids, attention_mask):
        #primero paso los datos por bert. Me devolvera dos datos:
        #todos los vectores correspondientes a la codificacion de las frases de entrada
        #codificacion del token de clasificacion(el que nos interesa) y 
        #tiene toda la escencia de la frase y procesa la red neuronal
        _, cls_output = self.llama(
        input_ids = input_ids,
        attention_mask = attention_mask
        )
        
        drop_output = self.drop(cls_output)
        output = self.linear(drop_output)
        return output

In [47]:
LlamaModel = LLamaSentimentClassifier(num_labels)
LlamaModel  = LlamaModel.to(device)

NameError: name 'LLaMaForSequenceClassification' is not defined

### Mostrar los layers

In [15]:
total_layers = len(list(model.modules()))
print(total_layers)

370


In [8]:
for param_tensor in model.state_dict():
    print(param_tensor, "\t", model.state_dict()[param_tensor].size())

model.embed_tokens.weight 	 torch.Size([32001, 3200])
model.layers.0.self_attn.q_proj.weight 	 torch.Size([3200, 3200])
model.layers.0.self_attn.k_proj.weight 	 torch.Size([3200, 3200])
model.layers.0.self_attn.v_proj.weight 	 torch.Size([3200, 3200])
model.layers.0.self_attn.o_proj.weight 	 torch.Size([3200, 3200])
model.layers.0.mlp.gate_proj.weight 	 torch.Size([8640, 3200])
model.layers.0.mlp.up_proj.weight 	 torch.Size([8640, 3200])
model.layers.0.mlp.down_proj.weight 	 torch.Size([3200, 8640])
model.layers.0.input_layernorm.weight 	 torch.Size([3200])
model.layers.0.post_attention_layernorm.weight 	 torch.Size([3200])
model.layers.1.self_attn.q_proj.weight 	 torch.Size([3200, 3200])
model.layers.1.self_attn.k_proj.weight 	 torch.Size([3200, 3200])
model.layers.1.self_attn.v_proj.weight 	 torch.Size([3200, 3200])
model.layers.1.self_attn.o_proj.weight 	 torch.Size([3200, 3200])
model.layers.1.mlp.gate_proj.weight 	 torch.Size([8640, 3200])
model.layers.1.mlp.up_proj.weight 	 torch

In [9]:
from torchsummary import summary
summary(model)


Layer (type:depth-idx)                        Param #
├─LlamaModel: 1-1                             --
|    └─Embedding: 2-1                         102,403,200
|    └─ModuleList: 2-2                        --
|    |    └─LlamaDecoderLayer: 3-1            123,910,400
|    |    └─LlamaDecoderLayer: 3-2            123,910,400
|    |    └─LlamaDecoderLayer: 3-3            123,910,400
|    |    └─LlamaDecoderLayer: 3-4            123,910,400
|    |    └─LlamaDecoderLayer: 3-5            123,910,400
|    |    └─LlamaDecoderLayer: 3-6            123,910,400
|    |    └─LlamaDecoderLayer: 3-7            123,910,400
|    |    └─LlamaDecoderLayer: 3-8            123,910,400
|    |    └─LlamaDecoderLayer: 3-9            123,910,400
|    |    └─LlamaDecoderLayer: 3-10           123,910,400
|    |    └─LlamaDecoderLayer: 3-11           123,910,400
|    |    └─LlamaDecoderLayer: 3-12           123,910,400
|    |    └─LlamaDecoderLayer: 3-13           123,910,400
|    |    └─LlamaDecoderLayer: 3-14 

Layer (type:depth-idx)                        Param #
├─LlamaModel: 1-1                             --
|    └─Embedding: 2-1                         102,403,200
|    └─ModuleList: 2-2                        --
|    |    └─LlamaDecoderLayer: 3-1            123,910,400
|    |    └─LlamaDecoderLayer: 3-2            123,910,400
|    |    └─LlamaDecoderLayer: 3-3            123,910,400
|    |    └─LlamaDecoderLayer: 3-4            123,910,400
|    |    └─LlamaDecoderLayer: 3-5            123,910,400
|    |    └─LlamaDecoderLayer: 3-6            123,910,400
|    |    └─LlamaDecoderLayer: 3-7            123,910,400
|    |    └─LlamaDecoderLayer: 3-8            123,910,400
|    |    └─LlamaDecoderLayer: 3-9            123,910,400
|    |    └─LlamaDecoderLayer: 3-10           123,910,400
|    |    └─LlamaDecoderLayer: 3-11           123,910,400
|    |    └─LlamaDecoderLayer: 3-12           123,910,400
|    |    └─LlamaDecoderLayer: 3-13           123,910,400
|    |    └─LlamaDecoderLayer: 3-14 

### Guardar en archivo el modelo

In [None]:
#Solo mostraba la info limitada que se da desde colab 
import sys 
#Copy code
# Supongamos que 'model' es tu modelo PyTorch
# Especifica el nombre del archivo en el que deseas guardar la representación textual del modelo
archivo_guardado = 'modelo.txt'

# Abre el archivo en modo escritura y redirige la salida estándar hacia el archivo
with open(archivo_guardado, 'w') as archivo:
    original_stdout = sys.stdout  # Guarda la salida estándar original
    sys.stdout = archivo  # Redirige la salida estándar al archivo
    print(model)  # Imprime el modelo (la salida se redirige al archivo)
    sys.stdout = original_stdout  # Restaura la salida estándar original

# El modelo ha sido impreso en el archivo especificado

In [26]:
#pasaron mas de 1H y no finalizaba
import sys
from torchsummary import summary

# Supongamos que 'model' es tu modelo PyTorch
# Especifica el nombre del archivo en el que deseas guardar la información completa
archivo_guardado = 'summary_modelo.txt'

# Abre el archivo en modo escritura y redirige la salida estándar hacia el archivo
with open(archivo_guardado, 'w') as archivo:
    original_stdout = sys.stdout  # Guarda la salida estándar original
    sys.stdout = archivo  # Redirige la salida estándar al archivo
    summary(model, (input_size,))  # Genera y muestra la información del modelo
    sys.stdout = original_stdout  # Restaura la salida estándar original

# La información completa del modelo se guarda en el archivo especificado


## Lora

In [16]:
config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"], # https://github.com/huggingface/peft/blob/632997d1fb776c3cf05d8c2537ac9a98a7ce9435/src/peft/utils/other.py#L202
    lora_dropout=0.1,
    bias="none",
    modules_to_save=["classifier"],
)
lora_model = get_peft_model(model, config)

In [11]:
lora_model.eval()

PeftModel(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(32001, 3200)
        (layers): ModuleList(
          (0-25): 26 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear(
                in_features=3200, out_features=3200, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3200, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=3200, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear(in_features=3200, out_features=3200, bias=False)
              (v_proj): Linear(
   

In [17]:
total_layers_lora = len(list(lora_model.modules()))
print(total_layers_lora)

788
