### llama3_plus_tfidf features
explore how the tfidf features can help with the classification problem


### generate tfidf features

In [1]:
# import related packages
import os
import random
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
from typing import Any, Dict, List, Optional
from sklearn.feature_extraction.text import TfidfVectorizer 

In [2]:
DATA_PATH = '../lmsys-chatbot-arena/'
TARGETS = ["winner_model_a", "winner_model_b", "winner_tie"]

In [3]:
# load data
train = pd.read_csv(DATA_PATH+'train.csv')

In [4]:
def _concat_turns(
    strs: List[Optional[str]],
    sep: str = ' ',
) -> str:
    """
    Concat Multiple turns of prompts/responses into a single string
    Args:
        strs: A list of strings.
        sep: the seperate character or word.

    Returns:
    The concatenated string
    """
    # get rid of None str
    stripped_strs=strs.strip('[]')
    sentence = [s.strip('"') for s in stripped_strs.split('","')]    
    cat_str = sep.join(sentence)
    return cat_str

def _cal_tf(word: str, doc: str) -> float:
    """ Calculate and return the term frequency."""
    tokens = doc.split()
    # term count
    tc = sum(1 for token in tokens if token == word)
    tf = tc/len(tokens)

    return tf

def _cal_idf(word: str, corpus: List[str]) -> float:
    """ Calculate and return the inverse document frequency."""
    n_w = 0
    for doc in corpus:
        if word in doc.split():
            n_w += 1

    idf = np.log(len(corpus)/(n_w+1))

    return idf

In [5]:
train["cat_prompt"] = train["prompt"].apply(lambda x: _concat_turns(x))
train["cat_res_a"] = train["response_a"].apply(lambda x: _concat_turns(x))
train["cat_res_b"] = train["response_b"].apply(lambda x: _concat_turns(x))
train.head(1)

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie,cat_prompt,cat_res_a,cat_res_b
0,30192,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0,Is it morally right to try to have a certain p...,The question of whether it is morally right to...,"As an AI, I don't have personal beliefs or opi..."


In [6]:
demo_corpus = train["cat_prompt"].iloc[:20].tolist()
for demo_word, doc_id in zip(["to", "I", "possibility"], [0, 3, 19]):
    print(f">>>Word \"{demo_word}\" <<<")
    print(f"## Doc {doc_id} ##")

    tf = _cal_tf(demo_word, demo_corpus[doc_id])
    idf = _cal_idf(demo_word, demo_corpus)
    tfidf = tf*idf

    print(f"TF {tf:.4f} | IDF {idf:.4f} | TF-IDF {tfidf:.4f}")
    print(f"-"*50)

>>>Word "to" <<<
## Doc 0 ##
TF 0.0690 | IDF 0.4308 | TF-IDF 0.0297
--------------------------------------------------
>>>Word "I" <<<
## Doc 3 ##
TF 0.1111 | IDF 1.6094 | TF-IDF 0.1788
--------------------------------------------------
>>>Word "possibility" <<<
## Doc 19 ##
TF 0.0175 | IDF 2.3026 | TF-IDF 0.0404
--------------------------------------------------


In [7]:
vectorizer_cfg = {
    "input": "content",
    "lowercase": True,
    "analyzer": "word",
    "ngram_range": (1,3),
    "max_df": 0.95,
    "min_df": 10,
    "max_features": 300,
    "smooth_idf": True,
    "sublinear_tf": True,
}

In [8]:
cols =  ["cat_prompt", "cat_res_a", "cat_res_b"]
vectorizers = {col:TfidfVectorizer(**vectorizer_cfg) for col in cols}

#generate tf-idf features
# X_tfidf = []
# for col, vectorizer in vectorizers.item():

In [9]:
X_tfidf = []
for col, vectorizer in vectorizers.items():
    x=vectorizer.fit_transform(train[col])
    X_tfidf.append(x.toarray())
X_tfidf = np.hstack(X_tfidf)

In [10]:
print(f"=== Inverse Documnet Frequency Vector ===")
for col, vec in vectorizers.items():
    print(f">>> {col} <<<")
    tmp = pd.DataFrame(vec.idf_, index=vec.get_feature_names_out()).sort_values(0, ascending=False)
    display(tmp.T)
    # break

=== Inverse Documnet Frequency Vector ===
>>> cat_prompt <<<


Unnamed: 0,t0,u2022,comment,int,id,enable,image,self,the user,class,...,that,you,for,what,and,is,in,of,to,the
0,9.251107,6.902912,6.331536,6.169197,6.145027,5.952804,5.688169,5.631221,5.625878,5.47758,...,2.60195,2.548574,2.407855,2.312888,2.01309,1.999952,1.985212,1.963364,1.915473,1.614227


>>> cat_res_a <<<


Unnamed: 0,u0438,u0442,u043e,u0430,u0435,u2022,de,string,company,energy,...,with,it,for,that,is,in,of,and,to,the
0,6.935277,6.915732,6.896563,6.890253,6.859291,5.565567,5.317975,4.75278,4.551233,4.402207,...,1.607401,1.604117,1.502849,1.480321,1.363749,1.334737,1.258118,1.224785,1.212251,1.152728


>>> cat_res_b <<<


Unnamed: 0,u0438,u0442,u043d,u0435,u0430,u043e,u2022,de,string,company,...,with,it,for,that,is,in,of,and,to,the
0,6.915732,6.877753,6.859291,6.841164,6.841164,6.823359,5.555583,5.328474,4.762471,4.526082,...,1.604658,1.595717,1.505787,1.479,1.361398,1.3352,1.254409,1.225744,1.208837,1.154899


In [11]:
prompt_feats = vectorizers["cat_prompt"].get_feature_names_out()
res_a_feats = vectorizers["cat_res_a"].get_feature_names_out()
res_b_feats = vectorizers["cat_res_b"].get_feature_names_out()

In [12]:
# prepare data
tfidf_feats = (
    [f"{feat}(prompt)" for feat in prompt_feats]
    + [f"{feat}(res_a)" for feat in res_a_feats]
    + [f"{feat}(res_b)" for feat in res_b_feats]
)
X = pd.DataFrame(X_tfidf, columns = tfidf_feats)

In [13]:
X.head(2)

Unnamed: 0,10(prompt),100(prompt),20(prompt),about(prompt),about the(prompt),act(prompt),action(prompt),add(prompt),after(prompt),ai(prompt),...,world(res_b),would(res_b),would be(res_b),years(res_b),you(res_b),you are(res_b),you can(res_b),you have(res_b),you re(res_b),your(res_b)
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.107871,0.0,0.0,0.0,0.061752,0.0,0.0,0.0,0.0,0.079803
1,0.201281,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.118866,0.0,0.068789,0.074145,0.0,0.131302


In [14]:
# Example. Tested that idf is calculated as np.log((nt+1)/(nd+1))+1
# assign documents
d0 = 'Geeks for geeks'
d1 = 'Geeks'
d2 = 'r2j Geeks'
d3 = 'aaa '

# merge documents into a single corpus
string = [d0, d1, d2, d3]

# create object
tfidf = TfidfVectorizer()

# get tf-df values
result = tfidf.fit_transform(string)

# get idf values
print('\nidf values:')
for ele1, ele2 in zip(tfidf.get_feature_names_out(), tfidf.idf_):
	print(ele1, ':', ele2)




idf values:
aaa : 1.916290731874155
for : 1.916290731874155
geeks : 1.2231435513142097
r2j : 1.916290731874155


# Create inference model based on llama3
In the training network, we hope to combine the llama3 with the above tfidf features

In [15]:
import torch
import transformers
from sklearn.metrics import accuracy_score

# finetuning related modules
from transformers import AutoTokenizer, LlamaModel, LlamaForSequenceClassification
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
# end

import torch.nn.functional as F
import torch.nn as nn

In [16]:
class CFG:
    NUM_EPOCHS = 1
    BATCH_SIZE = 2
    DROPOUT = 0.05
    MODEL_NAME = "../llama3/Meta-Llama-3-8B/"
    SEED = 2024
    MAX_LENGTH = 128 # truncate the input to save memory, toy implementation only
    NUM_WARMUP_STEPS = 4 # toy implementation. 
    LR_MAX = 5E-5
    NUM_CLASS_LLAMA = 128 # with the llama model, we hope it can generate 128 features, which are combined with tfidf features.
    NUM_LABELS = 3 # The final number of labels
    LORA_RANK = 1 # Toy implementation 
    LORA_ALPHA = 2 # toy implementation
    LORA_MODULES = ['o_proj', 'v_proj']

if torch.cuda.is_available():
    DEVICE = 'cuda'
    print('GPU is used')
else:
    DEVICE = 'cpu'
    print('CPU is used')

GPU is used


In [17]:
# Clear GPU memory
torch.cuda.empty_cache()

In [18]:
# set seed to ensure reproducibility
def set_seeds(seed):
    os.environ['PYTHONHASHSEED']=str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seeds(seed=CFG.SEED)

### Tokenizer

In [19]:
tokenizer = AutoTokenizer.from_pretrained(CFG.MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_size = 'right'
tokenizer.add_eos_token = True
tokenizer.tokenize("shuo bu shuo!")

tokenizer.save_pretrained('tokenizer')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


('tokenizer\\tokenizer_config.json',
 'tokenizer\\special_tokens_map.json',
 'tokenizer\\tokenizer.json')

In [20]:
# utility function giving token length
# only takes data frame input?
def get_token_lengths(texts):
    # tokenize and receive inputs_ids for each text
    inputs_ids = tokenizer(texts.tolist(),return_tensors='np')['input_ids']
    # input_ids = tokenizer(texts.tolist(), return_tensors='np')['input_ids']
    # return length of inputs_ids for each text
    return [len(t) for t in inputs_ids]

### Create training data

In [21]:
# Drop 'Null' for training
condition = (train.cat_res_a=='null') & (train.cat_res_b=='null')
indexes = train[condition].index
# print(indexes)
train.drop(indexes, inplace=True)
train.reset_index(inplace=True, drop=True)

print(f"Total {len(indexes)} Null reponse rows dropped")
print('Total train samples: ', len(train))

Total 19 Null reponse rows dropped
Total train samples:  57458


In [22]:
# Show the conversation
train['text']='User prompt: '+train['cat_prompt']+ '\n\nModel A:\n'+train['cat_res_a']+'\n\n--------\n\nModel B:\n'+train['cat_res_b']
print(train['text'][4])

User prompt: What is the best way to travel from Tel-Aviv to Jerusalem? Car? Bus? Plane?

Model A:
The best way to travel from Tel Aviv to Jerusalem depends on your personal preference and the availability of transportation options. All the options you have mentioned are valid options, but here are some details to help you make your decision:\n\n*   By car: Traveling by car is the quickest way to get from Tel Aviv to Jerusalem, as the distance between the two cities is only about 60 kilometers (37 miles). It takes around 45 minutes to drive from Tel Aviv to Jerusalem by car, depending on the traffic.\n*   By bus: There are several bus lines that run from Tel Aviv to Jerusalem, and the journey takes around 1 hour and 30 minutes by bus. The buses are comfortable and reliable, and they offer a scenic view of the beautiful Israeli countryside.\n*   By plane: There are no direct flights from Tel Aviv to Jerusalem, so you need to take a flight from Tel Aviv's Ben Gurion International Airport

In [23]:
# Train only 10% train dataset for toy implementation
train = train[:int(len(train)/10)]
texts = train['text']
train.loc[:,'token_count'] = get_token_lengths(texts)

# prepare label for model
train.loc[:, 'label']=np.argmax(train[['winner_model_a','winner_model_b','winner_tie']].values, axis=1)
display(train.head())

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie,cat_prompt,cat_res_a,cat_res_b,text,token_count,label
0,30192,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0,Is it morally right to try to have a certain p...,The question of whether it is morally right to...,"As an AI, I don't have personal beliefs or opi...",User prompt: Is it morally right to try to hav...,1206,0
1,53567,koala-13b,gpt-4-0613,"[""What is the difference between marriage lice...","[""A marriage license is a legal document that ...","[""A marriage license and a marriage certificat...",0,1,0,What is the difference between marriage licens...,A marriage license is a legal document that al...,A marriage license and a marriage certificate ...,User prompt: What is the difference between ma...,1393,1
2,65089,gpt-3.5-turbo-0613,mistral-medium,"[""explain function calling. how would you call...","[""Function calling is the process of invoking ...","[""Function calling is the process of invoking ...",0,0,1,explain function calling. how would you call a...,Function calling is the process of invoking or...,Function calling is the process of invoking a ...,User prompt: explain function calling. how wou...,664,2
3,96401,llama-2-13b-chat,mistral-7b-instruct,"[""How can I create a test set for a very rare ...","[""Creating a test set for a very rare category...","[""When building a classifier for a very rare c...",1,0,0,How can I create a test set for a very rare ca...,Creating a test set for a very rare category c...,When building a classifier for a very rare cat...,User prompt: How can I create a test set for a...,1008,0
4,198779,koala-13b,gpt-3.5-turbo-0314,"[""What is the best way to travel from Tel-Aviv...","[""The best way to travel from Tel Aviv to Jeru...","[""The best way to travel from Tel-Aviv to Jeru...",0,1,0,What is the best way to travel from Tel-Aviv t...,The best way to travel from Tel Aviv to Jerusa...,The best way to travel from Tel-Aviv to Jerusa...,User prompt: What is the best way to travel fr...,479,1


In [24]:
train.label.value_counts()

label
0    2018
1    1968
2    1759
Name: count, dtype: int64

In [25]:
# token Count
display(train['token_count'].describe().to_frame())

Unnamed: 0,token_count
count,5745.0
mean,730.840557
std,821.874756
min,18.0
25%,282.0
50%,553.0
75%,892.0
max,15428.0


In [26]:
# get length of tokens which covers 90% of data, we'll still take 128 length in this example for toy implementation
np.percentile(train['token_count'],90)

1392.0

#### Tokenize train data

In [27]:
print(CFG.MAX_LENGTH)
a = tokenizer.tokenize("I have 0.63619374 apple",padding='max_length',max_length=10,truncation=True)
a

128


['I', 'Ġhave', 'Ġ', '0', '.', '636', '193', '74', 'Ġapple', '<|end_of_text|>']

In [28]:
# tokenize data
tokens = tokenizer(
    train['text'].tolist(),
    padding='max_length',
    max_length=CFG.MAX_LENGTH,
    truncation=True,
    return_tensors='np'
)

# Input IDs are teh token IDs
INPUT_IDS = tokens['input_ids']
# Attention Masks to Ignore Padding Tokens
ATTENTION_MASKS = tokens['attention_mask']
# Label of Texts
LABELS = train[['winner_model_a','winner_model_b','winner_tie']].values

print(f'INPUT_IDS shape: {INPUT_IDS.shape}, ATTENTION_MASKS shape: {ATTENTION_MASKS.shape}')
print(f'LABELS shape: {LABELS.shape}')

INPUT_IDS shape: (5745, 128), ATTENTION_MASKS shape: (5745, 128)
LABELS shape: (5745, 3)


In [29]:
type(X_tfidf)

numpy.ndarray

In [30]:
# train_dataset function, create batch and return input_ids, attention_mask, X_feat(features), and labels
def train_dataset(batch_size):
    N_SAMPLES = LABELS.shape[0]
    IDXS = np.arange(N_SAMPLES - (N_SAMPLES % batch_size))
    while True:
        # Shuffle Indices
        np.random.shuffle(IDXS)
        # Iterate Over All Indices Once
        for idxs in IDXS.reshape(-1, batch_size):
            input_ids = torch.tensor(INPUT_IDS[idxs]).to(DEVICE)
            attention_mask = torch.tensor(ATTENTION_MASKS[idxs]).to(DEVICE)
            labels = torch.tensor(LABELS[idxs]).to(DEVICE) # Multi-label
            X_feat = torch.tensor(X_tfidf[idxs]).to(DEVICE)
            # yield returns a returns a generator object to 
            # the one who calls the function which contains 
            # yield, instead of simply returning a value
            yield input_ids, attention_mask, X_feat, labels
            
TRAIN_DATASET = train_dataset(CFG.BATCH_SIZE)

In [31]:
## test the above function
test_a, test_b, test_c, test_d = next(TRAIN_DATASET)
print(test_a.shape, test_b.shape, test_c.shape, test_d.shape)

torch.Size([2, 128]) torch.Size([2, 128]) torch.Size([2, 900]) torch.Size([2, 3])


### Load Model

In [32]:
# Load model for classification with 3 target label
base_model = LlamaForSequenceClassification.from_pretrained(
    CFG.MODEL_NAME,
    num_labels=CFG.NUM_CLASS_LLAMA,
    torch_dtype = torch.bfloat16)

base_model.config.pretraining_tp = 1

# Assign Padding TOKEN
base_model.config.pad_token_id = tokenizer.pad_token_id

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at ../llama3/Meta-Llama-3-8B/ and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Low-Rank Adaption [LORA]

In [33]:
CFG.LORA_RANK

1

In [34]:
lora_config = LoraConfig(
    r=CFG.LORA_RANK, # the dimension of the low-rank matrices
    lora_alpha = CFG.LORA_ALPHA, # scaling factor for LoRA activations vs pre-trained weight activations
    lora_dropout = CFG.DROPOUT,
    bias='none',
    inference_mode=False,
    task_type = TaskType.SEQ_CLS, # refer to https://github.com/huggingface/peft/blob/v0.8.2/src/peft/utils/peft_types.py#L68-L73 for the TaskType Class
    target_modules = CFG.LORA_MODULES # Only use Output and Values Projection
)

In [35]:
# Exciting!!!!!!!!!!!!!!!!!!!!!!!!
# Create LoRa Model
model = get_peft_model(base_model, lora_config)
# Trainable Parameters
model.print_trainable_parameters()

In [36]:
# torch.cuda.empty_cache()
import gc
gc.collect()

In [37]:
# verify teh trainable layers
MODEL_LAYERS_ROWS = []
TRAINABLE_PARAMS = []
N_TRAINABLE_PARAMS = 0

for name, param in model.named_parameters():
    # Layer Parameter Count
    n_parameters = int(torch.prod(torch.tensor(param.shape)))
    # Only Trainable Layers
    if param.requires_grad:
        # Add Layer Information
        MODEL_LAYERS_ROWS.append({
            'param': n_parameters,
            'name': name,
            'dtype': param.data.dtype,
        })
        # Append Trainable Parameter
        TRAINABLE_PARAMS.append({'params': param})
        # Add Number of Trainable Parameters
        N_TRAINABLE_PARAMS += n_parameters

display(pd.DataFrame(MODEL_LAYERS_ROWS))

print(f"""
===============================
N_TRAINABLE_PARAMS: {N_TRAINABLE_PARAMS:,}
N_TRAINABLE_LAYERS: {len(TRAINABLE_PARAMS)}
===============================
""")

### Create tf-idf feature extraction model


In [38]:
# class model_TFIDF(nn.Module):
#     def __init__(self):
#         super(model_TFIDF,self).__init__()
#         self.conv1 = nn.Sequential(
#             nn.Conv1d(1, 8, kernel_size=3, stride = 1, bias = True),
#             nn.ReLU()
#         )
#         self.conv2 = nn.Sequential(
#             nn.Conv1d(8, 16, kernel_size = 3, stride = 1, bias = True),
#             nn.ReLU()
#         )
#         self.linear = nn.Sequential(
#             nn.Linear(16 * 896, CFG.NUM_CLASS_LLAMA, bias = True),
#             nn.ReLU()
#         )

#     def forward(self,in_feat):
#         x = self.conv1(in_feat)
#         x = self.conv2(x)
#         x = x.view(-1,16*896)
#         x = self.linear(x)
#         return x

# model_tfidf = model_TFIDF()

In [39]:
# # test model
# model_tfidf = model_TFIDF()
# output = model_tfidf(test_c.view(2,1,900).to(torch.float32).cpu())

In [116]:
# class compound_model(nn.Module):
#     def __init__(self):
#         super(compound_model,self).__init__()
#         self.model_tfidf = model_TFIDF()#.to(DEVICE)
#         self.model_lora = get_peft_model(base_model, lora_config)
#         self.model_lora#.to(DEVICE)
#         self.rt = nn.Sequential(
#             nn.Linear(CFG.NUM_CLASS_LLAMA * 2, 128),
#             nn.ReLU()
#         )#.to(DEVICE)
#         self.decide = nn.Linear(128, 3).to(DEVICE)
#         self.softmax = nn.Softmax(dim = 1)#.to(DEVICE)

#     def forward(self, _input_token, _input_mask, _input_feat):
#         output_llama = self.model_lora(_input_token, _input_mask)
#         output_llama_logits = output_llama.logits
#         output_tfidf = self.model_tfidf(_input_feat)
#         # print(output_llama_logits.shape)
#         # print(output_tfidf.shape)
#         output_compound = torch.cat((output_llama_logits, output_tfidf),dim=-1)
#         # print(output_compound.shape)
#         output = self.rt(output_compound)
#         output = self.decide(output)
#         print(output.shape)
#         output = self.softmax(output)
#         return output



In [117]:
# model_compound = compound_model()

In [118]:
# torch.cuda.empty_cache()

In [119]:
# out = model_compound(test_a, test_b, test_c.view(2,1,900).to(torch.float32))

In [120]:
# out

## Training

In [121]:
# learning rate and optimizer
N_SAMPLES = len(train)
STEPS_PER_EPOCH = N_SAMPLES // CFG.BATCH_SIZE

OPTIMIZER=torch.optim.AdamW(model_compound.parameters(), lr = CFG.LR_MAX)

# Cosine Learning Rate with Warmup
lr_scheduler = transformers.get_cosine_schedule_with_warmup(
    optimizer=OPTIMIZER,
    num_warmup_steps=CFG.NUM_WARMUP_STEPS,
    num_training_steps=STEPS_PER_EPOCH * CFG.NUM_EPOCHS
)

print(f'BATCH_SIZE: {CFG.BATCH_SIZE}, N_SAMPLES: {N_SAMPLES}, STEPS_PER_EPOCH: {STEPS_PER_EPOCH}')

BATCH_SIZE: 2, N_SAMPLES: 5745, STEPS_PER_EPOCH: 2872


In [122]:
# Set the data type for the optimizer's state(e.g., momentum buffers)
for state in OPTIMIZER.state.values():
    for k,v in state.items():
        if isinstance(v, torch.Tensor) and state[k].dtype is not torch.float32:
            state[v] = v.to(dtype=torch.float32)
            

In [123]:
input_ids, attention_mask, input_feat, labels = next(TRAIN_DATASET)
print(f'input_ids shape: {input_ids.shape}, dtype: {input_ids.dtype}')
print(f'attention_mask shape: {attention_mask.shape}, dtype: {attention_mask.dtype}')
print(f'input_feat shape: {input_feat.shape}, dtype: {input_feat.dtype}')
print(f'labels shape: {labels.shape}, dtype: {labels.dtype}')

input_ids shape: torch.Size([2, 128]), dtype: torch.int32
attention_mask shape: torch.Size([2, 128]), dtype: torch.int32
input_feat shape: torch.Size([2, 900]), dtype: torch.float64
labels shape: torch.Size([2, 3]), dtype: torch.int64


In [124]:
model_compound.to(DEVICE)

compound_model(
  (model_tfidf): model_TFIDF(
    (conv1): Sequential(
      (0): Conv1d(1, 8, kernel_size=(3,), stride=(1,))
      (1): ReLU()
    )
    (conv2): Sequential(
      (0): Conv1d(8, 16, kernel_size=(3,), stride=(1,))
      (1): ReLU()
    )
    (linear): Sequential(
      (0): Linear(in_features=14336, out_features=128, bias=True)
      (1): ReLU()
    )
  )
  (model_lora): PeftModelForSequenceClassification(
    (base_model): LoraModel(
      (model): LlamaForSequenceClassification(
        (model): LlamaModel(
          (embed_tokens): Embedding(128256, 4096)
          (layers): ModuleList(
            (0-31): 32 x LlamaDecoderLayer(
              (self_attn): LlamaSdpaAttention(
                (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
                (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
                (v_proj): lora.Linear(
                  (base_layer): Linear(in_features=4096, out_features=1024, bias=False)
    

In [131]:
%%time
# Dummy Prediction
print(input_ids.get_device(), attention_mask.get_device())
with torch.no_grad():
    outputs = model_compound(input_ids, attention_mask, input_feat.view(2,1,900).to(torch.float32))

print(f'logits: {outputs}, dtype: {outputs.dtype}')

0 0
torch.Size([2, 3])
logits: tensor([[0.2477, 0.3379, 0.4144],
        [0.4880, 0.3668, 0.1452]], device='cuda:0'), dtype: torch.float32
CPU times: total: 2.77 s
Wall time: 6.54 s


In [126]:
outputs

tensor([[0.3658, 0.1850, 0.4492],
        [0.4806, 0.1494, 0.3700]], device='cuda:0')

In [127]:
# put model in train_mode
model_compound.train()

# loss function, cross entropy
LOSS_FN = torch.nn.CrossEntropyLoss().to(dtype=torch.float32)

In [128]:
from time import time
import warnings
from tqdm.auto import tqdm

In [130]:
st = time()
warnings.filterwarnings("error")
METRICS={
    'loss':[],
    'accuracy': {'y_true':[], 'y_pred':[]}
}

for epoch in tqdm(range(CFG.NUM_EPOCHS)):
    ste = time()
    for step in range(STEPS_PER_EPOCH):
        # Zero out gradients
        OPTIMIZER.zero_grad()

        # get batch
        input_ids, attention_mask, input_feat, labels = next(TRAIN_DATASET)

        # forward pass
        outputs = model_compound(input_ids, attention_mask, input_feat.view(CFG.BATCH_SIZE,1,900).to(torch.float32))

        # logits float32
        logits = outputs.to(dtype=torch.float32)

        # backward pass
        loss = LOSS_FN(logits, labels.to(dtype=torch.float32))
        loss.backward()

        # optimizer step
        OPTIMIZER.step()

        # update learning rate scheduler
        lr_scheduler.step()

        METRICS['loss'].append(float(loss))
        METRICS['accuracy']['y_true'] += labels.squeeze().tolist()
        METRICS['accuracy']['y_pred'] += torch.argmax(F.softmax(logits,dim=-1), dim=1).cpu().tolist()
        print(f'step: {step}')
        if (step+1)%20 == 0:
            metrics = 'mu_loss: {:.3f}'.format(np.mean(METRICS['loss']))
            metrics += ', step_loss: {:.3f}'.format(METRICS['loss'][-1])
            metrics += ', mu_auc: {:.3f}'.format(accuracy_score(torch.argmax(torch.tensor(METRICS['accuracy']['y_true']), axis=-1), \
                        METRICS['accuracy']['y_pred']))
            lr = OPTIMIZER.param_groups[0]['lr']
            print(f'{epoch+1:02}/{CFG.NUM_EPOCHS:02} | {step+1:04}/{STEPS_PER_EPOCH} lr: {lr:.2E}, {metrics}', end='')
            print(f'\nSteps per epoch: {step+1} complete | Time elapsed: {time() - st}')

    print(f'\nEpoch {epoch+1} Completed | Total time for epoch: {time()-ste} ')
    

  0%|          | 0/1 [00:00<?, ?it/s]

torch.Size([2, 3])
step: 0
torch.Size([2, 3])
step: 1
torch.Size([2, 3])
step: 2
torch.Size([2, 3])
step: 3
torch.Size([2, 3])
step: 4
torch.Size([2, 3])
step: 5
torch.Size([2, 3])
step: 6
torch.Size([2, 3])
step: 7
torch.Size([2, 3])
step: 8
torch.Size([2, 3])
step: 9
torch.Size([2, 3])
step: 10
torch.Size([2, 3])
step: 11
torch.Size([2, 3])
step: 12
torch.Size([2, 3])
step: 13
torch.Size([2, 3])
step: 14
torch.Size([2, 3])
step: 15
torch.Size([2, 3])
step: 16
torch.Size([2, 3])
step: 17
torch.Size([2, 3])
step: 18
torch.Size([2, 3])
step: 19
01/01 | 0020/2872 lr: 4.93E-05, mu_loss: 1.124, step_loss: 1.259, mu_auc: 0.225
Steps per epoch: 20 complete | Time elapsed: 243.59108686447144
torch.Size([2, 3])
step: 20
torch.Size([2, 3])
step: 21
torch.Size([2, 3])
step: 22
torch.Size([2, 3])
step: 23
torch.Size([2, 3])
step: 24
torch.Size([2, 3])
step: 25
torch.Size([2, 3])
step: 26
torch.Size([2, 3])
step: 27
torch.Size([2, 3])
step: 28
torch.Size([2, 3])
step: 29
torch.Size([2, 3])
step: 3

KeyboardInterrupt: 

In [94]:
input_feat.shape

torch.Size([2, 900])