In [1]:
!huggingface-cli login --token XYZ

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to /home/studio-lab-user/.cache/huggingface/token
Login successful


In [2]:
!python --version

Python 3.9.16


In [3]:
!pip install pandas



In [4]:
!pip3 install torch torchvision torchaudio



In [5]:
!pip install scikit-learn



In [6]:
!pip install transformers



In [7]:
!pip install torchmetrics==0.11.4



In [8]:
!pip install peft



In [1]:
import pandas as pd
import torch
from torch import nn
import numpy as np
import re
from transformers import LlamaTokenizer, LlamaForCausalLM
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, LlamaForSequenceClassification
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
)
from tqdm import tqdm
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchmetrics import Accuracy
from sklearn.model_selection import train_test_split 

In [2]:
seed = 1

## Dataset

In [36]:
filenameDataset = "IMDBDatasetSPANISH.csv"
data = pd.read_csv(filenameDataset)
data.head()

Unnamed: 0.1,Unnamed: 0,review_en,review_es,sentiment,sentimiento
0,0,One of the other reviewers has mentioned that ...,Uno de los otros críticos ha mencionado que de...,positive,positivo
1,1,A wonderful little production. The filming tec...,Una pequeña pequeña producción.La técnica de f...,positive,positivo
2,2,I thought this was a wonderful way to spend ti...,Pensé que esta era una manera maravillosa de p...,positive,positivo
3,3,Basically there's a family where a little boy ...,"Básicamente, hay una familia donde un niño peq...",negative,negativo
4,4,"Petter Mattei's ""Love in the Time of Money"" is...","El ""amor en el tiempo"" de Petter Mattei es una...",positive,positivo


In [37]:
data.isnull().sum()

Unnamed: 0     0
review_en      0
review_es      0
sentiment      0
sentimiento    0
dtype: int64

In [38]:
def to_sentiment(sentiment):    
    sentiment = str(sentiment)    
    if sentiment == 'positive':
        return 1
    elif sentiment == 'negative':
        return 0
    else:
        return 2

In [39]:
#Check spanish
def selectLanguage(language, data):
    if language == "spanish":
        data['is_positive'] = data.sentiment.apply(to_sentiment)
        del data['review_en']
        del data['sentimiento']
        del data['sentiment']
        data.columns = ['id_review', 'text', 'sentiment']
    elif language == "english":
        del data['review_es']
        del data['sentimiento']
        del data[data.columns[0]]
        data.rename(columns={'review_en': 'text'}, inplace=True)
        data['sentiment'] = data.sentiment.apply(to_sentiment)
    return data

In [40]:
def basic_cleaning(text):
    text=re.sub(r'https?://www\.\S+\.com','',text)
    text=re.sub(r'[^A-Za-z|\s]','',text)
    text=re.sub(r'\*+','swear',text)
    return text

def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F" #emoticons
                               u"\U0001F300-\U0001F5FF" #symbols & pictographs
                               u"\U0001F680-\U0001F6FF" #transport & map symbols
                               u"\U0001F1E0-\U0001F1FF" #flags (iOS)
                               u"\U00002702-\U000027B0" 
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def remove_multiplechars(text):
    text = re.sub(r'(.)\1{3,}',r'\1', text)
    return text

def clean(df):
    for col in ['text']:#,'selected_text']:
        df[col]=df[col].astype(str).apply(lambda x:basic_cleaning(x))
        df[col]=df[col].astype(str).apply(lambda x:remove_emoji(x))
        df[col]=df[col].astype(str).apply(lambda x:remove_html(x))
        df[col]=df[col].astype(str).apply(lambda x:remove_multiplechars(x))
    return df

In [41]:
idioma = "english"
data = selectLanguage(idioma, data)

In [42]:
data.head()

Unnamed: 0,text,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. The filming tec...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [None]:
print(data.sentiment.value_counts())
print(len(data.sentiment))

In [43]:
commentDFTrain, commentDFVal = train_test_split(data, test_size=0.2, random_state=seed) 

In [44]:
commentDFTrain

Unnamed: 0,text,sentiment
18165,This film has been compared to the hilarious B...,0
36059,Reasonably effective horror/science-fiction a ...,1
13242,"The inspiration for the ""Naked Gun"" movies cas...",1
32985,When this film was originally released it was ...,1
41133,I happened upon this by chance. I was at my fr...,1
...,...,...
43723,The larger-than-life figures of Wyatt Earp and...,1
32511,"Okay, you have:Penelope Keith as Miss Herringb...",0
5192,"An odd, willfully skewed biopic of Dyan Thomas...",0
12172,"Basic structure of a story: Beginning, Middle,...",0


In [45]:
commentDFVal

Unnamed: 0,text,sentiment
26247,With No Dead Heroes you get stupid lines like ...,0
35067,I thought maybe... maybe this could be good. A...,0
34590,An elite American military team which of cours...,0
16668,Ridiculous horror film about a wealthy man (Jo...,0
12196,"Well, if you are one of those Katana's film-nu...",1
...,...,...
49858,I liked this movie. That's pretty much all I c...,1
19849,"Imagine the plight of Richard, a painter, whos...",1
46899,There is an inherent problem with commenting o...,0
28256,When The Matrix appeared in 1999 and questione...,1


In [48]:
commentDFTrain_clean = clean(commentDFTrain)
commentDFVal_clean = clean(commentDFVal)

In [49]:
commentDFTrain_clean

Unnamed: 0,text,sentiment
18165,This film has been compared to the hilarious B...,0
36059,Reasonably effective horrorsciencefiction a la...,1
13242,The inspiration for the Naked Gun movies casts...,1
32985,When this film was originally released it was ...,1
41133,I happened upon this by chance I was at my fri...,1
...,...,...
43723,The largerthanlife figures of Wyatt Earp and B...,1
32511,Okay you havePenelope Keith as Miss Herringbon...,0
5192,An odd willfully skewed biopic of Dyan Thomas ...,0
12172,Basic structure of a story Beginning Middle En...,0


In [50]:
commentDFVal_clean

Unnamed: 0,text,sentiment
26247,With No Dead Heroes you get stupid lines like ...,0
35067,I thought maybe maybe this could be good An ea...,0
34590,An elite American military team which of cours...,0
16668,Ridiculous horror film about a wealthy man Joh...,0
12196,Well if you are one of those Katanas filmnuts ...,1
...,...,...
49858,I liked this movie Thats pretty much all I can...,1
19849,Imagine the plight of Richard a painter whose ...,1
46899,There is an inherent problem with commenting o...,0
28256,When The Matrix appeared in and questioned ex...,1


In [51]:
def selectTexts(dataSet, words):
    list_text= []
    list_label= []
    for (text, target) in zip(dataSet.text, dataSet.sentiment):
        if (len(text.split())) <= words:
            list_text.append(text)
            list_label.append(target)
    DFTrain_clean = list(zip(list_text, list_label))
    DFTrain_clean = pd.DataFrame(DFTrain_clean, columns=['text','sentiment'])
    return DFTrain_clean 

In [52]:
commentDFTrain_clean = selectTexts(commentDFTrain_clean, 80)

In [53]:
commentDFTrain_clean

Unnamed: 0,text,sentiment
0,Reasonably effective horrorsciencefiction a la...,1
1,This movie got off to an interesting start Dow...,0
2,The whole movie seemed to suffer from poor edi...,0
3,This film is utterly amazing From the performa...,1
4,In my opinion this film has wonderful lighting...,1
...,...,...
3461,Many funny scenes about the people that you do...,1
3462,This is the movie for those who believe cinema...,1
3463,A clever and bizarre angle to Beauty is in the...,1
3464,This is an excellent filmTom Hanks and Paul Ne...,1


In [54]:
commentDFVal_clean = selectTexts(commentDFVal_clean, 80)

In [55]:
commentDFVal_clean

Unnamed: 0,text,sentiment
0,You want to know what the writers of this movi...,0
1,This was a very well scripted movie Great fun ...,1
2,Ive had to change my view on the worst film in...,0
3,The whole world is falling prey to a lethal di...,1
4,Prince of Central Park is so utterly bad It w...,0
...,...,...
881,Why did this movie fail commercially Its got a...,1
882,This is the second film Ive seen of Ida Lupino...,1
883,I loved this movie I saw it when I was about ...,1
884,We watched this movie in my chemistry class so...,1


In [56]:
class CustomDataset(Dataset):
    def __init__(self, commentDF):
        self.commentDF = commentDF

    def __len__(self):
        return len(self.commentDF.index)

    def __getitem__(self, idx):
        return np.array([idx])

In [57]:
training_data = CustomDataset(commentDFTrain_clean)
validation_data = CustomDataset(commentDFVal_clean)

train_dataloader = DataLoader(training_data, batch_size=4, shuffle=True)
val_dataloader = DataLoader(validation_data, batch_size=4, shuffle=False)


## Modelo

In [24]:
model_path = 'openlm-research/open_llama_3b' 
num_labels = 2 
model = LlamaForSequenceClassification.from_pretrained(model_path, num_labels=num_labels, torch_dtype=torch.float16, device_map='auto')
model = model.to('cuda') #
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True,padding_side="left",pad_token="<|endoftext|>")
model.resize_token_embeddings(len(tokenizer))

Some weights of the model checkpoint at openlm-research/open_llama_3b were not used when initializing LlamaForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing LlamaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LlamaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at openlm-research/open_llama_3b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associ

Embedding(32001, 3200)

In [25]:
config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    modules_to_save=["classifier"],
)
lora_model = get_peft_model(model, config)

In [58]:
def training_validation(epochs, model, train_dataloader, val_dataloader, commentDFTrain_clean, commentDFVal_clean):
    #epochs = 4
    # Let's just train on a small batch of the dataset and see the validation score
    earlyStopAtTrainBatchIdx = 100 # Set these variables to None to train and validate on all the dataset
    earlyStopAtValBatchIdx = 20 # Set these variables to None to train and validate on all the dataset
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
    loss_fn = nn.CrossEntropyLoss()
    counter = 1
    for epoch in range(epochs):
        print("epochs: ", counter)
        for batchIdx, sampledIdx in enumerate(tqdm(train_dataloader, position=0, leave=True)):
            sampledIdx = sampledIdx.cpu().data.numpy()
            model.train()
            optimizer.zero_grad()
            
            sampledRowText = list(commentDFTrain_clean["text"].iloc[list(sampledIdx.flatten())])
            sampledRowLabels = torch.tensor(list(commentDFTrain_clean["sentiment"].iloc[list(sampledIdx.flatten())])).to("cuda")
            encoded_input = tokenizer(sampledRowText, truncation=True, padding=True, return_tensors='pt').to("cuda") # Output shape: [bs, num_Labels]
            encoded_inputIds = encoded_input["input_ids"].to("cuda")
            encoded_attnMask = encoded_input["attention_mask"].to("cuda")
            outputs = model(input_ids=encoded_inputIds, attention_mask=encoded_attnMask)
            logits = outputs.logits
            loss = loss_fn(logits.squeeze(), sampledRowLabels)
            loss.backward()
            optimizer.step()
            
            if earlyStopAtTrainBatchIdx!=None:
                if batchIdx==earlyStopAtTrainBatchIdx: break
        counter += 1 
        # Validation
        predLs = []
        labelLs = []
        textLs = [] 
        for batchIdx, sampledIdx in enumerate(val_dataloader):
            model.eval()
            
            sampledRowText = list(commentDFVal_clean["text"].iloc[list(sampledIdx.flatten())])
            sampledRowLabels = torch.tensor(list(commentDFVal_clean["sentiment"].iloc[list(sampledIdx.flatten())]))
            encoded_input = tokenizer(sampledRowText, truncation=True, padding=True, return_tensors='pt').to("cuda") # Output shape: [bs, num_Labels]
            encoded_inputIds = encoded_input["input_ids"].to("cuda")
            encoded_attnMask = encoded_input["attention_mask"].to("cuda")
            outputs = model(input_ids=encoded_inputIds, attention_mask=encoded_attnMask)
            logits = outputs.logits
            predLs.append(torch.argmax(logits, dim=1).flatten().cpu().data.numpy())
            labelLs.append(sampledRowLabels.cpu().data.numpy())
            
            textLs.append(sampledRowText)
            if earlyStopAtValBatchIdx!=None:
                if batchIdx==earlyStopAtValBatchIdx: break
        predLs = torch.tensor(predLs).flatten()
        labelLs = torch.tensor(labelLs).flatten()
        accuracy = Accuracy(task="multiclass", num_classes=2)
        valAcc = float(accuracy(predLs, labelLs))
        print("Review sentiment validation accuracy: ", valAcc)
    return predLs, labelLs, valAcc 

### Resultados

#### 10 epocas

In [28]:
epochs = 10 
prediction, real, acc = training_validation(epochs, model, train_dataloader, val_dataloader, commentDFTrain_clean, commentDFVal_clean)

epochs:  1


 12%|█▏        | 100/867 [00:30<03:57,  3.23it/s]
  predLs = torch.tensor(predLs).flatten()


Review sentiment validation accuracy:  0.4285714328289032
epochs:  2


 12%|█▏        | 100/867 [00:28<03:36,  3.54it/s]


Review sentiment validation accuracy:  0.511904776096344
epochs:  3


 12%|█▏        | 100/867 [00:28<03:39,  3.50it/s]


Review sentiment validation accuracy:  0.7023809552192688
epochs:  4


 12%|█▏        | 100/867 [00:28<03:40,  3.47it/s]


Review sentiment validation accuracy:  0.8333333134651184
epochs:  5


 12%|█▏        | 100/867 [00:28<03:41,  3.46it/s]


Review sentiment validation accuracy:  0.9047619104385376
epochs:  6


 12%|█▏        | 100/867 [00:28<03:42,  3.45it/s]


Review sentiment validation accuracy:  0.9047619104385376
epochs:  7


 12%|█▏        | 100/867 [00:29<03:43,  3.43it/s]


Review sentiment validation accuracy:  0.9523809552192688
epochs:  8


 12%|█▏        | 100/867 [00:28<03:38,  3.52it/s]


Review sentiment validation accuracy:  0.9404761791229248
epochs:  9


 12%|█▏        | 100/867 [00:29<03:44,  3.42it/s]


Review sentiment validation accuracy:  0.9523809552192688
epochs:  10


 12%|█▏        | 100/867 [00:28<03:40,  3.48it/s]


Review sentiment validation accuracy:  0.9523809552192688


In [29]:
print("prediction", prediction)

prediction tensor([0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1,
        1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1,
        0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1])


In [30]:
print("real", real)

real tensor([0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1,
        1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1,
        0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0,
        1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1])


In [31]:
print("acc", acc)

acc 0.9523809552192688
