In [2]:
!pip install datasets --quiet

!pip install evaluate --quiet

!pip install py7zr --quiet

!pip install accelerate -U --quiet

!pip install rouge_score --quiet

import os
os.environ['TORCH_USE_CUDA_DSA'] = '1'

In [3]:
import os
os.environ['TORCH_USE_CUDA_DSA'] = '1'
print(os.environ['TORCH_USE_CUDA_DSA'])

1


In [4]:
import torch
torch.cuda.is_available()

False

In [5]:
from datasets import load_dataset

dataset = load_dataset("stanfordnlp/sst2")



In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 872
    })
    test: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 1821
    })
})

In [7]:
dataset['train']

Dataset({
    features: ['idx', 'sentence', 'label'],
    num_rows: 67349
})

In [21]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [22]:
dataset

DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 872
    })
    test: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 1821
    })
})

In [23]:
def preprocess_function(examples):
   return tokenizer(examples["sentence"], truncation=True)

tokenized_train = dataset['train'].map(preprocess_function, batched=True)
tokenized_val = dataset['validation'].map(preprocess_function, batched=True)

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

In [24]:
len(set(tokenized_train['label']))

2

In [25]:
len(set(tokenized_val['label']))

2

In [26]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [27]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
import numpy as np
from datasets import load_metric

def compute_metrics(eval_pred):
   load_accuracy = load_metric("accuracy")
   load_f1 = load_metric("f1")

   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
   f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
   return {"accuracy": accuracy, "f1": f1}


In [29]:
from transformers import TrainerCallback

class LossCallback(TrainerCallback):
    def __init__(self):
        self.train_losses = []
        self.eval_losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            if "loss" in logs:
                self.train_losses.append(logs["loss"])
            if "eval_loss" in logs:
                self.eval_losses.append(logs["eval_loss"])


In [13]:
from transformers import TrainingArguments, Trainer

repo_name = "finetuning-sentiment-model-3000-samples"

training_args = TrainingArguments(
   output_dir=repo_name,
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=2,
   weight_decay=0.01,
   save_strategy="epoch",
)
loss_callback = LossCallback()

trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_val,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
    callbacks=[loss_callback]
)


2024-04-15 17:05:01.107678: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


NameError: name 'model' is not defined

In [19]:
trainer.train()

Step,Training Loss
500,0.1122
1000,0.1016
1500,0.107
2000,0.0843
2500,0.0987
3000,0.0988
3500,0.0986
4000,0.1113
4500,0.0776
5000,0.0493


TrainOutput(global_step=8420, training_loss=0.09321496968031495, metrics={'train_runtime': 431.7487, 'train_samples_per_second': 311.982, 'train_steps_per_second': 19.502, 'total_flos': 1227298029729216.0, 'train_loss': 0.09321496968031495, 'epoch': 2.0})

In [21]:
trainer.evaluate()

{'eval_loss': 0.37996798753738403,
 'eval_accuracy': 0.9139908256880734,
 'eval_f1': 0.9174917491749174,
 'eval_runtime': 1.5685,
 'eval_samples_per_second': 555.949,
 'eval_steps_per_second': 35.066,
 'epoch': 2.0}

In [23]:
# Evaluate the model
evaluation_results = trainer.evaluate()

# Access specific metrics
bert_eval_loss = evaluation_results["eval_loss"]
bert_accuracy = evaluation_results["eval_accuracy"]

# Print or use the metrics as needed
print(f"Evaluation Loss: {eval_loss}")
print(f"Accuracy: {accuracy}")

Evaluation Loss: 0.37996798753738403
Accuracy: 0.9139908256880734


In [24]:
from transformers import AutoModelForSequenceClassification
model_roberta = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [14]:
trainer_roberta = Trainer(
   model=model_roberta,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_val,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

NameError: name 'model_roberta' is not defined

In [26]:
trainer_roberta.train()

Step,Training Loss
500,0.6709
1000,0.5874
1500,0.545
2000,0.5097
2500,0.4623
3000,0.4306
3500,0.4148
4000,0.395
4500,0.3449
5000,0.2971


TrainOutput(global_step=8420, training_loss=0.38816607876231723, metrics={'train_runtime': 880.0944, 'train_samples_per_second': 153.049, 'train_steps_per_second': 9.567, 'total_flos': 2437721148367008.0, 'train_loss': 0.38816607876231723, 'epoch': 2.0})

In [27]:
# Evaluate the model
evaluation_results_roberta = trainer_roberta.evaluate()

# Access specific metrics
roberta_eval_loss = evaluation_results_roberta["eval_loss"]
roberta_accuracy = evaluation_results_roberta["eval_accuracy"]

# Print or use the metrics as needed
print(f"Evaluation Loss: {roberta_eval_loss}")
print(f"Accuracy: {roberta_accuracy}")

Evaluation Loss: 0.5847277045249939
Accuracy: 0.7981651376146789


In [15]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the T5 tokenizer and model
tokenizer_t5= T5Tokenizer.from_pretrained('google-t5/t5-small')
model_t5 = T5ForConditionalGeneration.from_pretrained('google-t5/t5-small', num_labels=2)

def preprocess_function(examples):
    return tokenizer_t5(examples["sentence"], padding="max_length", truncation=True)

tokenized_train_t5 = dataset['train'].map(preprocess_function, batched=True)
tokenized_val_t5 = dataset['validation'].map(preprocess_function, batched=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
tokenized_train_t5['input_ids']

[[7387,
  126,
  2829,
  2865,
  45,
  8,
  21555,
  3173,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
 

In [36]:
trainer_t5 = Trainer(
   model=model_t5,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_val,
   tokenizer=tokenizer_t5,
   data_collator=DataCollatorWithPadding(tokenizer=tokenizer_t5),
   compute_metrics=compute_metrics,
)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [37]:
trainer_t5.train()

ValueError: not enough values to unpack (expected 2, got 1)

In [41]:
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch

# Load the SST-2 dataset
#dataset = load_dataset("glue", "sst2")

# Load the GPT-2 tokenizer and model
tokenizer_gpt = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)  # Binary classification (positive or negative sentiment)


repo_name = "finetuning-gpt2"

training_args = TrainingArguments(
   output_dir=repo_name,
   learning_rate=2e-5,
   per_device_train_batch_size=1,
   per_device_eval_batch_size=1,
   num_train_epochs=2,
   weight_decay=0.01,
   save_strategy="epoch",
)
loss_callback = LossCallback()

trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_val,
   tokenizer=tokenizer_gpt,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
    callbacks=[loss_callback]
)


# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()

# Print the evaluation results
print(results)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss


KeyboardInterrupt: 

In [54]:
dataset['train']['sentence'][:3]

['hide new secretions from the parental units ',
 'contains no wit , only labored gags ',
 'that loves its characters and communicates something rather beautiful about human nature ']

In [42]:
naive_bayes_data = dataset

In [43]:
naive_bayes_data

DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 872
    })
    test: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 1821
    })
})

In [44]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
# Function to remove HTML tags and stopwords from text
def remove_tags_and_stopwords(text):
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)

    # Tokenize the text
    words = nltk.word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.lower() not in stop_words]

    # Join the filtered words back into a sentence
    processed_text = ' '.join(filtered_words)

    return processed_text

# Assuming dataset['train']['sentence'] is a list of sentences
naive_bayes_data_train = [remove_tags_and_stopwords(sentence) for sentence in naive_bayes_data['train']['sentence']]
naive_bayes_data_train_labels = naive_bayes_data['train']['label']
# Print a sample of preprocessed sentences
print(naive_bayes_data_train[:10])

[nltk_data] Downloading package punkt to /home/rathod.rak/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rathod.rak/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['hide new secretions parental units', 'contains wit , labored gags', 'loves characters communicates something rather beautiful human nature', 'remains utterly satisfied remain throughout', 'worst revenge-of-the-nerds clichés filmmakers could dredge', "'s far tragic merit superficial treatment", 'demonstrates director hollywood blockbusters patriot games still turn small , personal film emotional wallop .', 'saucy', "depressed fifteen-year-old 's suicidal poetry", "deeply thought ` right-thinking ' films"]


In [45]:
import pandas as pd
df = pd.DataFrame()
df['train_sent'] = naive_bayes_data_train
df['train_label'] = naive_bayes_data_train_labels
df_test = naive_bayes_data['validation']['sentence']
df_test_label = naive_bayes_data['validation']['label']

In [46]:
nltk.download('wordnet')
nltk.download('omw-1.4')

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    st = ""
    for w in w_tokenizer.tokenize(text):
        st = st + lemmatizer.lemmatize(w) + " "
    return st
df['train_sent'] = df.train_sent.apply(lemmatize_text)

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/rathod.rak/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/rathod.rak/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [47]:
from sklearn.preprocessing import LabelEncoder
reviews = df['train_sent'].values
labels = df['train_label'].values
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)

In [48]:
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(max_features = 3000)
X = vec.fit_transform(reviews)
vocab = vec.get_feature_names_out()
X = X.toarray()
word_counts = {}
for l in range(2):
    word_counts[l] = defaultdict(lambda: 0)
for i in range(X.shape[0]):
    l = labels[i]
    for j in range(len(vocab)):
        word_counts[l][vocab[j]] += X[i][j]

In [49]:
def laplace_smoothing(n_label_items, vocab, word_counts, word, text_label):
    a = word_counts[text_label][word] + 1
    b = n_label_items[text_label] + len(vocab)
    return math.log(a/b)

In [50]:
def group_by_label(x, y, labels):
    data = {}
    for l in labels:
        data[l] = x[np.where(y == l)]
    return data

def fit(x, y, labels):
    n_label_items = {}
    log_label_priors = {}
    n = len(x)
    grouped_data = group_by_label(x, y, labels)
    for l, data in grouped_data.items():
        n_label_items[l] = len(data)
        log_label_priors[l] = math.log(n_label_items[l] / n)
    return n_label_items, log_label_priors

In [51]:
def predict(n_label_items, vocab, word_counts, log_label_priors, labels, x):
    result = []
    for text in x:
        label_scores = {l: log_label_priors[l] for l in labels}
        words = set(w_tokenizer.tokenize(text))
        for word in words:
            if word not in vocab: continue
            for l in labels:
                log_w_given_l = laplace_smoothing(n_label_items, vocab, word_counts, word, l)
                label_scores[l] += log_w_given_l
        result.append(max(label_scores, key=label_scores.get))
    return result

In [52]:
import numpy as np
import math
from sklearn.metrics import accuracy_score

labels = [0,1]
n_label_items, log_label_priors = fit(reviews,encoded_labels,labels)
pred = predict(n_label_items, vocab, word_counts, log_label_priors, labels, df_test)
print("Accuracy of prediction on test set : ", accuracy_score(df_test_label,pred))
naive_bayes_test_accuracy = accuracy_score(df_test_label,pred)

Accuracy of prediction on test set :  0.7477064220183486


# LSTM

In [7]:
!pip install keras



In [8]:
!pip install tensorflow



In [30]:
type(X_train_pad)

numpy.ndarray

In [55]:
import tensorflow as tf
import numpy as np

In [56]:
from keras.models import Sequential
from keras.layers import Dense, Dropout,LSTM
from keras.optimizers import Adam
from keras.layers import Input
# Get the training and testing data
train_data = dataset["train"]
test_data = dataset["validation"]

# Extract sentences and labels
train_sentences = train_data["sentence"]
train_labels = train_data["label"]
test_sentences = test_data["sentence"]
test_labels = test_data["label"]

# Tokenize sentences
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(train_sentences)

# Convert sentences to sequences
train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

# Pad sequences
max_length = 100  # Choose an appropriate max length
input_dim = 1  # Dimension of each time step in the sequence
max_length = 100  # Maximum sequence length

train_pad = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, maxlen=max_length)
test_pad = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, maxlen=max_length)
# Reshape padded sequences to match LSTM input shape
train_pad_reshaped = train_pad.reshape(train_pad.shape[0], max_length, input_dim)
test_pad_reshaped = test_pad.reshape(test_pad.shape[0], max_length, input_dim)

# Convert labels to numpy arrays
train_labels = np.array(train_labels).astype(int)
test_labels = np.array(test_labels).astype(int)

inputs = Input(shape=(train_pad.shape[1],))  # Specify sequence length

model = Sequential()

model.add(LSTM(units = 50, return_sequences = True,  input_shape=(train_pad_reshaped.shape[1], train_pad_reshaped.shape[2])))
model.add(Dropout(0.2))

model.add(LSTM(units = 50, return_sequences = True))
model.add(Dropout(0.2))

model.add(LSTM(units = 50 ))
model.add(Dropout(0.2))

model.add(Dense(units = 1))

model.compile(optimizer = Adam(learning_rate=1e-5), loss = 'mean_squared_error', metrics=['accuracy'])

  super().__init__(**kwargs)


In [57]:
print(train_pad_reshaped.shape, train_pad.shape[1], train_labels.shape)

(67349, 100, 1) 100 (67349,)


In [None]:
model.fit(train_pad, train_labels, epochs = 50, batch_size = 32)

Epoch 1/50
[1m2105/2105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 67ms/step - accuracy: 0.4679 - loss: 0.3381
Epoch 2/50
[1m2105/2105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 67ms/step - accuracy: 0.5463 - loss: 0.2512
Epoch 3/50
[1m2105/2105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 67ms/step - accuracy: 0.5507 - loss: 0.2496
Epoch 4/50
[1m2105/2105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 67ms/step - accuracy: 0.5539 - loss: 0.2486
Epoch 5/50
[1m2105/2105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 67ms/step - accuracy: 0.5532 - loss: 0.2478
Epoch 6/50
[1m2105/2105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 67ms/step - accuracy: 0.5569 - loss: 0.2475
Epoch 7/50
[1m2105/2105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 68ms/step - accuracy: 0.5618 - loss: 0.2470
Epoch 8/50
[1m2105/2105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 67ms/step - accuracy: 0.5627 - loss: 0.2460


In [28]:
y_pred = model.predict(test_pad_reshaped)

[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step


In [29]:
from sklearn.metrics import mean_squared_error
lstm_mse = np.sqrt(mean_squared_error(test_labels, y_pred))

In [30]:
lstm_mse

0.4912908508822874

In [31]:
scores = model.evaluate(test_pad_reshaped, test_labels)
accuracy = scores[1]

[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.5985 - loss: 0.2420


In [32]:
accuracy

0.5928899049758911