In [None]:
import nltk
# Fetch the WordNet corpus up front: a later cell builds a WordNetLemmatizer,
# which presumably needs this data at lemmatization time. The recorded cell
# output shows the package being reported as already up to date, i.e. the call
# is safe to repeat.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer

def preprocess_data(df):
  """Return a cleaned copy of ``df`` reduced to the text and label columns.

  The 'Review Text' column is normalised in three steps:
    1. lower-cased,
    2. stripped of every character that is neither a word character
       (letters, digits, underscore) nor whitespace,
    3. runs of whitespace collapsed to a single space.

  Missing values in 'Review Text' pass through the ``.str`` accessors
  unchanged (they stay missing).

  Args:
    df: DataFrame containing at least a 'Review Text' column. Every column
      other than 'Review Text' and 'Sentiment' is dropped from the result.

  Returns:
    A new DataFrame with the cleaned 'Review Text' (and 'Sentiment' when
    present), in the original column order. The input frame is NOT modified.

  Raises:
    KeyError: if ``df`` has no 'Review Text' column.
  """
  # Text Preprocessing
  # (Customize these steps as needed)
  text = df['Review Text'].str.lower()  # Convert to lowercase
  text = text.str.replace(r'[^\w\s]', '', regex=True)  # Remove punctuation
  text = text.str.replace(r'\s+', ' ', regex=True)  # Remove extra spaces

  columns_to_retain = ['Review Text', 'Sentiment']
  # assign() returns a new frame, so the caller's DataFrame is left untouched;
  # overwriting an existing column keeps its original position.
  cleaned = df.assign(**{'Review Text': text})
  return cleaned.drop(columns=cleaned.columns.difference(columns_to_retain))

In [None]:
# Load the undersampled review dataset (single leading slash: the original
# '//content/...' spelling only worked because the OS tolerates the extra one).
df = pd.read_csv('/content/undersampled.csv')

# Clean a copy so the raw frame `df` stays available unchanged.
processed_data1 = preprocess_data(df.copy())

In [None]:
import pandas as pd
from nltk.stem import WordNetLemmatizer

# Shared WordNet lemmatizer instance, created once here; preprocess_dataframe()
# below looks up this module-level name when it lemmatizes words.
lemmatizer = WordNetLemmatizer()

def preprocess_dataframe(df, lemmatize_word=None):
    """Lemmatize every review and return only the text and label columns.

    Missing reviews are replaced with the empty string before lemmatization.
    The previous implementation used ``df['Review Text'].fillna("", inplace=True)``,
    a chained in-place call that pandas does not guarantee to write back to
    ``df`` (it is a no-op under Copy-on-Write), which let NaN reach
    ``x.split()`` and crash. The fill is now done on the returned Series.

    Args:
        df: DataFrame with 'Review Text' and 'Sentiment' columns.
        lemmatize_word: optional callable mapping one word (str) to its
            lemma (str). Defaults to the ``lemmatize`` method of the
            module-level ``lemmatizer``.

    Returns:
        A new DataFrame with exactly the columns ['Review Text', 'Sentiment'],
        where each review is its whitespace-separated words lemmatized and
        re-joined with single spaces. The input frame is not modified.

    Raises:
        KeyError: if either required column is missing.
    """
    if lemmatize_word is None:
        # Resolved at call time so the notebook-level lemmatizer is only
        # needed when no custom callable is supplied.
        lemmatize_word = lemmatizer.lemmatize

    def _lemmatize_text(text):
        # split() with no argument also discards leading/trailing/repeated whitespace.
        return ' '.join(lemmatize_word(word) for word in text.split())

    # Handle missing values, then apply lemmatization
    review_text = df['Review Text'].fillna("").apply(_lemmatize_text)

    # Select only the required columns; assign() builds a new frame
    return df[['Review Text', 'Sentiment']].assign(**{'Review Text': review_text})


In [None]:
# Run the lemmatization pass over the cleaned reviews
processed_data1 = preprocess_dataframe(processed_data1)


In [None]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm

# The dataset is the `processed_data1` DataFrame prepared in the cells above.
# It has a 'Review Text' column (model input) and a 'Sentiment' column (label).


# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
train_df, test_df = train_test_split(
    processed_data1,
    test_size=0.2,
    random_state=42,
)

# Tokenizer for the uncased BERT base checkpoint
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased'
)

# Tokenize input data
def tokenize_data(data, max_length):
    """Encode the reviews in ``data`` for BERT and collect their labels.

    Uses the module-level ``tokenizer``. All reviews are encoded in a single
    batched tokenizer call (instead of one ``encode_plus`` call per row
    followed by ``torch.cat``); with ``padding='max_length'`` and
    ``truncation=True`` every row comes back with exactly ``max_length``
    token positions, so the stacked result is the same.

    Args:
        data: DataFrame with a "Review Text" column of strings and a
            "Sentiment" column whose values ``torch.tensor`` can convert
            (presumably integer class ids — confirm against the dataset).
        max_length: number of token positions per review, special tokens
            included; longer reviews are truncated, shorter ones padded.

    Returns:
        Tuple ``(input_ids, attention_masks, labels)``: the first two are
        tensors with one row per review and ``max_length`` columns, the
        third holds one label per review.
    """
    # The tokenizer only treats list/tuple inputs as a batch, so hand it a
    # plain list rather than the pandas Series.
    texts = data["Review Text"].tolist()

    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoded['input_ids']
    attention_masks = encoded['attention_mask']
    labels = torch.tensor(data["Sentiment"].tolist())

    return input_ids, attention_masks, labels

# Sizes shared by both splits: tokens per review and reviews per batch
max_length = 128
batch_size = 32

# Encode each split into (input ids, attention masks, labels)
train_input_ids, train_attention_masks, train_labels = tokenize_data(
    train_df, max_length
)
test_input_ids, test_attention_masks, test_labels = tokenize_data(
    test_df, max_length
)

# Training batches are drawn in random order
train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=batch_size,
)

# Test batches are read in their stored order
test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(
    test_data,
    sampler=test_sampler,
    batch_size=batch_size,
)

# Load pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2, # Assuming binary classification (change it accordingly)
    output_attentions=False,
    output_hidden_states=False
)

# Set device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set hyperparameters
epochs = 15
# The scheduler is stepped once per batch, so it needs the total batch count.
total_steps = len(train_dataloader) * epochs
# Use PyTorch's AdamW rather than the deprecated transformers.AdamW.
# weight_decay=0.0 is passed explicitly because torch defaults to 0.01,
# whereas the transformers implementation defaulted to 0.0.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)
# Linear decay of the learning rate to zero over total_steps, with no warmup.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Fine-tuning: every epoch makes one pass over all training batches
for epoch in range(epochs):
    model.train()
    total_loss = 0
    progress = tqdm(train_dataloader, desc="Epoch {}".format(epoch+1))
    for batch in progress:
        # Each batch is (input ids, attention masks, labels); move all three to the device
        batch_input_ids, batch_attention_masks, batch_labels = (
            tensor.to(device) for tensor in batch
        )

        # Clear gradients left over from the previous step
        model.zero_grad()

        # Supplying labels makes the model return the loss with its outputs
        outputs = model(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_masks,
            labels=batch_labels,
        )
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        # Cap the gradient norm at 1.0 before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    # Mean per-batch loss for the epoch just finished
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average training loss: {:.4f}".format(avg_train_loss))

# Evaluation: count correct predictions over the held-out split
model.eval()
total_accuracy = 0  # running count of correctly classified reviews
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        batch_input_ids, batch_attention_masks, batch_labels = (
            tensor.to(device) for tensor in batch
        )

        outputs = model(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_masks,
            labels=batch_labels,
        )

        # Predicted class = index of the largest logit in each row
        logits = outputs.logits
        preds = logits.argmax(dim=1)
        total_accuracy += (preds == batch_labels).sum().item()

# Fraction of test rows classified correctly
accuracy = total_accuracy / len(test_df)
print("Accuracy on test set: {:.2f}%".format(accuracy * 100))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1:  33%|███▎      | 87/264 [59:08<1:58:02, 40.02s/it]