In [1]:
import pandas as pd

# Replace 'path_to_file' with the actual path to your file
df = pd.read_csv('mini_sample.xls')

df = df[['reviewText', 'overall']]
df['overall'] = df['overall'] - 1  # Adjust labels to start from 0


print(df)

                                             reviewText  overall
0     This is one my must have books. It is a master...      4.0
1     This book provides a reflection that you can a...      4.0
2     I first read THE PROPHET in college back in th...      4.0
3     A timeless classic.  It is a very demanding an...      4.0
4     Reading this made my mind feel like a still po...      4.0
...                                                 ...      ...
1021  To make my point I'll be as concise as possibl...      0.0
1022  This is a book that I can't put down, until 2 ...      3.0
1023  I started this book with the idea that it woul...      3.0
1024  This novel was beautifully written.  Amy Tan o...      3.0
1025  The Professor and the Madman deals with the ro...      3.0

[1026 rows x 2 columns]


In [2]:
df['overall'] = df['overall'].astype(int)
# Using the dtype attribute
overall_dtype = df['overall'].dtype
print("Data type of 'overall' column:", overall_dtype)

Data type of 'overall' column: int64


In [3]:
# Specify the path where you want to save the CSV file
file_path = "MINI_SAMPLE.csv"

# Save the DataFrame to a CSV file
df.to_csv(file_path, index=False)  # Set index=False to exclude row indices in the CSV

print(f"DataFrame saved to {file_path}")

DataFrame saved to MINI_SAMPLE.csv


In [4]:
!pip install datasets




In [5]:
from datasets import load_dataset
dataset = load_dataset("csv", data_files="MINI_SAMPLE.csv")

Generating train split: 0 examples [00:00, ? examples/s]

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['reviewText', 'overall'],
        num_rows: 1026
    })
})

In [7]:
from sklearn.model_selection import train_test_split
from datasets import DatasetDict
from datasets import Dataset, DatasetDict


# Assuming 'dataset' is your DatasetDict object containing the training split
train_dataset = dataset['train']

# Split the training dataset into train and test sets
train_data, val_data = train_test_split(train_dataset, test_size=0.2, random_state=42)

# Convert the splits to Dataset objects
train_dataset = Dataset.from_dict(train_data)
val_dataset = Dataset.from_dict(val_data)

# Now you have train_dataset and test_dataset containing the training and test data

# Create a new DatasetDict object to hold the splits
split_dataset = DatasetDict({'train': train_dataset, 'validation': val_dataset})


In [8]:
split_dataset

DatasetDict({
    train: Dataset({
        features: ['reviewText', 'overall'],
        num_rows: 820
    })
    validation: Dataset({
        features: ['reviewText', 'overall'],
        num_rows: 206
    })
})

In [9]:
!pip install transformers[torch]



In [10]:
from transformers import AutoTokenizer
# Assuming you have a DatasetDict object named dataset_dict
# and you want to apply the map function to the 'train' dataset


tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")


def tokenize_function(examples):
    return tokenizer(examples["reviewText"], padding="max_length", truncation=True)

# Now you can use the .map() method on the train_dataset
tokenized_datasets = split_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/820 [00:00<?, ? examples/s]

Map:   0%|          | 0/206 [00:00<?, ? examples/s]

In [11]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['reviewText', 'overall', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 820
    })
    validation: Dataset({
        features: ['reviewText', 'overall', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 206
    })
})

In [12]:
tokenized_datasets = tokenized_datasets.remove_columns(["reviewText"])
tokenized_datasets = tokenized_datasets.rename_column("overall", "labels")
tokenized_datasets.set_format("torch")

In [13]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 820
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 206
    })
})

In [14]:
small_train_dataset = tokenized_datasets["train"]
small_eval_dataset = tokenized_datasets["validation"]

In [15]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=3)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=3)

In [16]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [18]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [19]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [20]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/822 [00:00<?, ?it/s]

In [21]:
import evaluate

metric = evaluate.load("accuracy")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.7281553398058253}