In [None]:
# TODO insert validation into training. Stop training as soon as validation test score starts to decrease
# k-fold cross validation?

In [2]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification
from torch.optim import AdamW 
import os

In [4]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Read in the data from train.txt
with open("train.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

with open("test_just_reviews.txt", "r", encoding="utf-8") as f:
    lines_test = f.readlines()

In [3]:
# Define functions to handle the labels
def process(label): # labels can be "TRUTHFULPOSITIVE", "DECEPTIVEPOSITIVE",  "TRUTHFULNEGATIVE" or "DECEPTIVENEGATIVE"
    if label == "TRUTHFULPOSITIVE":
        return 0
    elif label == "DECEPTIVEPOSITIVE":
        return 1
    elif label == "TRUTHFULNEGATIVE":
        return 2
    elif label == "DECEPTIVENEGATIVE":
        return 3
    else:
        print("Error: label not found")
        return -1

def convert(label_int):
    if label_int == 0:
        return "TRUTHFULPOSITIVE"
    elif label_int == 1:
        return "DECEPTIVEPOSITIVE"
    elif label_int == 2:
        return "TRUTHFULNEGATIVE"
    elif label_int == 3:
        return "DECEPTIVENEGATIVE"
    else:
        print("Error: label not found")
        return -1

In [52]:
# Split the lines into labels and texts
train_labels = []
train_texts = []
for line in lines:
    label, text = line.strip().split("\t")
    train_labels.append(process(label))
    train_texts.append(text)

test_texts = []
for line in lines_test:
    text = line.strip()
    test_texts.append(text)

# train test split
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts[:500], train_labels[:500], test_size=.2)

In [53]:
import random
i = random.randint(0, len(train_texts)-1)
print(len(train_texts))
print(convert(train_labels[i]))
print(train_texts[i])

400
DECEPTIVEPOSITIVE
 If you are traveling to Chicago and need a place to stay, I highly recommend the Hotel Allegro Chicago. It was visiting time visiting Chicago for the first time on a business trip and a friend referred me to the Hotel Allegro. He said they were pet friendly,which was a huge plus. This is because I hate leaving my kitten muffin with a pet sitter every time I travel for business. Upon my arrival, I was met with friendly valet who treated me like I was the only customer who mattered. I walked in and saw a hotel like no other I have seen before or after. The lobby was decorated with modern furniture and paintings and looked like a place I could relax in. After I had entered and checked in, I saw the staircase, which looked like a testament to the staircase in the "Titanic". Frankly, I was amazed and I had not even entered my room yet. My room was a Queen Deluxe room and it was amazing. The bed had a blue headboard,which I have never seen before, and the colors worked

In [54]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [55]:
# print the encoding for the first training example
print(train_encodings.keys())
print(train_encodings['input_ids'][3])
print(train_encodings['attention_mask'][3])


dict_keys(['input_ids', 'attention_mask'])
[101, 2004, 1037, 6976, 20174, 2000, 3190, 1010, 1045, 3227, 2994, 2012, 1996, 11175, 2457, 1010, 2021, 2245, 1045, 2052, 3046, 1037, 2367, 24611, 3309, 1012, 1045, 2097, 2196, 2994, 2012, 2023, 3309, 2153, 1012, 1996, 2034, 2282, 2027, 2716, 2033, 2000, 1011, 1011, 2037, 3115, 2282, 1011, 1011, 2001, 1037, 1043, 10626, 7810, 9346, 1010, 2975, 2033, 7078, 19118, 13181, 20200, 999, 2941, 1996, 10479, 3309, 2282, 1045, 1005, 2310, 2042, 1999, 1999, 1996, 1057, 1012, 1055, 999, 1045, 10865, 2000, 1996, 2392, 4624, 1010, 2040, 2059, 2333, 2033, 2000, 1037, 1000, 12882, 1000, 2282, 1011, 1011, 2145, 3243, 2235, 1010, 2021, 6133, 8231, 2061, 1012, 1996, 5723, 2001, 4714, 1010, 2302, 1037, 18736, 1010, 2437, 2009, 3697, 2000, 2404, 2115, 11848, 5134, 1999, 1996, 2282, 1012, 1996, 3295, 3727, 2172, 2000, 2022, 9059, 1010, 2004, 2026, 2282, 2246, 2041, 3495, 3031, 1996, 3449, 1006, 8319, 10798, 1011, 4281, 5005, 1007, 1010, 1998, 1996, 2395, 2001, 3492

In [57]:
#  create a dataset class for the hotel reviews which inherits from torch Dataset

class HotelReviewsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)
    
train_dataset = HotelReviewsDataset(train_encodings, train_labels)
val_dataset = HotelReviewsDataset(val_encodings, val_labels) # dataset for validation

In [58]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [59]:
# Get the number of labels in the dataset
num_labels = len(set(train_labels))

# Modify the model to match the number of labels in the dataset
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)
model.to(device)
model.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [60]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

In [69]:
import torch.nn.functional as F

val_loader = DataLoader(val_dataset, batch_size=16)

def accuracy(outputs, labels):
    _, preds = torch.max(outputs, dim=1)
    return torch.tensor(torch.sum(preds == labels).item() / len(preds))

def validation(validation_dataloader):
  with torch.no_grad():
    loss_val_list = []
    preds_list = []
    accuracy_list = []
    accuracy_sum = 0
    for batch in tqdm(validation_dataloader):
      print(batch.keys())
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)

      outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
      loss = outputs[0]
      logits = F.softmax(outputs[1], dim=1)   # Taking the softmax of output
      _,preds = torch.max(logits, dim=1)      # Taking the predictions of our batch
      acc = accuracy(logits,labels)           # Calculating the accuracy of current batch
      accuracy_sum += acc                     # Taking sum of all the accuracies of all the batches. This sum will be divided by batch length to get mean accuracy for validation dataset

      loss_val_list.append(loss)
      preds_list.append(preds)
      accuracy_list.append(acc)

      # for the wrong predictions, print the text
      #for i in range(len(labels)):
      #  if labels[i] != preds[i]:
      #    print("Ground truth:" , convert(labels[i]))
      #    print("Prediction:", convert(preds[i]))
      #    print(val_texts[i])
      #    print("----------")

  mean_accuracy = accuracy_sum / len(validation_dataloader)
  return mean_accuracy

In [61]:
for epoch in range(15):
    loss_epoch = 0
    for batch in tqdm(train_loader):
        optim.zero_grad() # zero out the gradients
        input_ids = batch['input_ids'].to(device) # move the batch to the device
        attention_mask = batch['attention_mask'].to(device) # move the batch to the device
        labels = batch['labels'].to(device) # move the batch to the device
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels) # forward pass
        loss = outputs[0] # get the loss
        loss_epoch += loss
        loss.backward() # backward pass
        optim.step() # update the parameters
        optim.zero_grad() # zero out the gradients
    loss_epoch /= len(train_loader)
    print("Epoch: {} Loss: {}".format(epoch, loss_epoch))
    print("Validation accuracy:", validation(val_loader))
model.eval()

100%|██████████| 25/25 [06:43<00:00, 16.15s/it]
100%|██████████| 25/25 [07:03<00:00, 16.93s/it]
100%|██████████| 25/25 [06:39<00:00, 15.99s/it]


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [62]:
# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()

output_dir = './model_save/'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

# Save a trained model, configuration and tokenizer using `save_pretrained()`. 
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Good practice: save your training arguments together with the trained model
# torch.save(args, os.path.join(output_dir, 'training_args.bin'))


Saving model to ./model_save/


('./model_save/tokenizer_config.json',
 './model_save/special_tokens_map.json',
 './model_save/vocab.txt',
 './model_save/added_tokens.json',
 './model_save/tokenizer.json')

In [70]:
validation(val_loader)

  0%|          | 0/7 [00:00<?, ?it/s]

dict_keys(['input_ids', 'attention_mask', 'labels'])


 14%|█▍        | 1/7 [00:06<00:39,  6.66s/it]

Ground truth: TRUTHFULPOSITIVE
Prediction: DECEPTIVEPOSITIVE
This was a weekend trip for my friends and I. We booked a couple of rooms and asked in advanced for the reservations to be connected in some way. Our rooms were right next to one another as requested. As it was almost everyone's first time to Chicago, we had to do a bit of cold-searching for a hotel. And, our choice was next to perfect. I had an amazing time during my first visit to Chicago and it was largely due to this hotel. The customer service was AMAZING. Everyone was extremely nice and helpful. One of our rooms ordered late night room service and raved about how they felt they had ordered a 5 star lunch because it was as fresh as having ordered food at 2 pm, and delicious. My friend suggested they had a full-staff at 3 am because the food came up quickly too. In another of our rooms, housekeeping had accidentally discarded a CD when replacing the CD player with an iPod dock. The concierge had the exact CD purchased and

 29%|██▊       | 2/7 [00:12<00:31,  6.31s/it]

Ground truth: TRUTHFULPOSITIVE
Prediction: TRUTHFULNEGATIVE
This hotel is the best hotel ever in my opinion, and I really enjoy everything thing about it and I also have many different reasons why I like this hotel. From the entrance to every details on the inside all the way up to the rooms, everything seems to pocess some kind of style to it and not forgetting such a modern feel it has. I completely am astonished by this building and if I had to pick one to compare with this one I really would go for this one. It's like a person would have to see it to believe. It has a lot of modern designs. Thank you and this is my review and it's real.
----------
Ground truth: TRUTHFULPOSITIVE
Prediction: DECEPTIVEPOSITIVE
After booking a room over two months in advance at the Hotel Sofitel Chicago Watertower and confirming my reservation twice, I arrived to find that the front desk staff had 'no record' of my reservation and that the only rooms available were much more expensive than the room tha

 43%|████▎     | 3/7 [00:18<00:25,  6.26s/it]

Ground truth: TRUTHFULPOSITIVE
Prediction: TRUTHFULNEGATIVE
This hotel is the best hotel ever in my opinion, and I really enjoy everything thing about it and I also have many different reasons why I like this hotel. From the entrance to every details on the inside all the way up to the rooms, everything seems to pocess some kind of style to it and not forgetting such a modern feel it has. I completely am astonished by this building and if I had to pick one to compare with this one I really would go for this one. It's like a person would have to see it to believe. It has a lot of modern designs. Thank you and this is my review and it's real.
----------
Ground truth: TRUTHFULNEGATIVE
Prediction: DECEPTIVENEGATIVE
While the hotel certainly seems to look beautiful, the hotel is actually far from it. Even booking a room online was rather difficult, and i wasn't able to reach a representative upon calling any of the contact numbers. In general coming here is a bad decision despite how it loo

 57%|█████▋    | 4/7 [00:25<00:19,  6.53s/it]

Ground truth: DECEPTIVENEGATIVE
Prediction: TRUTHFULNEGATIVE
Thanks Sheraton Towers for the invite to enjoy your indoor pool while a guest at your hotel recently. I did not know that the skyline of Chicago could be so beautiful or an afternoon at the pool so enjoyable. I'll be back soon and look forward to a seeing Chicago again.
----------
Ground truth: TRUTHFULNEGATIVE
Prediction: TRUTHFULPOSITIVE
While the hotel certainly seems to look beautiful, the hotel is actually far from it. Even booking a room online was rather difficult, and i wasn't able to reach a representative upon calling any of the contact numbers. In general coming here is a bad decision despite how it looks, its a mistake i wont make again and you shouldn't either.
----------
Ground truth: TRUTHFULNEGATIVE
Prediction: DECEPTIVENEGATIVE
The Palmer House Hilton, while it looks good in pictures, and the outside, is actually a disaster of a hotel. When I went through, the lobby was dirty, my room hadn't been cleaned, and

 71%|███████▏  | 5/7 [00:32<00:12,  6.50s/it]

Ground truth: TRUTHFULPOSITIVE
Prediction: DECEPTIVEPOSITIVE
Thanks Sheraton Towers for the invite to enjoy your indoor pool while a guest at your hotel recently. I did not know that the skyline of Chicago could be so beautiful or an afternoon at the pool so enjoyable. I'll be back soon and look forward to a seeing Chicago again.
----------
Ground truth: TRUTHFULNEGATIVE
Prediction: DECEPTIVENEGATIVE
My husband and I stayed here at the Hard Rock Hotel in Chicago a few months back. I wasn't particularly impressed with their customer service. When we arrived to check in, the front desk clerk was quite rude and unfriendly. She was short with me when I asked her about the city's attractions and things to do. Our room wasn't all that great either. It was rather small and had a weird smell to it. I won't be staying here again.
----------
dict_keys(['input_ids', 'attention_mask', 'labels'])


 86%|████████▌ | 6/7 [00:39<00:06,  6.64s/it]

Ground truth: TRUTHFULNEGATIVE
Prediction: DECEPTIVENEGATIVE
My husband and I stayed here at the Hard Rock Hotel in Chicago a few months back. I wasn't particularly impressed with their customer service. When we arrived to check in, the front desk clerk was quite rude and unfriendly. She was short with me when I asked her about the city's attractions and things to do. Our room wasn't all that great either. It was rather small and had a weird smell to it. I won't be staying here again.
----------
Ground truth: TRUTHFULPOSITIVE
Prediction: DECEPTIVEPOSITIVE
The Ambassador Hotel is located in Downtown Chicago right off Lakeshore Drive in the heart of the hotel industry downtown. I would recommend you look to one of those other hotels if you are wanting a place to stay downtown. The rates at this hotel indicate that it would be a top of the line place to stay, when in fact, we had a pretty unpleasant experience. It started with someone who was obviously a new employee checking us in. I hav

100%|██████████| 7/7 [00:41<00:00,  5.87s/it]

Ground truth: TRUTHFULPOSITIVE
Prediction: TRUTHFULNEGATIVE
This hotel is the best hotel ever in my opinion, and I really enjoy everything thing about it and I also have many different reasons why I like this hotel. From the entrance to every details on the inside all the way up to the rooms, everything seems to pocess some kind of style to it and not forgetting such a modern feel it has. I completely am astonished by this building and if I had to pick one to compare with this one I really would go for this one. It's like a person would have to see it to believe. It has a lot of modern designs. Thank you and this is my review and it's real.
----------
Ground truth: DECEPTIVEPOSITIVE
Prediction: TRUTHFULPOSITIVE
Thanks Sheraton Towers for the invite to enjoy your indoor pool while a guest at your hotel recently. I did not know that the skyline of Chicago could be so beautiful or an afternoon at the pool so enjoyable. I'll be back soon and look forward to a seeing Chicago again.
--------




tensor(0.7679)