## Sentiment Analysis Logistic Regression using DistilBERT

In [1]:
import pandas as pd
import numpy as np

In [2]:
!pip install transformers
!pip install tqdm
!pip install sentence_transformers
!pip install datasets

Collecting sentence_transformers
  Downloading sentence_transformers-2.6.1-py3-none-any.whl (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.3/163.3 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m65.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m67.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14

In [3]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from sentence_transformers import SentenceTransformer

# Load the distilbert model trained for sentence embeddings
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.02k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/550 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# lets use GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [6]:
raw_hotel_reviews = pd.read_csv('/content/drive/MyDrive/SeminarML/hotel-reviews.csv')

In [7]:
raw_hotel_reviews = pd.read_csv('/content/drive/MyDrive/SeminarML/hotel-reviews.csv')

def dataset_prepare(dataset):
  dataset = dataset.copy()
  columns_to_drop = ['User_ID', 'Browser_Used', 'Device_Used']
  dataset = dataset.drop(columns=columns_to_drop, axis=1)
  dataset['Is_Response'].replace({'happy': 1, 'not happy': 0}, inplace=True)
  dataset = dataset.rename(columns={'Is_Response': 'label'})
  dataset = dataset.rename(columns={'Description': 'review'})

  return dataset

prepared_hotel_reviews = dataset_prepare(raw_hotel_reviews)

prepared_hotel_reviews.head()

Unnamed: 0,review,label
0,The room was kind of clean but had a VERY stro...,0
1,I stayed at the Crown Plaza April -- - April -...,0
2,I booked this hotel through Hotwire at the low...,0
3,Stayed here with husband and sons on the way t...,1
4,My girlfriends and I stayed here to celebrate ...,0


In [11]:
# Load Distilbert and move to GPU
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)

def sentences_to_embeddings(sentences, tokenizer, model, device):
    # Tokenize a batch of sentences and prepare the tensors
    inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    embeddings = outputs.last_hidden_state
    mean_embeddings = torch.mean(embeddings, dim=1)
    return mean_embeddings.cpu()  # Move embeddings back to CPU if necessary

reviews = prepared_hotel_reviews['review'].tolist()

# we extract the embeddings in batches to avoid memory issues
chunk_size = 100  # Adjust based on GPU memory
embeddings = []
for i in range(0, len(prepared_hotel_reviews), chunk_size):
    batch = prepared_hotel_reviews['review'][i:i + chunk_size].tolist()
    batch_embeddings = sentences_to_embeddings(batch, tokenizer, model, device)
    embeddings.extend(batch_embeddings)


# Convert embeddings to numpy for easy handling
embeddings = [embedding.detach().numpy() for embedding in embeddings]

# convert all embeddings to a 2D array that is our input
X= np.vstack(embeddings)

# our class labels which is the output of the classification task (ground truth)
Y = prepared_hotel_reviews['label'].tolist()

In [12]:
# to split our dataset into training and test sets
from sklearn.model_selection import train_test_split

#import the logistic regression from sklearn
from sklearn.linear_model import LogisticRegression

# import evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Train a logistic regression classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the classifier
print("Accuracy: %.3f" %(accuracy_score(y_test, y_pred)))
print("Precision: %.3f" %(precision_score(y_test, y_pred)))
print("Recall: %.3f" %(recall_score(y_test, y_pred)))
print("F1-Score: %.3f" %(f1_score(y_test, y_pred)))

Accuracy: 0.880
Precision: 0.898
Recall: 0.930
F1-Score: 0.913
