In [None]:
import torch
import re
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import random
from torch.utils.data import DataLoader, random_split
from transformers import BertForSequenceClassification, BertTokenizer, AdamW

In [None]:
# Uploaded 2 archives (scale_data.tar.gz, scale_whole_review.tar.gz) to the Colab environment from
# http://www.cs.cornell.edu/people/pabo/movie-review-data/
# Unzip them:

In [None]:
# Extract the per-reviewer scale data (subj / id / rating / label files) into ./scaledata
! tar -xvzf /content/scale_data.tar.gz

scaledata.README.1.0.txt
scaledata/Dennis+Schwartz/
scaledata/Dennis+Schwartz/subj.Dennis+Schwartz
scaledata/Dennis+Schwartz/id.Dennis+Schwartz
scaledata/Dennis+Schwartz/rating.Dennis+Schwartz
scaledata/Dennis+Schwartz/label.3class.Dennis+Schwartz
scaledata/Dennis+Schwartz/label.4class.Dennis+Schwartz
scaledata/James+Berardinelli/
scaledata/James+Berardinelli/subj.James+Berardinelli
scaledata/James+Berardinelli/id.James+Berardinelli
scaledata/James+Berardinelli/rating.James+Berardinelli
scaledata/James+Berardinelli/label.3class.James+Berardinelli
scaledata/James+Berardinelli/label.4class.James+Berardinelli
scaledata/Scott+Renshaw/
scaledata/Scott+Renshaw/subj.Scott+Renshaw
scaledata/Scott+Renshaw/id.Scott+Renshaw
scaledata/Scott+Renshaw/rating.Scott+Renshaw
scaledata/Scott+Renshaw/label.3class.Scott+Renshaw
scaledata/Scott+Renshaw/label.4class.Scott+Renshaw
scaledata/Steve+Rhodes/
scaledata/Steve+Rhodes/subj.Steve+Rhodes
scaledata/Steve+Rhodes/id.Steve+Rhodes
scaledata/Steve+Rhodes/rat

In [None]:
# Extract the full review texts; presumably this creates the scale_whole_review/ tree read below (TODO confirm)
! tar -xvzf /content/scale_whole_review.tar.gz

### Reading data block

In [None]:
from os import listdir

# load scale data into memory
def load_doc(filename):
    """Read a text file and return its contents as a list of lines.

    Args:
        filename: Path of the file to read.

    Returns:
        list[str]: The lines of the file, without line terminators.
    """
    # The context manager closes the handle even if reading raises.
    with open(filename, 'r') as file:
        return file.read().splitlines()

# load full reviews into memory
def load_review(filename):
    """Read a whole review file into a single string.

    Bytes that cannot be decoded are dropped (``errors='ignore'``) instead of
    raising.

    Args:
        filename: Path of the review file to read.

    Returns:
        str: The full text of the file.
    """
    # The context manager closes the handle even if reading raises.
    with open(filename, 'r', errors='ignore') as file:
        return file.read()

def process_docs(directory):
    """Load every file of one reviewer's scale-data folder into a DataFrame.

    Each file becomes one column whose name is the part of the file name
    before the first '.', and whose values are the file's lines. Files that
    share that prefix (e.g. ``label.3class.*`` and ``label.4class.*``) map to
    the same column, so whichever is listed last wins.

    Args:
        directory: Path of the reviewer folder (no trailing slash).

    Returns:
        pd.DataFrame: One column per file prefix plus a ``reviewer`` column.
        An empty folder yields an empty DataFrame.
    """
    df = pd.DataFrame()
    filenames = listdir(directory)
    # walk through all files in the folder
    for filename in filenames:
        path = directory + '/' + filename
        df[filename.split('.')[0]] = load_doc(path)

    if filenames:
        # The reviewer name is the LAST dot-separated component: taking
        # component [1] would return '3class'/'4class' whenever a
        # 'label.Nclass.<reviewer>' file happens to be listed last.
        df['reviewer'] = filenames[-1].rsplit('.', 1)[-1]

    return df

def add_reviews_text(directory, reviewer):
    """Load all full-text reviews of one reviewer.

    Reads every file in ``<directory>/<reviewer>/txt.parag``.

    Args:
        directory: Root folder of the whole-review data; a trailing slash is
            optional.
        reviewer: Name of the reviewer sub-folder.

    Returns:
        pd.DataFrame: Columns ``id`` (file name up to the first '.') and
        ``review_text`` (file contents), one row per file, indexed 0..n-1.
    """
    review_dir = os.path.join(directory, reviewer, 'txt.parag')
    # Build all records first and create the frame once; concatenating
    # inside the loop copies the growing frame on every iteration.
    records = [
        {
            'id': filename.split('.')[0],
            'review_text': load_review(os.path.join(review_dir, filename)),
        }
        for filename in listdir(review_dir)
    ]
    # Explicit columns keep the schema stable even when the folder is empty.
    return pd.DataFrame(records, columns=['id', 'review_text'])


### Text processing

In [None]:
def text_preprocessing(s):
    """Clean a raw text string.

    Steps, in order:
    - remove URLs (anything starting with "http")
    - replace the HTML entity '&amp;' with '&'
    - remove "@name" mentions, including one at the very end of the text
    - replace ; : | bullet, left guillemet and newline characters with a space
    - drop a period that directly follows ! , : ; or ?
    - remove "#hashtag" tokens
    - collapse whitespace runs to one space and strip both ends

    Args:
        s: The text to clean.

    Returns:
        str: The cleaned text.
    """
    # URLs and '&amp;' must be handled BEFORE ':' and ';' are stripped,
    # otherwise "http://..." and "&amp;" are broken apart and never match.
    s = re.sub(r"http\S+", "", s)
    s = re.sub(r'&amp;', '&', s)

    # Remove @name (up to the next whitespace or the end of the string)
    s = re.sub(r'@\S*(?:\s|$)', ' ', s)

    # Remove some special characters
    s = re.sub(r'[;:|•«\n]', ' ', s)

    # "!." -> "!", ",." -> ",", ... (':' and ';' are already gone by now,
    # they are kept in the list only for parity with the original rule set)
    for symb in ["!", ",", ":", ";", "?"]:
        s = re.sub(re.escape(symb) + r"\.", symb, s)

    # Remove hashtags
    s = re.sub(r"#\S+", "", s)

    # Normalise whitespace last so the removals above leave no double spaces
    return re.sub(r'\s+', ' ', s).strip()

In [None]:
def set_seed(seed: int = 42) -> None:
    """Seed the NumPy, Python and PyTorch generators and pin cuDNN settings.

    Args:
        seed: Value passed to every seeding call and stored, as a string,
            in the ``PYTHONHASHSEED`` environment variable.
    """
    # One seed for every generator, in the same order as before.
    seeders = (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)

    # cuDNN backend: request deterministic kernels and disable autotuning.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # Record the seed as the hash seed in the environment.
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")


In [None]:
# One sub-folder per reviewer in the extracted scale data
reviewers = listdir('/content/scaledata')
reviewers

['Steve+Rhodes', 'Dennis+Schwartz', 'James+Berardinelli', 'Scott+Renshaw']

In [None]:
# combine data to one dataframe
# Collect the per-reviewer frames and concatenate once at the end; calling
# pd.concat on a growing frame inside the loop re-copies it every iteration.
scale_frames = []
review_frames = []

for each_reviewer in tqdm(reviewers):
    files_for_each_reviewer = '/content/scaledata/' + each_reviewer

    # ratings / labels / subjective extracts
    df = process_docs(files_for_each_reviewer)
    scale_frames.append(df)

    # read reviews text
    df_review = add_reviews_text(directory='/content/scale_whole_review/', reviewer=each_reviewer)
    review_frames.append(df_review)

# pd.concat raises on an empty list, so fall back to an empty frame
scale_data = pd.concat(scale_frames, axis=0) if scale_frames else pd.DataFrame()
scale_whole_review = pd.concat(review_frames, axis=0) if review_frames else pd.DataFrame()


100%|██████████| 4/4 [00:04<00:00,  1.01s/it]


In [None]:
# Attach the full review text to each scale-data row; rows without a match keep NaN text
data = pd.merge(scale_data, scale_whole_review, how='left', on='id')

In [None]:
# ratings are read from text files as strings; cast them to float because
# the task is treated as regression
data = data.astype({'rating': float})

In [None]:
# Sanity check of the merged frame: first rows, overall shape and column dtypes
data.head(), data.shape, data.dtypes

(  label                                               subj     id  rating  \
 0     0  this bit of lame physical humor is typical of ...  11790     0.1   
 1     0  some comedies are not funny no matter how hard...  17628     0.1   
 2     0  you'll check your watch frequently waiting for...  18971     0.1   
 3     0  freddy got fingered , written and directed by ...  28767     0.1   
 4     0  it is a mess of a movie . if you are so advent...   3579     0.1   
 
        reviewer                                        review_text  
 0  Steve+Rhodes  As the twin surfer dudes, Stew and Phil Deedle...  
 1  Steve+Rhodes  Some comedies are not funny no matter how hard...  
 2  Steve+Rhodes  You'll check your watch frequently waiting for...  
 3  Steve+Rhodes  FREDDY GOT FINGERED, written and directed by i...  
 4  Steve+Rhodes  COLOR OF NIGHT is an instant candidate for all...  ,
 (5006, 6),
 label           object
 subj            object
 id              object
 rating         float64
 

In [None]:
# Columns consumed by training: the regression target and the cleaned text.
# Note that 'label' (the class label loaded from file) is overwritten with the rating.
data = data.assign(
    label=data['rating'],
    text=data['subj'].apply(text_preprocessing),
)

In [None]:
# Preview the frame after adding the 'label' (= rating) and cleaned 'text' columns
data.head()

Unnamed: 0,label,subj,id,rating,reviewer,review_text,text
0,0.1,this bit of lame physical humor is typical of ...,11790,0.1,Steve+Rhodes,"As the twin surfer dudes, Stew and Phil Deedle...",this bit of lame physical humor is typical of ...
1,0.1,some comedies are not funny no matter how hard...,17628,0.1,Steve+Rhodes,Some comedies are not funny no matter how hard...,some comedies are not funny no matter how hard...
2,0.1,you'll check your watch frequently waiting for...,18971,0.1,Steve+Rhodes,You'll check your watch frequently waiting for...,you'll check your watch frequently waiting for...
3,0.1,"freddy got fingered , written and directed by ...",28767,0.1,Steve+Rhodes,"FREDDY GOT FINGERED, written and directed by i...","freddy got fingered , written and directed by ..."
4,0.1,it is a mess of a movie . if you are so advent...,3579,0.1,Steve+Rhodes,COLOR OF NIGHT is an instant candidate for all...,it is a mess of a movie . if you are so advent...


In [None]:
# Write to the same absolute path the fine-tuning cell reads from
# ("/content/train_data.csv") instead of relying on the working directory.
data.to_csv('/content/train_data.csv')

In [None]:
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification, BertTokenizer, AdamW

# Seed every RNG before the dataset is sampled and the model is created
set_seed(555)

class MovieReviewDataset(Dataset):
    """Torch dataset of (input_ids, attention_mask, label) built from a CSV.

    The CSV must contain a ``label`` column (regression target) and a
    ``text`` column; other columns are ignored.

    Args:
        data_path: Path of the CSV file.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            padding='max_length', truncation=True, max_length=...)`` and
            returning a mapping with ``input_ids`` and ``attention_mask``.
        max_seq_length: Length every example is padded / truncated to.
        sample_size: Number of rows to draw at random to keep fine-tuning
            fast. Capped at the number of available rows; ``None`` keeps
            every row in file order.
        random_state: Forwarded to ``DataFrame.sample``; ``None`` uses the
            global NumPy random state.
    """

    def __init__(self, data_path, tokenizer, max_seq_length=512, sample_size=1000, random_state=None):
        frame = pd.read_csv(data_path, usecols=['label', 'text'])
        # Empty CSV fields are read back as NaN, which is not a string;
        # turn them into empty strings before they reach the tokenizer.
        frame['text'] = frame['text'].fillna('')
        if sample_size is not None:
            # Cap the sample so small files do not raise in DataFrame.sample.
            frame = frame.sample(min(sample_size, len(frame)), random_state=random_state)
        self.data = frame
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length

    def __len__(self):
        """Return the number of (sampled) rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for row ``idx``."""
        row = self.data.iloc[idx]
        label = torch.tensor(row['label'], dtype=torch.float)

        tokens = self.tokenizer(row['text'], return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_seq_length)
        # Drop only the leading batch axis; a bare squeeze() would also
        # remove the sequence axis when max_seq_length == 1.
        input_ids = tokens['input_ids'].squeeze(0)
        attention_mask = tokens['attention_mask'].squeeze(0)

        return input_ids, attention_mask, label

class SentimentAnalysisBertFineTuner:
    """Fine-tune a BERT sequence-classification model as a rating regressor.

    The model is created with ``num_labels=1`` and trained with an MSE loss
    against the ``label`` column of the training CSV.

    Args:
        model_name: Name or path passed to ``from_pretrained`` for both the
            model and the tokenizer.
        data_path: CSV file handed to ``MovieReviewDataset``.
        max_seq_length: Token length every input is padded / truncated to.
        window_size: Size, in CHARACTERS, of the text windows used by
            ``tokenize`` / ``predict``.
        overlap: Number of characters shared by consecutive windows.
        learning_rate: Optimizer learning rate.
    """

    def __init__(self, model_name, data_path, max_seq_length=512, window_size=512, overlap=32, learning_rate=2e-5):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = BertForSequenceClassification.from_pretrained(model_name, num_labels=1)
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model.to(self.device)
        self.train_dataset = MovieReviewDataset(data_path, self.tokenizer, max_seq_length=max_seq_length)
        self.max_seq_length = max_seq_length
        self.window_size = window_size
        self.overlap = overlap
        self.learning_rate = learning_rate

    def tokenize(self, text):
        """Split ``text`` into overlapping character windows and tokenize each.

        NOTE(review): windows are slices of the raw string, so ``window_size``
        and ``overlap`` count characters, not tokens - confirm this is intended.

        Args:
            text: The string to tokenize.

        Returns:
            tuple: ``(tokens_list, input_ids_list, attention_mask_list)`` with
            one entry per window; the tensors are already on ``self.device``.
            All three lists are empty for an empty ``text``.

        Raises:
            ValueError: If ``window_size`` is not greater than ``overlap``
                (the window would never advance).
        """
        step = self.window_size - self.overlap
        if step <= 0:
            raise ValueError("window_size must be greater than overlap")

        tokens_list = []
        input_ids_list = []
        attention_mask_list = []

        start_idx = 0
        while start_idx < len(text):
            end_idx = start_idx + self.window_size
            window_text = text[start_idx:end_idx]

            tokens = self.tokenizer(window_text, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_seq_length)

            tokens_list.append(tokens)
            # squeeze(0): drop only the batch axis added by return_tensors='pt'
            input_ids_list.append(tokens['input_ids'].squeeze(0).to(self.device))
            attention_mask_list.append(tokens['attention_mask'].squeeze(0).to(self.device))

            # This window already reaches the end of the text; a further
            # window would only repeat characters covered here and would
            # skew the averaged prediction.
            if end_idx >= len(text):
                break
            start_idx += step

        return tokens_list, input_ids_list, attention_mask_list

    def custom_loss_function(self, predictions, targets):
        """Return the mean squared error between flattened predictions and targets."""
        return torch.nn.functional.mse_loss(predictions.view(-1), targets.view(-1))

    def fine_tune(self, epochs=3, validation_split=0.2, batch_size=8):
        """Train the model and report train / validation MSE after each epoch.

        Args:
            epochs: Number of passes over the training split.
            validation_split: Fraction of the dataset held out for validation.
            batch_size: Number of examples per optimisation step.
        """
        # Split data into training and validation sets
        total_size = len(self.train_dataset)
        train_size = int((1.0 - validation_split) * total_size)
        val_size = total_size - train_size
        train_dataset, val_dataset = random_split(self.train_dataset, [train_size, val_size])

        # torch's AdamW replaces the deprecated transformers.AdamW;
        # weight_decay=0.0 matches the default of the transformers version.
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=self.learning_rate, weight_decay=0.0)
        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        for epoch in range(epochs):
            mean_loss = self._train_one_epoch(train_dataloader, optimizer)
            val_mse = self._evaluate(val_dataloader)
            print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {mean_loss}, Validation MSE: {val_mse}")

    def _train_one_epoch(self, dataloader, optimizer):
        """Run one optimisation pass; return the mean per-example training loss."""
        self.model.train()
        total_loss = 0.0
        num_examples = 0

        for input_ids, attention_mask, label in dataloader:
            input_ids = input_ids.to(self.device)
            attention_mask = attention_mask.to(self.device)
            label = label.to(self.device)

            optimizer.zero_grad()
            predictions = self.model(input_ids, attention_mask=attention_mask).logits
            loss = self.custom_loss_function(predictions, label)
            # Back-propagate the graph-attached loss. Re-wrapping a Python
            # float in a new tensor would cut the graph and leave every
            # model parameter without a gradient.
            loss.backward()
            optimizer.step()

            batch_examples = label.numel()
            total_loss += loss.item() * batch_examples
            num_examples += batch_examples

        return total_loss / num_examples if num_examples else float('nan')

    def _evaluate(self, dataloader):
        """Return the MSE over ``dataloader`` (NaN when it yields no examples)."""
        # eval() disables dropout so the validation score is deterministic
        self.model.eval()
        squared_error = 0.0
        num_examples = 0

        with torch.no_grad():
            for input_ids, attention_mask, label in dataloader:
                input_ids = input_ids.to(self.device)
                attention_mask = attention_mask.to(self.device)
                label = label.to(self.device)

                predictions = self.model(input_ids, attention_mask=attention_mask).logits.view(-1)
                squared_error += torch.sum((predictions - label.view(-1)) ** 2).item()
                num_examples += label.numel()

        return squared_error / num_examples if num_examples else float('nan')

    def predict(self, text):
        """Predict a rating for ``text`` as the mean over its windows.

        Args:
            text: Non-empty string to score.

        Returns:
            float: Average of the model outputs over all windows of ``text``.

        Raises:
            ValueError: If ``text`` is empty.
        """
        if not text:
            raise ValueError("text must be a non-empty string")

        self.model.eval()
        _, input_ids_list, attention_mask_list = self.tokenize(text)

        all_predictions = []
        # Inference only: no autograd graph is needed
        with torch.no_grad():
            for input_ids, attention_mask in zip(input_ids_list, attention_mask_list):
                predictions = self.model(input_ids.unsqueeze(0), attention_mask=attention_mask.unsqueeze(0)).logits
                all_predictions.append(predictions.item())

        return sum(all_predictions) / len(all_predictions)

    def save_model(self, save_path):
        """Save the model and the tokenizer to ``save_path``."""
        self.model.save_pretrained(save_path)
        self.tokenizer.save_pretrained(save_path)



# Example usage:
# Pretrained BERT checkpoint, training CSV and hyperparameters for this run
model_name = "bert-base-uncased"
data_path = "/content/train_data.csv"
max_seq_length = 512
window_size = 512
overlap = 32
learning_rate = 3e-5

# Build the fine-tuner (loads the model, the tokenizer and the dataset)
sentiment_tuner = SentimentAnalysisBertFineTuner(
    model_name=model_name,
    data_path=data_path,
    max_seq_length=max_seq_length,
    window_size=window_size,
    overlap=overlap,
    learning_rate=learning_rate,
)

# Train for 3 epochs, holding out 10% of the examples for validation
sentiment_tuner.fine_tune(epochs=3, validation_split=0.1)

# Persist the fine-tuned model and its tokenizer
sentiment_tuner.save_model("fine_tuned_model")


Random seed set as 555


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Train Loss: 0.14892100065551206, Validation MSE: 0.14338573813438416
Epoch 2/3, Train Loss: 0.14787406581520954, Validation MSE: 0.13723987340927124
Epoch 3/3, Train Loss: 0.151616733649898, Validation MSE: 0.13571403920650482


### Example prediction

In [None]:
# Score the cleaned text of the row labelled 500
sentiment_tuner.predict(data.loc[500, 'text'])

0.2810274809598923

In [None]:
# The text that was scored above
data.loc[500, 'text']

"it's the 1930s -- the time of the great depression . this might lead one to believe that the film is a hard-hitting indictment of politics run amuck , but it isn't . everything is rushed . robbins then quickly cuts to an entirely different group of actors telling another part of the story . among the many stereotyped characters in the cluttered script are a giddy liberal socialite , countess lagrange ( vanessa redgrave ) , who gets her strike news delivered on a silver platter by her servants . none of the characters are compelling , and , if the movie had aired on television , i'm sure i would have turned it off long before it was over . very much in the spirit of illuminata , which opened this past summer , the fast-paced cradle will rock takes a comedic backstage look at the production of a play . and like illuminata , although it has its moments , it never comes together in anything approaching a satisfying whole . i would be inclined to suggest that you just wait for the video an

In [None]:
# Ground-truth rating of the same row, for comparison with the prediction
data.loc[500, 'rating']

0.5