In [None]:
import os
# Configure PyTorch's CUDA caching allocator through its environment variable.
# This is set in the first cell so it is in place before torch is imported.
# NOTE(review): the setting only concerns CUDA memory; it presumably has no
# effect while the notebook runs on the CPU device — confirm it is still needed.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "garbage_collection_threshold:0.6,max_split_size_mb:128"

In [None]:
import pandas as pd

# Location of the preprocessed review table, relative to the working directory.
PROCESSED_REVIEWS_CSV = 'data_preprocessing/data/processed_reviews.csv'

# Read the whole table into memory; every later cell derives its data from `df`.
df = pd.read_csv(PROCESSED_REVIEWS_CSV)

# Show the first rows as a quick sanity check of the loaded columns.
df.head()

In [None]:
# Partition the reviews into training, validation and testing frames.
# The split is done in two stages, so the effective proportions of the full
# data are roughly 64% training / 16% validation / 20% testing
# (0.8 * 0.8, 0.8 * 0.2 and 0.2 respectively) — not 80/10/10.
from sklearn.model_selection import train_test_split

# Stage 1: hold out 20% of all rows as the testing set.
train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Stage 2: carve 20% of the remaining rows out as the validation set.
train_df, val_df = train_test_split(train_val_df, test_size=0.2, random_state=42)

# Report how many rows ended up in each partition.
for split_name, split_df in (
    ("training", train_df),
    ("validation", val_df),
    ("testing", test_df),
):
    print(f"Number of records in {split_name} set: ", len(split_df))

In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset
from transformers import GPT2Tokenizer

class ReviewsDataset(Dataset):
    """Map-style dataset pairing a tokenized review with its four numeric targets.

    Args:
        data: DataFrame holding a 'lemmatized_text' column plus the numeric
            'stars', 'useful', 'funny' and 'cool' columns.
        transformer: Tokenizer-like object whose ``encode(text,
            add_special_tokens=..., truncation=..., max_length=...)`` returns a
            list of token ids (e.g. a Hugging Face tokenizer).
        max_length: Upper bound on the number of token ids kept per review.
            ``None`` (the default) lets the tokenizer fall back to its own
            model maximum, so over-long reviews can no longer exceed the
            model's position-embedding table.
    """

    # Target columns, in the order they appear in the returned label tensor.
    LABEL_COLUMNS = ('stars', 'useful', 'funny', 'cool')

    def __init__(self, data, transformer, max_length=None):
        self.data = data
        self.transformer = transformer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (reviews) in the underlying frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(token_ids, labels)`` for the row at position ``index``.

        Returns:
            A 1-D ``torch.long`` tensor of token ids (variable length, not
            padded) and a ``torch.float32`` tensor with the four targets in
            ``LABEL_COLUMNS`` order.
        """
        # Fetch the row once instead of re-indexing the frame for every column.
        row = self.data.iloc[index]

        # Truncate while encoding: without it a long review yields more ids
        # than the model has positions for and the forward pass fails.
        token_ids = self.transformer.encode(
            row['lemmatized_text'],
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
        )

        # Explicit integer dtype: an empty id list would otherwise become a
        # float tensor, which an embedding lookup rejects.
        input_tensor = torch.tensor(token_ids, dtype=torch.long)

        output_labels = torch.tensor(
            [float(row[column]) for column in self.LABEL_COLUMNS],
            dtype=torch.float32,
        )

        return input_tensor, output_labels




In [None]:
# Remove every row that has a missing value in any column. The validation and
# testing frames are cleaned the same way as the training frame: a NaN review
# text in any of them would otherwise reach the tokenizer when that split is
# iterated.
train_df = train_df.dropna(axis=0)
val_df = val_df.dropna(axis=0)
test_df = test_df.dropna(axis=0)

# Confirm, column by column, that the training frame is free of NaN in the
# fields the dataset reads.
for column in ('lemmatized_text', 'stars', 'useful', 'funny', 'cool'):
    if train_df[column].isna().any():
        print(f"The training dataFrame contains np.nan values in the {column} column.")
    else:
        print(f"The training dataFrame does not contain np.nan values in the {column} column.")

train_df.head()



In [None]:
# Display the lemmatized review text of the training frame for a visual check.
train_df['lemmatized_text']

In [None]:
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer

# Number of reviews per mini-batch; shared by the train/val/test dataloaders below.
batch_size = 16

# Pretrained 'gpt2' tokenizer; the datasets below use it to encode review text.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    
def collate_fn(batch):
    """Assemble a list of ``(token_ids, labels)`` samples into batch tensors.

    Args:
        batch: Sequence of samples; element 0 of each sample is a 1-D tensor
            of token ids and element 1 is its label tensor.

    Returns:
        A tuple ``(padded_inputs, padded_outputs)``: the token id sequences
        right-padded with 0 to the longest sequence in the batch (batch
        dimension first), and the label tensors stacked along a new first
        dimension.
    """
    token_seqs = []
    label_rows = []
    for sample in batch:
        token_seqs.append(sample[0])
        label_rows.append(sample[1])

    # NOTE(review): 0 is used as the pad id and no mask is returned, so padded
    # positions cannot be told apart from a real token with id 0 — confirm
    # this is acceptable for the model consuming these batches.
    return (
        pad_sequence(token_seqs, batch_first=True, padding_value=0),
        torch.stack(label_rows),
    )


def _make_split_loader(frame, shuffle):
    """Wrap one data split in a ReviewsDataset and a matching DataLoader."""
    dataset = ReviewsDataset(frame, tokenizer)
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=collate_fn,
    )
    return dataset, loader


# Only the training split is shuffled; validation and testing keep row order.
train_dataset, train_dataloader = _make_split_loader(train_df, shuffle=True)
val_dataset, val_dataloader = _make_split_loader(val_df, shuffle=False)
test_dataset, test_dataloader = _make_split_loader(test_df, shuffle=False)



In [None]:
import torch

# The run is pinned to the CPU. To use a GPU instead, select
# torch.device('cuda:0') when torch.cuda.is_available() returns True.
device = torch.device('cpu')

# Echo the selected device: heading on one line, device on the next.
print("Device available for running: ", device, sep="\n")

In [None]:
import torch
from transformers import GPT2Model

class GPT2SentimentAnalysis(torch.nn.Module):
    """Pretrained GPT-2 backbone with a linear head producing four regression outputs.

    The head maps one pooled hidden state per sequence to four values, which
    ``fit`` trains with mean-squared error against the batch labels.
    """

    def __init__(self):
        super().__init__()
        self.gpt2 = GPT2Model.from_pretrained('gpt2')
        self.output_layer = torch.nn.Linear(self.gpt2.config.hidden_size, 4)

    def forward(self, x_ins, attention_mask=None):
        """Predict the four targets for a batch of token id sequences.

        Args:
            x_ins: Integer tensor of token ids, indexed as (batch, position).
            attention_mask: Optional tensor of the same shape with 1 for real
                tokens and 0 for padding. Assumes right-padded sequences.

        Returns:
            Tensor with four values per sequence.
        """
        outputs = self.gpt2(x_ins, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state

        # GPT-2 attends causally and has no CLS token: the state at position 0
        # only ever sees the first token. The last position is the one that
        # has attended to the whole sequence, so pool from there.
        if attention_mask is None:
            pooled = last_hidden_state[:, -1, :]
        else:
            # Index of the last real (non-padding) token in every row; the
            # clamp keeps an all-padding row from indexing position -1.
            last_real = attention_mask.long().sum(dim=1).clamp(min=1) - 1
            rows = torch.arange(last_hidden_state.size(0), device=last_hidden_state.device)
            pooled = last_hidden_state[rows, last_real]

        return self.output_layer(pooled)

    def fit(self, train_dataloader, num_epochs, device):
        """Fine-tune the whole model with Adam (lr=1e-5) and MSE loss.

        Args:
            train_dataloader: Iterable yielding ``(token_ids, labels)`` tensor pairs.
            num_epochs: Number of full passes over ``train_dataloader``.
            device: Device the model and every batch are moved to.
        """
        self.to(device)
        # from_pretrained returns the backbone in eval mode; switch dropout
        # back on for fine-tuning using the standard nn.Module mechanism.
        super().train(True)
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-5)
        loss_fn = torch.nn.MSELoss(reduction='mean')
        for epoch in range(num_epochs):
            for batch in train_dataloader:
                review_text, labels = (t.to(device) for t in batch)
                # Call the module (not .forward) so registered hooks run.
                output = self(review_text)
                loss = loss_fn(output, labels)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                print('Epoch:', epoch, 'Batch loss:', loss.item())

    def train(self, train_dataloader=True, num_epochs=None, device=None, mode=None):
        """Dispatch between ``nn.Module.train(mode)`` and the training loop.

        Defining ``train(dataloader, ...)`` on a module shadows
        ``nn.Module.train(mode)``, which ``eval()`` and parent modules rely
        on. This method therefore honours both call styles:

        * ``model.train()``, ``model.train(False)``, ``model.train(mode=True)``
          and ``model.eval()`` behave as on any ``nn.Module`` and return ``self``.
        * ``model.train(train_dataloader, num_epochs, device)`` runs ``fit``
          and returns ``None``, as before.

        Raises:
            TypeError: If a dataloader is given without ``num_epochs`` and ``device``.
        """
        if mode is not None:
            return super().train(mode)
        if isinstance(train_dataloader, bool):
            return super().train(train_dataloader)
        if num_epochs is None or device is None:
            raise TypeError(
                "train(train_dataloader, num_epochs, device) requires num_epochs and device"
            )
        return self.fit(train_dataloader, num_epochs, device)

# Build the model; its constructor loads the pretrained 'gpt2' backbone via
# GPT2Model.from_pretrained and attaches a 4-output linear layer.
model = GPT2SentimentAnalysis()



In [None]:
# Number of full passes over the training dataloader.
num_epochs = 10

# Run the training loop defined on GPT2SentimentAnalysis: it moves the model
# to `device` and prints the loss of every batch.
model.train(train_dataloader, num_epochs, device)