# PLM

In [1]:
import os
import re
from concurrent.futures import ThreadPoolExecutor, as_completed

# Function to read and clean data for one review
def process_file(file_path, label):
    with open(file_path, 'r', encoding='utf-8') as file:
        review_text = file.read()
    return review_text, label

# Function to read the data in parallel
def load_data(data_dir):
    reviews = []
    labels = []
    tasks = []
    
    with ThreadPoolExecutor() as executor:
        for label_type in ['pos', 'neg']:
            
            dir_name = os.path.join(data_dir, label_type)
            label = 1 if label_type == 'pos' else 0
            
            for file_name in os.listdir(dir_name):
                file_path = os.path.join(dir_name, file_name)
                # Submit tasks to process files in parallel
                tasks.append(executor.submit(process_file, file_path, label))
        
        # Collect results as tasks complete
        for task in as_completed(tasks):
            review, label = task.result()
            reviews.append(review)
            labels.append(label)
    
    return reviews, labels

# Load train and test datasets
train_reviews, train_labels = load_data('train')
test_reviews, test_labels = load_data('test')

# View an example of a review
print(f"Label: {train_labels[0]}")
print(" ".join(train_reviews[0].split()[:100])) 


Label: 1
I felt compelled to write a review for Space Cobra as it has received a good score of 7.3 stars but only a few of the reviews at the time of me writing this were particularly positive. A strange situation and hopefully my positive review will point people towards this old and mostly forgotten Anime movie. Space cobra is the funky tale of a smuggler and rogue who becomes involved with the three sisters of an ancient and dead planet and an evil force who wants to harness the planets powers. This is an old movie and the animation shows,


In [None]:
import torch
import matplotlib.pyplot as plt
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.optim as optim
from torch.nn.functional import softmax

# Initialize DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the datasets
def tokenize_data(reviews, labels):
    encoding = tokenizer(reviews, padding=True, truncation=True, max_length=250, return_tensors='pt')
    labels = torch.tensor(labels)
    return TensorDataset(encoding['input_ids'], encoding['attention_mask'], labels)

train_dataset = tokenize_data(train_reviews, train_labels)
test_dataset = tokenize_data(test_reviews, test_labels)

# Create DataLoader for batching
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Load pre-trained DistilBERT model for sequence classification
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
epochs = 5


In [None]:
# Training function
def train_model(model, train_loader, optimizer, epochs):
    model.train()
    loss_values = []
    
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            input_ids, attention_mask, labels = [item for item in batch]
            
            # Zero the parameter gradients
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            
            # Backward pass and optimization
            loss.backward()
            optimizer.step()
        
        avg_loss = total_loss / len(train_loader)
        loss_values.append(avg_loss)
        print(f'Epoch {epoch + 1}/{epochs}, Loss: {avg_loss}')

    # Plotting the Training Convergence Plot
    plt.plot(range(1, len(loss_values) + 1), loss_values)
    plt.title('Training Loss Convergence')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.grid(True)
    plt.show()

# Train the model
train_model(model, train_loader, optimizer, epochs)

In [None]:
# Evaluation function
def evaluate_model(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = [item for item in batch]
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(softmax(logits, dim=1), dim=1)
            all_preds.extend(preds.numpy())
            all_labels.extend(labels.numpy())
    
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    print(f'Test Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1 Score: {f1:.4f}')

# Evaluate the model
evaluate_model(model, test_loader)