In [None]:
import bz2
import re
import shutil

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from tqdm import tqdm

### Extract bz2 files

In [None]:

def extract_bz2(archive_path, output_path):
    """Decompress the bz2 archive at ``archive_path`` into ``output_path``.

    The payload is streamed as raw bytes in chunks, so the file is never held
    in memory as a whole and no decode/encode round trip or newline
    translation is applied to the data.
    """
    with bz2.open(archive_path, 'rb') as compressed_file, open(output_path, 'wb') as output_file:
        shutil.copyfileobj(compressed_file, output_file)


# Paths of the extracted text files; later cells read from these.
path_train='./data/train.ft.txt'
extract_bz2('./data/train.ft.txt.bz2', path_train)
print('Train data export successful.')

path_test='./data/test.ft.txt'
extract_bz2('./data/test.ft.txt.bz2', path_test)
print('Test data export successful.')

### Check txt files

In [None]:
# Number of leading lines of the extracted training file to preview.
num_lines = 5  # Change this number as needed

# The file was written as UTF-8 by the extraction cell, so read it back with
# the same encoding instead of relying on the platform default.
with open(path_train, 'r', encoding='utf-8') as file:
    for i, line in enumerate(file):
        if i >= num_lines:
            break  # stop early; the rest of the file is never read
        print(line.strip())  # strip removes surrounding whitespace/newline

In [None]:
def split_label_and_text(line):
    """Split one ``__label__<c><text>`` line into ``(label, text)``.

    ``label`` is the single character that follows the first ``__label__``
    marker and ``text`` is everything after that character (leading space and
    trailing newline are kept, as before). Only the first marker is used as a
    separator, so review text that itself contains ``__label__`` is preserved
    instead of being cut off.
    """
    payload = line.split('__label__', 1)[1]
    return payload[0], payload[1:]


def load_labelled_file(path):
    """Read ``path`` and return ``(texts, labels)`` as two parallel lists.

    The file is streamed line by line rather than loaded with ``readlines()``,
    so only the parsed results are kept in memory. Blank lines are skipped.
    """
    texts, labels = [], []
    with open(path, 'r', encoding='utf-8') as file:
        # tqdm shows a running line count (no total is known while streaming).
        for line in tqdm(file):
            if not line.strip():
                continue
            label, text = split_label_and_text(line)
            labels.append(label)
            texts.append(text)
    return texts, labels


train, train_label = load_labelled_file(path_train)
test, test_label = load_labelled_file(path_test)

In [None]:
# Work on a small random subset so the following cells run faster.
import random
seed = 123


def _draw_indices(population_size, sample_size):
    """Re-seed ``random`` with ``seed`` and draw ``sample_size`` distinct indices."""
    random.seed(seed)
    return random.sample(range(population_size), sample_size)


# Training subset: the same indices select both the texts and their labels.
train_indices = _draw_indices(len(train), 4000)
train = [train[idx] for idx in train_indices]
train_label = [train_label[idx] for idx in train_indices]

# Test subset, drawn after re-seeding with the same seed.
test_indices = _draw_indices(len(test), 200)
test = [test[idx] for idx in test_indices]
test_label = [test_label[idx] for idx in test_indices]

In [None]:
import pickle

# Persist the sampled texts and labels, one pickle file per list.
for pickle_path, payload in (
    ('data/train.pkl', train),
    ('data/train_label.pkl', train_label),
    ('data/test.pkl', test),
    ('data/test_label.pkl', test_label),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)

In [None]:
def _read_pickle(pickle_path):
    """Return the object stored in the pickle file at ``pickle_path``."""
    with open(pickle_path, 'rb') as handle:
        return pickle.load(handle)


# Restore the sampled texts and labels written by the save cell.
train, train_label, test, test_label = (
    _read_pickle(pickle_path)
    for pickle_path in (
        'data/train.pkl',
        'data/train_label.pkl',
        'data/test.pkl',
        'data/test_label.pkl',
    )
)

In [None]:
def clean_text(text):
    """Normalise ``text`` for tokenization.

    Every character that is neither an ASCII letter nor whitespace is dropped
    (digits and punctuation included), each run of whitespace is collapsed to
    a single space, and the result is lower-cased. Leading/trailing whitespace
    is collapsed but not stripped.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    single_spaced = re.sub(r'\s+', ' ', letters_only)
    return single_spaced.lower()

In [None]:
# Sanity check: each split should have as many labels as texts.
for caption, collection in (
    ('Train Length', train),
    ('Train Label Length', train_label),
    ('Test Length', test),
    ('Test Label Length', test_label),
):
    print(caption, len(collection))

In [None]:
# Peek at the first sampled training example: (label, raw review text).
train_label[0],train[0]

In [None]:
# Same example with the text passed through clean_text, for comparison with the raw text.
train_label[0],clean_text(train[0])

In [None]:
# Run clean_text over every review; train and test become pandas Series.
train, test = (
    pd.DataFrame(split)[0].apply(clean_text)
    for split in (train, test)
)
train.head()

In [None]:
from transformers import BertTokenizer

# Load the tokenizer (vocabulary) of the pre-trained 'bert-base-uncased'
# checkpoint. tokenize_and_pad below reads this notebook-level variable.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and pad sequences
def tokenize_and_pad(text, max_len):
    """Encode ``text`` with the notebook-level ``tokenizer``.

    Special tokens are added, and the sequence is truncated or padded to
    exactly ``max_len`` tokens. Returns the ``(input_ids, attention_mask)``
    pair of the encoding as PyTorch tensors (``return_tensors='pt'``).
    """
    encode_options = dict(
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True,
    )
    encoding = tokenizer.encode_plus(text, **encode_options)
    return encoding["input_ids"], encoding["attention_mask"]

# Fixed sequence length that every review is truncated/padded to.
max_len = 512


def _encode_all(texts):
    """Return one (input_ids, attention_mask) pair per text, in order."""
    return [tokenize_and_pad(text, max_len) for text in texts]


# Tokenize the sequences
train_tokenized = _encode_all(train)
test_tokenized = _encode_all(test)


Convert the labels to integers and save the tokenized data to PyTorch files:

In [None]:
def _to_zero_based(raw_labels):
    """Convert label strings to ints and subtract 1 (e.g. '1' -> 0, '2' -> 1)."""
    return [int(raw) - 1 for raw in raw_labels]


train_label_int = _to_zero_based(train_label)
test_label_int = _to_zero_based(test_label)

In [None]:
import torch

# Turn the integer label lists into tensors.
train_labels_tensor = torch.tensor(train_label_int)
test_labels_tensor = torch.tensor(test_label_int)

# Store each split as a (tokenized examples, labels tensor) tuple.
for bundle, destination in (
    ((train_tokenized, train_labels_tensor), 'data/train_data.pt'),
    ((test_tokenized, test_labels_tensor), 'data/test_data.pt'),
):
    torch.save(bundle, destination)

# Modelling

In [None]:
import torch
from transformers import BertForSequenceClassification

# Load the BERT model
# NOTE(review): no fine-tuning step is visible in this notebook, so the
# classification head on top of 'bert-base-uncased' is presumably still
# untrained when the predictions below are made - confirm this is intended.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Load the saved data
train_data = torch.load('data/train_data.pt')
test_data = torch.load('data/test_data.pt')

# Separate the tokenized data and labels
train_tokenized, train_labels_tensor = train_data
test_tokenized, test_labels_tensor = test_data


def stack_encodings(tokenized, position):
    """Stack element ``position`` of every tokenized pair into one tensor.

    ``position`` is 0 for input ids and 1 for attention masks. Only the
    per-example axis (dim 1 after stacking) is squeezed, so the batch axis
    survives even when a split contains a single example; a bare
    ``squeeze()`` would drop it in that case.
    """
    return torch.stack([item[position] for item in tokenized]).squeeze(1)


# Convert the tokenized data into tensors
train_inputs = stack_encodings(train_tokenized, 0)
train_masks = stack_encodings(train_tokenized, 1)
test_inputs = stack_encodings(test_tokenized, 0)
test_masks = stack_encodings(test_tokenized, 1)

# Move everything to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
train_inputs = train_inputs.to(device)
train_masks = train_masks.to(device)
train_labels_tensor = train_labels_tensor.to(device)
test_inputs = test_inputs.to(device)
test_masks = test_masks.to(device)
test_labels_tensor = test_labels_tensor.to(device)

# Predict the labels for the test data in small batches so the whole test set
# never has to pass through the model in a single forward call.
EVAL_BATCH_SIZE = 32
model.eval()
logits_chunks = []
with torch.no_grad():
    for start in range(0, test_inputs.size(0), EVAL_BATCH_SIZE):
        stop = start + EVAL_BATCH_SIZE
        batch_outputs = model(test_inputs[start:stop], attention_mask=test_masks[start:stop])
        logits_chunks.append(batch_outputs.logits)

logits = torch.cat(logits_chunks, dim=0)
# Index of the highest logit per example is the predicted class.
predicted_labels = logits.argmax(dim=1)

print(predicted_labels)


Save the model with `save_pretrained` and reload it:

In [None]:
# Save the model under 'data/model'; the following cell reloads it from the
# same path with from_pretrained.
model.save_pretrained('data/model')

In [None]:
# Load the model back from 'data/model'.
# The freshly loaded model is moved to `device` again, matching what was done
# after the initial load, so it sits with the tensors already transferred there.
model = BertForSequenceClassification.from_pretrained('data/model').to(device)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# The metric functions work on NumPy arrays, so bring both tensors to the CPU first.
predicted_labels_np = predicted_labels.cpu().numpy()
test_labels_np = test_labels_tensor.cpu().numpy()

# Evaluate every metric with the same (true labels, predictions) argument order.
accuracy, precision, recall, f1 = (
    metric(test_labels_np, predicted_labels_np)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
