<a href="https://colab.research.google.com/github/kailliang/Text-Classification-with-Transformers/blob/main/Transformer_Mitie.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [None]:
!nvidia-smi

Tue May 30 17:01:54 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   58C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import warnings
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler
from tqdm import tqdm
from transformers import AdamW, get_linear_schedule_with_warmup, BertTokenizer, BertForSequenceClassification
from transformers import DistilBertForSequenceClassification
from transformers import MobileBertForSequenceClassification
from transformers import AlbertForSequenceClassification
from transformers import RobertaForSequenceClassification

# Suppress every warning for the rest of the session.
# NOTE: this also hides deprecation warnings from pandas / PyTorch / transformers.
warnings.filterwarnings('ignore')

# Hyperparameters
random_state = 2023
epochs = 50
batch_size = 256
lr = 5e-5

# Load the dataset
data = pd.read_csv('/content/drive/MyDrive/job_data.csv')

# Parse the date columns as day-first datetimes; values that cannot be parsed become NaT
date_columns = ['reported_date', 'target_finish', 'actual_finish']
for col in date_columns:
    data[col] = pd.to_datetime(data[col], dayfirst=True, errors='coerce')

# Drop rows where any of the dates is missing or failed to parse
data = data.dropna(subset=date_columns)

# Preprocessing: give missing categorical values an explicit 'Unknown' category.
# The result is assigned back on purpose: `data[col].fillna(..., inplace=True)` works on
# an intermediate selection and does not update `data` under pandas Copy-on-Write.
data['raised_within_workhours'] = data['raised_within_workhours'].fillna('Unknown')
data['location_type'] = data['location_type'].fillna('Unknown')

# Encode categorical variables as integer codes.
# The same encoder object is re-fitted per column, so afterwards `le` only remembers
# the classes of 'location_type'.
le = LabelEncoder()
data['raised_within_workhours'] = le.fit_transform(data['raised_within_workhours'])
data.loc[:, 'location_type'] = le.fit_transform(data.loc[:, 'location_type'])

# Planned duration in whole days (the date columns are already datetime64 here,
# so no second pd.to_datetime pass is needed)
data['expected_duration'] = (
    (data['target_finish'] - data['reported_date']).dt.total_seconds() / (3600 * 24)
).astype(np.int64)

# Label: 1 when the job actually finished strictly before its target, otherwise 0
data['on_time'] = (data['target_finish'] > data['actual_finish']).astype(np.int64)

# Create time features for the reported and target timestamps
for prefix, column in (('reported', 'reported_date'), ('target', 'target_finish')):
    stamps = data[column].dt
    data[f'{prefix}_hour'] = stamps.hour
    data[f'{prefix}_day_of_week'] = stamps.dayofweek
    data[f'{prefix}_month'] = stamps.month
    data[f'{prefix}_year'] = stamps.year
# Convert to sentences
def convert_to_sentence(row):
    """Render one job record as a single English sentence.

    ``row`` is indexed by column name and must provide 'reported_date',
    'priority', 'location_type', 'raised_within_workhours' and
    'target_finish'. Each value is interpolated with its default string
    formatting.

    Other templates were experimented with and are not active: one built from
    the derived hour / day-of-week / month / year features (with and without
    location type and the workhours flag), and a shorter one carrying only the
    reported date, priority and target finish time.
    """
    fragments = (
        f"Job reported at date: {row['reported_date']}",
        f"with priority: {row['priority']}",
        f"location type: {row['location_type']}",
        f"was it raised within workhours: {row['raised_within_workhours']}",
        f"and target finish time: {row['target_finish']}.",
    )
    return ", ".join(fragments)


# Turn every row into its sentence representation
data['text'] = data.apply(convert_to_sentence, axis=1)

X = data['text']
y = data['on_time']

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# tokenizer = BertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)

# Encode each sentence to token ids (special tokens included). `truncation=True`
# guards against sentences longer than the tokenizer's maximum model length.
input_ids = [
    torch.tensor(tokenizer.encode(sent, add_special_tokens=True, truncation=True))
    for sent in X
]

# Right-pad every sequence to the longest one, using the tokenizer's own pad id
# (0 for bert-base-uncased) instead of a hard-coded 0.
pad_token_id = tokenizer.pad_token_id
input_ids = pad_sequence(input_ids, batch_first=True, padding_value=pad_token_id)

# Attention masks: 1.0 for real tokens, 0.0 for padding. A single tensor comparison
# replaces the element-by-element Python loop over the whole padded matrix.
attention_masks = (input_ids != pad_token_id).float()

# Split ids, masks and labels in ONE call so the three stay row-aligned by
# construction rather than by two separate calls happening to share a seed.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    input_ids, attention_masks, y, random_state=42, test_size=0.2)

# as_tensor is a no-op for values that are already tensors (torch.tensor would
# copy them and emit a warning) and still converts array-likes.
train_inputs = torch.as_tensor(train_inputs)
validation_inputs = torch.as_tensor(validation_inputs)
train_masks = torch.as_tensor(train_masks)
validation_masks = torch.as_tensor(validation_masks)

# Map labels to contiguous class indices; the encoder is fitted on the training split only
encoder = LabelEncoder()
train_labels_encoded = encoder.fit_transform(train_labels)
validation_labels_encoded = encoder.transform(validation_labels)

# CrossEntropyLoss expects integer (long) class indices
train_labels = torch.tensor(train_labels_encoded, dtype=torch.long)
validation_labels = torch.tensor(validation_labels_encoded, dtype=torch.long)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Compute class weights as the inverse frequency of each class in the training split
counts = np.bincount(train_labels_encoded)
class_weights = torch.tensor(1. / counts, dtype=torch.float).to(device)

# Create iterators over the data: shuffled batches for training, fixed order for validation
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

"""
models -------------------------------------------------------------------------------------------------------------------
"""

# Alternative backbones; enable exactly one `model = ...` line.
# NOTE(review): the tokenizer is loaded from 'bert-base-uncased' - switching the
# backbone presumably requires switching the tokenizer as well; confirm before use.

# model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(np.unique(train_labels_encoded)))

# model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(np.unique(train_labels_encoded)))

# model = MobileBertForSequenceClassification.from_pretrained('google/mobilebert-uncased', num_labels=len(np.unique(train_labels_encoded))) # 147M

# model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(np.unique(train_labels_encoded)))

# One output logit per distinct class seen in the training labels
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(np.unique(train_labels_encoded)))

"""
------------------------------------------------------------------------------------------------------------------------ 
"""
# Cross-entropy criterion weighted by the per-class weights in `class_weights`
loss_function = nn.CrossEntropyLoss(weight=class_weights)

# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and weight_decay
# are pinned to the values the transformers implementation used by default so the
# optimisation settings stay the same (PyTorch defaults are eps=1e-8, weight_decay=1e-2).
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0)

# Linear decay of the learning rate to zero over all optimisation steps, no warm-up
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader)*epochs)

# Move the model to the selected device before training
model.to(device)

# Fine-tune for `epochs` passes over the training data and evaluate on the
# validation data after every pass. After the loop, `predictions` and
# `true_labels` hold the validation results of the last epoch.
for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0.0
    for i, batch in enumerate(train_dataloader):
        # Batch-local names, so the module-level `input_ids` tensor is not overwritten
        batch_input_ids, batch_attention_mask, batch_labels = (t.to(device) for t in batch)

        optimizer.zero_grad()
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
        # Optimise the class-weighted criterion. Passing `labels=` to the model makes it
        # compute its own unweighted cross-entropy, which silently ignores `class_weights`.
        loss = loss_function(outputs.logits, batch_labels)
        total_loss += loss.item()  # accumulate the loss
        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule is defined per optimisation step, not per epoch

    avg_train_loss = total_loss / len(train_dataloader)  # mean loss over all batches

    # ---- validation pass ----
    model.eval()
    correct = 0
    total = 0
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in validation_dataloader:
            batch_input_ids, batch_attention_mask, batch_labels = (t.to(device) for t in batch)

            logits = model(batch_input_ids, attention_mask=batch_attention_mask).logits
            predicted = torch.argmax(logits, dim=1)

            total += batch_labels.size(0)
            correct += (predicted == batch_labels).sum().item()

            predictions.extend(predicted.cpu().numpy())
            true_labels.extend(batch_labels.cpu().numpy())

    # Report every 10th epoch and always the final one
    if epoch % 10 == 0 or epoch == epochs - 1:
        print(f'Epoch [{epoch+1}/{epochs}], Average training loss: {avg_train_loss:.4f}')
        print('Accuracy of the model on the validation set: %.2f %%' % (100 * correct / total))
        # Compute confusion matrix (rows: true labels, columns: predictions)
        cf_matrix = confusion_matrix(true_labels, predictions)
        print('Confusion Matrix: \n', cf_matrix)



# Compute precision (weighted by class support) of the last epoch's hard predictions
precision = precision_score(true_labels, predictions, average='weighted')
print('Precision: ', precision)

# A ROC curve needs a continuous score per sample. Thresholded 0/1 predictions give
# only one operating point, so the validation set is scored once more and the
# probability of the positive class (label 1; `on_time` is binary) is used instead.
model.eval()
roc_targets, roc_scores = [], []
with torch.no_grad():
    for val_ids, val_mask, val_labels in validation_dataloader:
        val_logits = model(val_ids.to(device), attention_mask=val_mask.to(device)).logits
        roc_scores.extend(torch.softmax(val_logits, dim=1)[:, 1].cpu().numpy())
        roc_targets.extend(val_labels.numpy())

fpr, tpr, _ = roc_curve(roc_targets, roc_scores)
roc_auc = auc(fpr, tpr)
print('AUC: ', roc_auc)

# Plot ROC curve against the chance diagonal
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Axis labels, shared by the rows (actual) and the columns (predicted).
# NOTE(review): if the counts below were copied from sklearn's confusion_matrix
# output, index 0 is class 0 (on_time == 0), so this order may be inverted - confirm.
labels = ['Positive', 'Negative']

# Confusion-matrix counts entered by hand
cm = np.array([[30, 54], [44, 164]])

# Wrap the counts in a labelled frame so the heatmap ticks carry the class names
df_cm = pd.DataFrame(data=cm, index=labels, columns=labels)

# Draw the annotated heatmap and title / label it through the returned Axes
plt.figure(figsize=(7,5))
heatmap_ax = sns.heatmap(df_cm, annot=True, fmt='d', cmap='Blues')
heatmap_ax.set_title('Confusion Matrix')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
plt.show()
