<a href="https://colab.research.google.com/github/katie291100/learn-nlp/blob/main/43_with_scheduleracc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Environment setup

In [None]:
!pip install transformers
!pip install nlpaug
!pip install sacremoses

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m56.7 MB/s[0m eta [36m0:00:00[0m
Col

In [None]:
from transformers import BertForSequenceClassification, AutoTokenizer, BertModel, get_linear_schedule_with_warmup
from torch import nn, cuda, tensor, max, optim, no_grad, device as torchDevice
import torch.nn.functional as nnf
import pandas as pd
import os
from torch.utils.data import DataLoader
from torch.utils.data import DataLoader, Dataset
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
import nlpaug
import nlpaug.augmenter.word as naw

In [None]:
cuda.empty_cache()

# Set-up Training Values and Model

In [13]:
# Set up parameters for training
bert_model_name = 'bert-base-uncased'
num_classes = 5
batch_size = 16
num_epochs = 20
learning_rate = 4e-5

class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(self.bert.config.hidden_size, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, num_classes)
        # self.rnn = nn.LSTM(256, 128, 2)

    def forward(self, input_ids, attention_mask):
            output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            x = self.fc1(output.pooler_output)
            x=self.dropout(x)
            # x = nnf.relu(self.rnn(x))
            x = self.fc2(x)
            x=self.dropout(x)
            logits = self.fc3(x)
            return logits

device = torchDevice("cuda" if cuda.is_available() else "cpu")
model = BERTClassifier(bert_model_name, num_classes).to(device)



# Import data




In [None]:
folder_path = 'drive/MyDrive/storypoint/IEEE TSE2018/dataset'
csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]
print(csv_files)

dataframes = []

for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    dataframes.append(df)

combined_df = pd.concat(dataframes, ignore_index=True)

['talendesb.csv', 'springxd.csv', 'clover.csv', 'aptanastudio.csv', 'datamanagement.csv', 'mulestudio.csv', 'moodle.csv', 'jirasoftware.csv', 'talenddataquality.csv', 'bamboo.csv', 'duracloud.csv', 'mesos.csv', 'usergrid.csv', 'titanium.csv', 'mule.csv', 'appceleratorstudio.csv']


In [None]:
pd.DataFrame(combined_df["storypoint"].value_counts())

Unnamed: 0,storypoint
5,4319
1,4225
3,3973
2,3406
8,3112
...,...
58,1
94,1
96,1
46,1


In [None]:
combined_df

Unnamed: 0,issuekey,title,description,storypoint
0,TESB-1,Investigation: S1 Improved user experience wit...,,5
1,TESB-2,Investigate: S2 Business Service Implementatio...,,5
2,TESB-3,Investigate: S3 Improved XML Data Handling Com...,,5
3,TESB-4,Investigate: S7 TIS RTx based on the Service F...,,5
4,TESB-5,Investigate: github migration new opportunitie...,,5
...,...,...,...,...
23308,TISTUD-8536,Support per-platform moduleAPIVersions,Studio uses the moduleAPIVersion to filter the...,5
23309,TISTUD-8538,Publishing: Block field input until 'Use Exist...,h5.Description When publishing an applicati...,3
23310,TISTUD-8540,Installation error while opening studio in the...,*To reproduce:* # Keep node version < 0.1.7 ...,3
23311,TISTUD-8541,"NPE while checking for ""Install Appcelerator S...",*To reproduce:* # Be in Offline mode # Lau...,3


# Split and prepare data for training and testing


In [None]:

combined_df = combined_df.query("description.notna()").get(["storypoint", "description", "title"])

# Define the bin edges and labels
bin_edges = [0, 2, 5, 13, 20, 100]
bin_labels = [1, 2, 3, 4, 5]

# Use pd.cut to classify the numbers into the specified bins and create a new column
combined_df['Class'] = pd.cut(combined_df['storypoint'], bins=bin_edges, labels=bin_labels)

# Display the resulting DataFrame
print(combined_df)

# Add title to start of description for improved context
combined_df['description'] = combined_df['title'] + ' ' + combined_df['description']

# Remove spare title column
combined_df = combined_df.drop(columns=['title'])

# Shuffle dataframe
combined_df = combined_df.sample(frac=1).reset_index(drop=True)

new_combined_df = combined_df.query("Class==1")[:1000].append(combined_df.query("Class==2")[:1000]).append(combined_df.query("Class==3")[:1000]).append(combined_df.query("Class==4")[:1000]).append(combined_df.query("Class==5")[:1000])


x_values = new_combined_df['description'].tolist()
y_values = new_combined_df['Class'].tolist()

# Stratified Shuffle Split ensures that the Train/Test Sets have an event balance of each class

stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)


for i, (train_index, test_index) in enumerate(stratified_split.split(x_values, y_values)):
    train = new_combined_df.iloc[train_index]
    testdf = new_combined_df.iloc[test_index]



       storypoint                                        description  \
6               3  Code Repository: It is expected that we setup ...   
7               3  - Concept within our Wiki how the internal bui...   
8               8  for all of the team mebers who not worked with...   
9               2  Get familar and install Zookeeper (Apache Hado...   
10              3  Create a CXF Interceptor in a way that we coul...   
...           ...                                                ...   
23308           5  Studio uses the moduleAPIVersion to filter the...   
23309           3  h5.Description    When publishing an applicati...   
23310           3  *To reproduce:*  # Keep node version < 0.1.7  ...   
23311           3  *To reproduce:*    # Be in Offline mode  # Lau...   
23312           8  Appcelerator Studio should support installing ...   

                                                   title Class  
6      Common - Setup ESB Runtime Code Repositories (...     2  
7    

  new_combined_df = combined_df.query("Class==1")[:1000].append(combined_df.query("Class==2")[:1000]).append(combined_df.query("Class==3")[:1000]).append(combined_df.query("Class==4")[:1000]).append(combined_df.query("Class==5")[:1000])
  new_combined_df = combined_df.query("Class==1")[:1000].append(combined_df.query("Class==2")[:1000]).append(combined_df.query("Class==3")[:1000]).append(combined_df.query("Class==4")[:1000]).append(combined_df.query("Class==5")[:1000])
  new_combined_df = combined_df.query("Class==1")[:1000].append(combined_df.query("Class==2")[:1000]).append(combined_df.query("Class==3")[:1000]).append(combined_df.query("Class==4")[:1000]).append(combined_df.query("Class==5")[:1000])
  new_combined_df = combined_df.query("Class==1")[:1000].append(combined_df.query("Class==2")[:1000]).append(combined_df.query("Class==3")[:1000]).append(combined_df.query("Class==4")[:1000]).append(combined_df.query("Class==5")[:1000])


NameError: ignored

In [None]:
new_combined_df["Class"].value_counts()

1    1000
2    1000
3    1000
5     737
4     559
Name: Class, dtype: int64

In [None]:
# translator = nlpaug.augmenter.word.back_translation.BackTranslationAug(from_model_name='facebook/wmt19-en-de', to_model_name='facebook/wmt19-de-en', name='BackTranslationAug', device='cpu', batch_size=32, max_length=300, force_reload=False, verbose=0)

In [None]:
# print(train_df.iloc[1][1])
# translator.augment(train_df.iloc[1][1])

# for i in range(0, len(train_df)):


In [None]:
train["Class"].value_counts()


In [None]:
testdf["Class"].value_counts()

# Define Dataloader and tokeniser

In [14]:
class TextClassificationDataset(Dataset):
  """Dataset loader as needed for pytorch. Defines overrides for len, getitem and init of superclass"""
  def __init__(self, texts, labels, tokenizer, max_length):
          super().__init__()
          self.texts = texts
          self.labels = np.array(list(map(lambda x: [1,2,3,4,5].index(x), list(labels)))) #Converts class numbers into index labels
          self.tokenizer = tokenizer
          self.max_length = max_length
  def __len__(self):
      return len(self.texts)
  def __getitem__(self, idx):
      text = self.texts[idx]
      label = tensor(self.labels[idx]).to(device)
      encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
      encoding = encoding.to(device)
      return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': label}

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# train, testds = train_test_split(combined_df, test_size=0.2)

print(train["Class"].value_counts())
print(testdf["Class"].value_counts())


1    800
2    800
3    800
5    589
4    447
Name: Class, dtype: int64
1    200
2    200
3    200
5    148
4    112
Name: Class, dtype: int64


# Define test function

In [15]:
test_dataset = TextClassificationDataset(testdf['description'].values, testdf['Class'].values, tokenizer, 256)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
def test():
  model.eval()
  predictions = []
  actual_labels = []
  with no_grad():
      for i, batch in enumerate(test_dataloader):
          input_ids = batch['input_ids'].to(device)
          attention_mask = batch['attention_mask'].to(device)
          labels = batch['label'].to(device)
          outputs = model(input_ids=input_ids, attention_mask=attention_mask)
          _, preds = max(outputs, dim=1)
          predictions.extend(preds.cpu().tolist())
          actual_labels.extend(labels.cpu().tolist())

  print(classification_report(actual_labels, predictions))

# Training loop

In [16]:
from sklearn.metrics import classification_report

train_dataset = TextClassificationDataset(train['description'].values, train['Class'].values, tokenizer, 256)
data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)
# loss = 0
for epoch in range(num_epochs):
  model.train()
  epoch_loss = 0

  for i, batch in enumerate(data_loader):
      optimizer.zero_grad()
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['label'].to(device)
      outputs = model(input_ids=input_ids, attention_mask=attention_mask)
      loss = nn.CrossEntropyLoss()(outputs, labels)
      loss.backward()
      epoch_loss += loss.item()
      optimizer.step()
      scheduler.step()
  print("loss: ",epoch_loss/len(train))
  test()



loss:  1.4824313574059065


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.50      0.46      0.48       200
           1       0.32      0.55      0.40       200
           2       0.29      0.18      0.22       200
           3       0.00      0.00      0.00       112
           4       0.47      0.67      0.55       148

    accuracy                           0.39       860
   macro avg       0.32      0.37      0.33       860
weighted avg       0.34      0.39      0.35       860

loss:  1.2438742853874385
              precision    recall  f1-score   support

           0       0.52      0.61      0.56       200
           1       0.35      0.40      0.37       200
           2       0.33      0.43      0.38       200
           3       0.00      0.00      0.00       112
           4       0.63      0.57      0.60       148

    accuracy                           0.43       860
   macro avg       0.37      0.40      0.38       860
weighted avg       0.39      0.43      0.41       8

KeyboardInterrupt: ignored

In [None]:
report = classification_report(actual_labels, predictions)
print(report)
print(true_labels)
print(predictions)