In [1]:
import argparse
import os
import pandas as pd
import numpy as np
from transformers import BertTokenizer,BertForSequenceClassification,Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from sklearn import preprocessing
import torch.nn as nn
from sklearn.utils import compute_class_weight
from sklearn.metrics import f1_score, classification_report
import pickle


In [2]:
# Disable Weights & Biases via its environment switch before any Trainer is created.
os.environ["WANDB_MODE"] = "disabled"

In [3]:
# Pick the compute device once; later cells move the model and tensors onto `device`.
if not torch.cuda.is_available():
    # CPU fallback.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Tell PyTorch to use the GPU and report which one.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    

No GPU available, using the CPU instead.


In [4]:
# Shared label encoder used to map material categories to integers.
# `preprocess` fits it on the label column, and the model-loading code reads
# `le.classes_` to size the classification head, so `preprocess` must run first.
le = preprocessing.LabelEncoder()


## Function and class definitions


In [5]:
class MulticlassDataset(Dataset):
    """Dataset pairing per-sample encodings with their class labels.

    `encodings` is a mapping from feature name to an indexable sequence holding
    one entry per sample; `labels` is an indexable sequence of the same length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # Gather the idx-th entry of every feature, then attach the label
        # under the 'labels' key.
        sample = {}
        for feature_name, feature_values in self.encodings.items():
            sample[feature_name] = feature_values[idx]
        sample['labels'] = self.labels[idx]
        return sample
        

In [6]:
def preprocess(dataframe, selected_material_type=None):
  """Encode the material labels as integers and split the data 60/20/20.

  Args:
    dataframe: frame with a string `description_material` label column. It is
      copied, never modified.
    selected_material_type: top-level material type to keep. Labels whose prefix
      before the first "_" differs from it are collapsed into the single label
      "None". Passing None, "" or the string "None" keeps every label unchanged
      (the flattened label version).

  Returns:
    (train_df, dev_df, test_df): shuffled (random_state=42) 60% / 20% / 20%
    slices of the copy, with `description_material` replaced by integer codes.

  Side effects:
    Refits the module-level encoder `le` and prints the number of labels.
  """
  new_df = dataframe.copy()

  # convert the dataframe labels accordingly by the material type
  if selected_material_type not in (None, "", "None"):
    # Vectorised relabelling. Assigning to the rows yielded by `iterrows()`
    # writes to copies and is not guaranteed to change the frame.
    top_level_type = new_df['description_material'].str.split("_").str[0]
    new_df.loc[top_level_type != selected_material_type, 'description_material'] = "None"

  # convert labels into integers
  le.fit(new_df.description_material)
  new_df['description_material'] = le.transform(new_df.description_material)
  print(" number of labels: ", len(le.classes_))

  # split data to training df, val df, test df; positional slicing of the
  # shuffled frame avoids relying on how np.split handles DataFrames
  shuffled = new_df.sample(frac=1, random_state=42)
  train_end = int(.6*len(new_df))
  dev_end = int(.8*len(new_df))
  train_df = shuffled.iloc[:train_end]
  dev_df = shuffled.iloc[train_end:dev_end]
  test_df = shuffled.iloc[dev_end:]

  return train_df, dev_df, test_df


def create_dataset(dataframe, tokenizer):
  """Tokenize the description text of every row into a `MulticlassDataset`.

  The text of a row is built from all columns whose name starts with
  "description", excluding the label column `description_material`. Each
  non-empty string cell is followed by " , ". Every text is padded/truncated
  to MAX_LENGTH tokens. Labels are read from `description_material`.
  """
  MAX_LENGTH = 64
  text_columns = [
      name for name in dataframe.columns.values
      if name != 'description_material' and name.startswith("description")
  ]

  def create_concatenated_text(frame):
    """combine the columns text to create a single sentence per row"""
    texts = []
    for _, record in frame.iterrows():
      # Only non-empty, genuine `str` cells contribute; each is followed by " , ".
      texts.append("".join(
          value + " , "
          for value in (record[col] for col in text_columns)
          if value != "" and type(value) == str
      ))
    return texts

  input_ids = []
  attention_masks = []
  for text in create_concatenated_text(dataframe):
    encoded = tokenizer(text, max_length=MAX_LENGTH, padding='max_length', truncation=True)
    input_ids.append(torch.tensor(encoded["input_ids"]))
    attention_masks.append(torch.tensor(encoded["attention_mask"]))

  encodings = {
      "input_ids": input_ids,
      "attention_mask": attention_masks,
  }
  labels = torch.tensor(dataframe['description_material'].values.tolist())

  return MulticlassDataset(encodings, labels)

def get_class_weights(dataframe):
  """Return 'balanced' class weights for `description_material` as a float tensor on `device`.

  Weights are computed only for the classes that actually occur in `dataframe`.
  """
  label_array = torch.tensor(dataframe['description_material'].values.tolist()).numpy()
  present_classes = np.unique(label_array)
  balanced_weights = compute_class_weight(
      class_weight='balanced',
      classes=present_classes,
      y=label_array,
  )
  return torch.tensor(balanced_weights, dtype=torch.float).to(device)

def create_custom_trainer(class_weights):
  """Create a `Trainer` subclass whose loss is class-weighted cross entropy.

  Args:
    class_weights: 1-D float tensor with one weight per class, or None for an
      unweighted loss.

  Returns:
    The `CustomTrainer` class (not an instance); construct it like `Trainer`.
  """
  class CustomTrainer(Trainer):
      def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
          # `num_items_in_batch` is accepted (and ignored) because newer
          # versions of Trainer pass it to `compute_loss`.
          labels = inputs.get("labels")
          # forward pass
          outputs = model(**inputs)
          logits = outputs.get("logits")
          # compute custom loss; the weights must live on the same device as the logits
          weight = class_weights.to(logits.device) if class_weights is not None else None
          loss_fct = nn.CrossEntropyLoss(weight=weight)
          loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
          return (loss, outputs) if return_outputs else loss
  return CustomTrainer

def train(selected_type, dataframe, tokenizer, batch_size, learning_rate, epochs,train_mode, output_dir):
  """Fine-tune SciBERT for material classification.

  Args:
    selected_type: material type forwarded to `preprocess`.
    dataframe: labelled data containing `description_material`.
    tokenizer: tokenizer forwarded to `create_dataset`.
    batch_size: per-device batch size for training and evaluation.
    learning_rate: learning rate passed to `TrainingArguments`.
    epochs: number of training epochs.
    train_mode: "custom" trains with the class-weighted loss from
      `create_custom_trainer`; any other value uses the stock `Trainer`.
    output_dir: directory for checkpoints and logs.

  Returns:
    The trainer after `train()` has completed. Nothing was returned before, so
    callers that ignore the result are unaffected.
  """
  # The test split is not used during training, so it is not tokenized here.
  train_df, dev_df, _test_df = preprocess(dataframe, selected_type)
  train_dataset = create_dataset(train_df, tokenizer)
  dev_dataset = create_dataset(dev_df, tokenizer)

  #load model
  model = BertForSequenceClassification.from_pretrained(
      "allenai/scibert_scivocab_uncased", num_labels=len(le.classes_))

  # Use the device chosen at the top of the notebook; an unconditional
  # `model.cuda()` fails on machines without a GPU.
  model.to(device)

  training_args = TrainingArguments(
          output_dir= output_dir,     # output directory
          num_train_epochs=epochs,              # total number of training epochs
          per_device_train_batch_size=batch_size,  # batch size per device during training
          per_device_eval_batch_size=batch_size,   # batch size for evaluation
          learning_rate = learning_rate,
          warmup_steps=500,                # number of warmup steps for learning rate scheduler
          weight_decay=0.01,
          load_best_model_at_end=True,
          logging_dir=output_dir,            # directory for storing logs
          logging_steps=10,
          evaluation_strategy = "epoch", #To calculate metrics per epoch
          save_strategy = "epoch"
  )

  if train_mode == "custom":
    # Class weights are only needed for the weighted-loss trainer.
    class_weights = get_class_weights(train_df)
    trainer_cls = create_custom_trainer(class_weights)
  else:
    trainer_cls = Trainer

  trainer = trainer_cls(model = model, args =training_args, train_dataset=train_dataset, eval_dataset=dev_dataset)
  trainer.train()
  return trainer



In [7]:
## Required parameters
nb_epochs = 2       # total number of training epochs
batch_size = 10     # per-device batch size for training and evaluation

# NOTE(review): 0.01 is unusually large for fine-tuning a BERT-style model;
# confirm that this value is intended.
lr_rate = 0.01

# Material type forwarded to `train` as `selected_type`.
material_type = ''

# Whether we account for class imbalance during training by using a custom
# trainer ("custom") or not ("none"). This must be a plain string: a trailing
# comma here would turn it into a tuple, and `train_mode == "custom"` could
# then never be true.
train_mode = "none"

# Output directory where the model checkpoint will be saved
output_dir = 'output'
    

In [8]:
# Read the training data and replace missing values with empty strings,
# then remove rows that do not have a material type.
df = pd.read_csv("iSamplesMaterialTrainingSmall.csv").fillna("")
df = df[df["description_material"] != ""]

In [9]:
  # Load the tokenizer that matches the SciBERT checkpoint used for the model.
# NOTE(review): confirm that `use_fast` has any effect on `BertTokenizer`.
tokenizer = BertTokenizer.from_pretrained(
    'allenai/scibert_scivocab_uncased',
    do_lower_case=True,
    use_fast=True,
)

In [10]:
# Debugging walkthrough: the steps of `train` are executed cell by cell below
# instead of through the single call
#   train(material_type, df, tokenizer, batch_size, lr_rate, nb_epochs, train_mode, output_dir)
# Passing 'None' keeps every label unchanged (no material-type filtering).
(train_df, dev_df, test_df) = preprocess(df, 'None')


 number of labels:  3


In [11]:
# Tokenize the three splits in order: train, dev, test.
train_dataset, dev_dataset, test_dataset = (
    create_dataset(split_df, tokenizer)
    for split_df in (train_df, dev_df, test_df)
)

In [12]:
# Load SciBERT with a classification head sized to the number of classes
# found by the label encoder.
model = BertForSequenceClassification.from_pretrained(
    "allenai/scibert_scivocab_uncased",
    num_labels=len(le.classes_),
)

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

In [13]:
  # Move the model onto the device selected by the GPU check at the top of the notebook.
# `model.cuda()` was replaced by `model.to(device)` so that this cell also runs
# when no GPU is available.
# Note: using the `Trainer` with PyTorch requires `accelerate`
# (run `pip install --upgrade accelerate`).
# TODO: confirm whether that requirement is related to running without a GPU.
desc = model.to(device)

In [14]:
# Training configuration. Values come from the "Required parameters" cell;
# evaluation and checkpoint saving both run once per epoch.
training_args = TrainingArguments(
    output_dir=output_dir,                    # output directory
    logging_dir=output_dir,                   # directory for storing logs
    num_train_epochs=nb_epochs,               # total number of training epochs
    per_device_train_batch_size=batch_size,   # batch size per device during training
    per_device_eval_batch_size=batch_size,    # batch size for evaluation
    learning_rate=lr_rate,
    warmup_steps=500,                         # number of warmup steps for learning rate scheduler
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy="epoch",              # To calculate metrics per epoch
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [15]:
  # Compute balanced class weights from the training split; they feed the weighted loss of the custom trainer.
class_weights = get_class_weights(train_df)


In [17]:
# Build the weighted-loss trainer class, then use it only when `train_mode`
# is exactly "custom"; any other value selects the stock `Trainer`.
CustomTrainer = create_custom_trainer(class_weights)
trainer = (CustomTrainer if train_mode == "custom" else Trainer)(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
)

In [18]:
# Run fine-tuning with the trainer built above. `training_args` sets both the
# evaluation and the save strategy to "epoch".
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,0.335818
2,0.812100,0.248678


TrainOutput(global_step=14, training_loss=0.7254903997693744, metrics={'train_runtime': 2034.3078, 'train_samples_per_second': 0.062, 'train_steps_per_second': 0.007, 'total_flos': 4144036329216.0, 'train_loss': 0.7254903997693744, 'epoch': 2.0})