In [1]:
pip install simpletransformers==0.64.3

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from functools import partial
import sklearn 
import torch
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import logging
import datetime
pd.set_option('display.max_colwidth', None)
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
torch.multiprocessing.set_sharing_strategy('file_system')
cuda_available = torch.cuda.is_available()
print("Is cuda available?", cuda_available)

Is cuda available? False


In [3]:
import torch.multiprocessing
torch.multiprocessing.set_start_method('spawn', force=True)

In [4]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [5]:
train = pd.read_csv('data/train_clean_concat_200_example.csv')
test = pd.read_csv('data/test_clean_concat_200_example.csv')
print('Number of issues: ', train.shape, test.shape)

Number of issues:  (2500, 2) (500, 2)


In [6]:
train.head()

Unnamed: 0,text,labels
0,time 2021-05-26T20:57:00Z author OWNER repo 0-Yama/Projet-Final-Python title the beginning body we need to add the first file of our project,1
1,time 2020-07-26T20:14:04Z author OWNER repo 0-vortex/dotfiles title better touch tool configuration body export better touch tool configuration as a separate package or submodule,1
2,time 2020-12-04T09:33:18Z author NONE repo 00-Evan/shattered-pixel-dungeon title request text when wand of regrowth starts producing furrowed grass body i was super confused when this happened. a bit of text would have helped me realize what was going on a lot sooner. the wand is getting old and the new grass is not as healthy. ?,1
3,time 2020-09-22T01:49:32Z author OWNER repo 003MattB/ScheduleBuilderImproved title update total credits when cards are removed body total credits is only updated when adding new courses but is never updated when cards are removed,0
4,time 2020-09-22T01:52:14Z author OWNER repo 003MattB/ScheduleBuilderImproved title delete course from matrix body there is no way to remove courses from the matrix once they have been added. don't forget to update the total credits see issue,1


In [7]:
test.head()

Unnamed: 0,text,labels
0,time 2020-04-07T09:08:50Z author NONE repo tlnagy/TIFF.jl title error keyerror key tiff.sampleformat_int 0x0008 not found body one more error might need to be caught. 4d series.ome.tif is sample file from ome tiff website https docs.openmicroscopy.org ome model 6.0.0 ome tiff data.html . julia julia tiff.load 4d series.ome.tif error keyerror key tiff.sampleformat_int 0x0008 not found stacktrace 1 getindex dict tuple tiff.sampleformats int64 datatype tuple tiff.sampleformats uint16 at . dict.jl 477 2 output tiff.ifd uint32 at home hf .julia dev tiff src ifds.jl 113 3 load string at home hf .julia dev tiff src load.jl 14 4 top level scope at repl 2 1,1
1,time 2020-11-27T07:17:21Z author OWNER repo tisboyo/Twitch_Bot title add database backup to dropbox body nan,1
2,time 2021-01-02T19:35:34Z author OWNER repo DrWhoCares/imgdanke title add a button method to open the source or output folders body could also add a method to open up path to each file in the file list by right clicking. also an option to open up the path to the imgdanke.exe.,1
3,time 2021-01-02T20:55:34Z author OWNER repo DrWhoCares/imgdanke title processes are being started twice body at some point i refactored a few things and ended up leaving in an additional call to function that should've just been replaced by the using process process function call.,0
4,time 2020-12-29T15:34:35Z author OWNER repo Bean-1/AOT title cannot add hp to wall body nan,0


In [8]:
lr = 3e-5
drp = 0
epochs = 4
batch_t = 100
batch_e = 100
max_seq = 200
name = 'roberta'
ver = 'roberta-base'
output_name = 'outputs/' + name

def create_model(name, ver, lr, drp, epochs, batch_t, batch_e, max_seq):
    model_args = ClassificationArgs()
    model_name = name
    model_version = ver
    model_args.learning_rate = lr
    model_args.num_train_epochs = epochs
    model_args.eval_batch_size = batch_t
    model_args.train_batch_size = batch_e
    model_args.max_seq_length = max_seq
    model_args.n_gpu = 2
    # model_args.n_gpu = 1
    model_args.output_dir = output_name +'/'
    model_args.overwrite_output_dir = True
    model_args.reprocess_input_data = True
    model_args.preprocess_inputs = True
    model_args.save_steps = -1
    model_args.save_model_every_epoch = False
    
    model = ClassificationModel(model_name, model_version, args = model_args, 
                                num_labels = 3, 
                                use_cuda=cuda_available)
 
    return model

In [9]:
def calc(p1, p2, func, **kwargs):
    return func(p1, p2, **kwargs)

metrics_recom = {
    "accuracy": partial(calc,func=sklearn.metrics.accuracy_score) ,
    "p_micro": partial(calc,func=sklearn.metrics.precision_score,average='micro'),
    "p_macro": partial(calc,func=sklearn.metrics.precision_score,average='macro'),
    "p_w": partial(calc,func=sklearn.metrics.precision_score,average='weighted'),
    "r_micro": partial(calc,func=sklearn.metrics.recall_score,average='micro'),
    "r_macro": partial(calc,func=sklearn.metrics.recall_score,average='macro'),
    "r_w": partial(calc,func=sklearn.metrics.recall_score,average='weighted'),     
    "f_micro": partial(calc,func=sklearn.metrics.f1_score,average='micro'),
    "f_macro": partial(calc,func=sklearn.metrics.f1_score,average='macro'),
    "f_w": partial(calc,func=sklearn.metrics.f1_score,average='weighted'),
    "classificationReport": partial(calc,func=sklearn.metrics.classification_report, output_dict=True)
}

In [10]:
model = create_model(name, ver, lr, drp, epochs, batch_t, batch_e, max_seq)
start = datetime.datetime.now()
print('-'*5,  name, ', start time:', 
datetime.datetime.strftime(datetime.datetime.today(), '%d/%m/%Y-%H:%M'), '-'*5)
model.train_model(train_df=train, **metrics_recom)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


----- roberta , start time: 05/05/2024-19:07 -----


  0%|          | 0/2500 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_roberta_200_3_2


Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/25 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/25 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/25 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/25 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/roberta/.


(100, 0.6794804319739342)

In [11]:
hours, remainder = divmod((datetime.datetime.now() - start).total_seconds(), 3600)
minutes, seconds = divmod(remainder, 60)
print('-'*5,  name, ', duration is:', '%dh:%dm:%ds' % (hours, minutes, seconds), '-'*5, '\n\n')
results, model_outputs, wrong_pred = model.eval_model(test, verbose=True, **metrics_recom)
results

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


----- roberta , duration is: 0h:27m:6s ----- 




  0%|          | 0/500 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_200_3_2


Running Evaluation:   0%|          | 0/5 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'mcc': 0.7132435817914078, 'accuracy': 0.836, 'p_micro': 0.836, 'p_macro': 0.783562515915457, 'p_w': 0.8369477463712758, 'r_micro': 0.836, 'r_macro': 0.7915103496961905, 'r_w': 0.836, 'f_micro': 0.836, 'f_macro': 0.7873341403475599, 'f_w': 0.836344399809431, 'classificationReport': {'0.0': {'precision': 0.8487394957983193, 'recall': 0.8632478632478633, 'f1-score': 0.8559322033898306, 'support': 234.0}, '1.0': {'precision': 0.8590909090909091, 'recall': 0.8362831858407079, 'f1-score': 0.8475336322869955, 'support': 226.0}, '2.0': {'precision': 0.6428571428571429, 'recall': 0.675, 'f1-score': 0.6585365853658537, 'support': 40.0}, 'accuracy': 0.836, 'macro avg': {'precision': 0.783562515915457, 'recall': 0.7915103496961905, 'f1-score': 0.7873341403475599, 'support': 500.0}, 'weighted avg': {'precision': 0.8369477463712758, 'recall': 0.836, 'f1-score': 0.836344399809431, 'support': 500.0}}, 'eval_loss': 0.45079139471054075}


{'mcc': 0.7132435817914078,
 'accuracy': 0.836,
 'p_micro': 0.836,
 'p_macro': 0.783562515915457,
 'p_w': 0.8369477463712758,
 'r_micro': 0.836,
 'r_macro': 0.7915103496961905,
 'r_w': 0.836,
 'f_micro': 0.836,
 'f_macro': 0.7873341403475599,
 'f_w': 0.836344399809431,
 'classificationReport': {'0.0': {'precision': 0.8487394957983193,
   'recall': 0.8632478632478633,
   'f1-score': 0.8559322033898306,
   'support': 234.0},
  '1.0': {'precision': 0.8590909090909091,
   'recall': 0.8362831858407079,
   'f1-score': 0.8475336322869955,
   'support': 226.0},
  '2.0': {'precision': 0.6428571428571429,
   'recall': 0.675,
   'f1-score': 0.6585365853658537,
   'support': 40.0},
  'accuracy': 0.836,
  'macro avg': {'precision': 0.783562515915457,
   'recall': 0.7915103496961905,
   'f1-score': 0.7873341403475599,
   'support': 500.0},
  'weighted avg': {'precision': 0.8369477463712758,
   'recall': 0.836,
   'f1-score': 0.836344399809431,
   'support': 500.0}},
 'eval_loss': 0.45079139471054075

In [12]:
pip install simpletransformers

Note: you may need to restart the kernel to use updated packages.


In [13]:
print(simpletransformers.__version__)

NameError: name 'simpletransformers' is not defined