# Hugging Face

* Train a model using Hugging Face

In [69]:
import datetime
import functools
import io
import logging
import os
import sys

import numpy as np
import torch
from google.cloud import storage
from sklearn import model_selection

from code_intelligence import gcs_util

In [None]:
# Project-local helper module. Presumably prepares the notebook environment
# (logging, paths, credentials) — TODO confirm against notebook_setup.py.
import notebook_setup

notebook_setup.setup()

# Set the Python Path

In [14]:
# Resolve the "py" directory located two levels above the current working
# directory and append it to the import path unless it is already there.
py_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir, "py"))

if py_dir not in sys.path:
    logging.info(f"Adding {py_dir} to path")
    sys.path.append(py_dir)

In [3]:
# class Issues:
#     def __init__(self):
#         # The path to the GCS file containing the text
#         self.text_file = ""
#         # List of labels to apply to this file.
#         self.labels = []
        
class GCSDataset(torch.utils.data.Dataset):
    """Dataset pairing pre-computed encodings with their labels.

    NOTE(review): despite the class name, nothing in this class reads from
    GCS; items are served from the in-memory mapping given to the constructor.

    Args:
        input_files: Mapping from feature name (for example ``input_ids``) to
            a sequence holding one entry per example. The parameter keeps its
            original name so existing callers continue to work.
        labels: Sequence with one label per example, aligned with the entries
            of ``input_files``.
    """

    def __init__(self, input_files, labels):
        # ``input_files`` is the encodings mapping consumed by __getitem__.
        self.encodings = input_files
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features and the label of example ``idx`` as tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

## Load the Data

In [21]:
# CSV file listing the data files together with their labels.
# This file is produced by the notebook automl.ipynb
files_list = "gs://issue-label-bot-dev_automl/automl_TCN8830229559715561472/dataset_201103_151334.csv"

# Split the gs:// URI into the bucket name and the object path inside it.
bucket_name, obj_path = gcs_util.split_gcs_uri(files_list)

In [20]:
# Google Cloud Storage client, created with no explicit arguments.
# NOTE(review): presumably relies on the environment's default credentials and project — confirm.
storage_client = storage.Client()

In [24]:
# Handle to the bucket named in the CSV listing's URI.
b = storage_client.bucket(bucket_name)

In [25]:
# Handle to the CSV listing object inside that bucket.
o = b.blob(obj_path)

In [30]:
# Download the CSV listing into memory.
# NOTE(review): download_as_string appears to be a deprecated alias of
# download_as_bytes in recent google-cloud-storage releases — consider switching.
csv_contents = o.download_as_string()

In [32]:
import pandas as pd

In [40]:
# Column names are supplied explicitly (presumably because the CSV has no
# header row — confirm). The first column is not used. The raw, comma
# separated label string is called "labels_raw" because that is the column
# the label-parsing cell reads before it writes the parsed lists to "labels".
files = pd.read_csv(io.BytesIO(csv_contents), names=["garbage", "file", "labels_raw"])

In [57]:
# Placeholder column; the download loop stores each row's file contents here.
files["text"] = ""

### Load the actual file contents

In [74]:
# Number of rows between two progress reports: one report for every
# `percent_interval` percent of the rows.
percent_interval = 1
# Clamp to an integer of at least 1 so the modulo test in the download loop
# never divides by zero when the listing has fewer than 100 rows.
progress = max(1, int(files.shape[0] * percent_interval // 100))

In [72]:
# TODO(jlewi): Reading from GCS may not be efficient. The GCS data is obtained in automl.ipynb by creating a dataframe from BigQuery
# It might be faster just to read the data directly from BigQuery and if necessary store the dataframe as an hdf5 file
start = datetime.datetime.now()
total = files.shape[0]

# The positional counter doubles as the row label passed to `.at`, which
# relies on the default 0..n-1 index that read_csv produces.
for index, file_uri in enumerate(files["file"]):
    if index % progress == 0:
        percent = index / total * 100
        elapsed = datetime.datetime.now() - start
        print(f"Percent done {percent}; Processing {index+1} of {total}; elapsed {elapsed}")
    # Fetch the file named in this row, not the CSV listing itself. Separate
    # names are used so the handles to the CSV listing are not overwritten.
    file_bucket_name, file_obj_path = gcs_util.split_gcs_uri(file_uri)
    file_blob = storage_client.bucket(file_bucket_name).blob(file_obj_path)
    # NOTE(review): the download returns bytes; decode here if later steps
    # expect str in the "text" column.
    files.at[index, "text"] = file_blob.download_as_string()

Percent done 0.0; Processing 1 of 9890


In [83]:
def split_labels(x):
    """Split a comma separated label string into a list of labels.

    Args:
        x: Raw label value. Strings are split on commas; any other value
            (for example the float NaN pandas uses for a missing cell, or
            None) is treated as "no labels".

    Returns:
        A list of labels with surrounding whitespace removed. Empty pieces,
        such as those produced by a trailing or doubled comma, are dropped.
    """
    if not isinstance(x, str):
        return []

    stripped = (piece.strip() for piece in x.split(","))
    return [label for label in stripped if label]

In [113]:
# Parse each row's raw label string into a list of labels: reads the
# "labels_raw" column and stores the result in the "labels" column.
files["labels"] = files["labels_raw"].apply(split_labels)

In [86]:
# Split the data into train and validation sets (20% held out).
# NOTE(review): no random_state is passed, so the split presumably differs from run to run.
train_files, val_files = model_selection.train_test_split(files, test_size=.2)

In [128]:
# Generate a sorted, unique array of every label that occurs in the data.
# The labels are collected in a set first so the result is well defined even
# when there are no rows, a single row, or rows without any labels.
unique_labels = {label for labels in files["labels"] for label in labels}
target_labels = np.array(sorted(unique_labels), dtype=str)

## Train a model 

* Use [simpletransformers](https://github.com/ThilinaRajapakse/simpletransformers/tree/master/simpletransformers/classification), which is built on top of Hugging Face
* You can check out this [example](https://github.com/huggingface/transformers/blob/master/examples/text-classification/run_glue.py) which just uses HuggingFace
* This code is based on the vscode [issue label model](https://github.com/microsoft/vscode-github-triage-actions/blob/master/classifier-deep/train/vm-filesystem/classifier/generateModels.py)

In [49]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
import pandas as pd
import logging
import json
import os



In [50]:
# TODO(jlewi): This was copied code from vscode. What do we want to do with the logging module?
# Cap the "transformers" logger at WARNING, then configure the root logger at INFO.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)

In [2]:
# Prefix used for every output directory of this run.
category = "area"

# Arguments handed to the simpletransformers classification model.
model_args = ClassificationArgs(
    # Output locations.
    output_dir=category + "_model",
    best_model_dir=category + "_model_best",
    overwrite_output_dir=True,
    # Checkpoint saving is switched off.
    save_model_every_epoch=False,
    save_eval_checkpoints=False,
    # Batch sizes, sequence length and number of epochs.
    max_seq_length=256,
    num_train_epochs=2,
    train_batch_size=16,
    eval_batch_size=32,
)

def f1_multiclass(labels, preds):
    """Return the micro-averaged F1 score of ``preds`` against ``labels``."""
    micro_f1 = f1_score(labels, preds, average="micro")
    return micro_f1

# Create a ClassificationModel
# TODO(jlewi): bert finetuned isn't found.
# So I used what I found in https://github.com/ThilinaRajapakse/simpletransformers/blob/master/examples/text_classification/multilabel_classification.py
# "bert", "finetuned",
model = ClassificationModel(
    "roberta",
    "roberta-base",
    # One output per distinct label found in the data.
    num_labels=len(target_labels),
    args=model_args,
    # CUDA is explicitly disabled.
    use_cuda=False,
)

# Train the model
# Uses the frames produced by the train/validation split; the names
# train_df / test_df are not defined anywhere in this notebook.
# NOTE(review): simpletransformers presumably expects "text" and "labels"
# columns in these frames — confirm they are in that shape before training.
model.train_model(
    train_files, eval_df=val_files, output_dir=category + "_model/checkpoints",
)

# Evaluate the model on the held-out frame, reporting micro F1 and accuracy
# in addition to the library's default metrics.
result, model_outputs, wrong_predictions = model.eval_model(
    val_files,
    output_dir=category + "_model/eval",
    f1=f1_multiclass,
    acc=accuracy_score,
)

# Write the label names next to the model output. The labels are converted to
# plain str first because target_labels may be a numpy array, which the json
# module cannot serialize directly.
with open(os.path.join(category + "_model", "target_names.json"), "w") as f:
    json.dump([str(name) for name in target_labels], f)

NameError: name 'ClassificationArgs' is not defined

In [3]:
# Display the dataframe for inspection.
files

NameError: name 'files' is not defined