In [None]:
!pip install transformers
!pip install -U tree-hugger PyYAML

In [None]:
# Use this command to build the necessary processing library (tree-hugger related)
!create_libs -c python

In [None]:
# We first download two companion files: utils.py, which holds some useful helper functions, and model.py, which holds the main model architecture code
!wget https://raw.githubusercontent.com/autosoft-dev/ml-on-code/main/assets/model.py
!wget https://raw.githubusercontent.com/autosoft-dev/ml-on-code/main/assets/utils.py

In [None]:
# Import the necessary modules
import os
import json
import torch
import torch.nn as nn
from model import Seq2Seq
from utils import Example, convert_examples_to_features
from transformers import RobertaConfig, RobertaModel, RobertaTokenizer
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

In [None]:
# Import the Auto* helpers, then load the pre-trained checkpoints used below
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM, RobertaForMaskedLM, PreTrainedModel, PreTrainedTokenizer

# Teacher: the "microsoft/codebert-base" checkpoint and its tokenizer
teacher_tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
teacher_model = AutoModel.from_pretrained("microsoft/codebert-base")

# Reference student: the "huggingface/CodeBERTa-small-v1" checkpoint (loaded through
# AutoModelForMaskedLM) and its tokenizer
student_tokenizer = AutoTokenizer.from_pretrained("huggingface/CodeBERTa-small-v1")
student_model = AutoModelForMaskedLM.from_pretrained("huggingface/CodeBERTa-small-v1")



In [None]:
from transformers.models.roberta.modeling_roberta import RobertaEncoder, RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel, RobertaConfig, RobertaModel, RobertaEncoder
from torch.nn import Module

def distill_roberta_weights(
    teacher : Module,
    student : Module,
) -> None:
    """
    Copy weights from (teacher) into (student), walking both module trees in parallel.

    Intended to be called first on a RobertaModel / RobertaFor... instance; it then
    calls itself on each pair of children.

    - Whole models (RobertaModel or any class whose name starts with 'RobertaFor'):
      the children of teacher and student are paired up with zip() and processed recursively.
    - Encoders (RobertaEncoder): student layer i receives the weights of teacher layer 2*i,
      i.e. every second teacher layer starting with layer 0.
    - Anything else (embeddings, heads, ...): the full state_dict is copied.

    Nothing is returned; the student is modified in place.
    """
    is_whole_model = isinstance(teacher, RobertaModel) or type(teacher).__name__.startswith('RobertaFor')
    if is_whole_model:
        # Unpack the model and handle each pair of sub-modules on its own
        for teacher_child, student_child in zip(teacher.children(), student.children()):
            distill_roberta_weights(teacher_child, student_child)
        return

    if isinstance(teacher, RobertaEncoder):
        # The first child of each encoder is iterated as its stack of layers
        # (assumes that child is the layer list -- TODO confirm against the transformers version in use)
        teacher_layers = list(next(teacher.children()))
        student_layers = list(next(student.children()))
        for index, student_layer in enumerate(student_layers):
            # Student layer `index` takes the weights of teacher layer `2 * index`
            student_layer.load_state_dict(teacher_layers[2 * index].state_dict())
        return

    # A head or some other leaf part: copy its state_dict wholesale
    student.load_state_dict(teacher.state_dict())

In [None]:
def distill_roberta(
    teacher_model : RobertaPreTrainedModel,
) -> RobertaPreTrainedModel:
    """
    Build a half-depth student from a RoBERTa (teacher_model), in the spirit of DistilBERT.

    The student shares the teacher's configuration except for `num_hidden_layers`,
    which is floor-divided by 2. The student is instantiated with the same class as the
    teacher, and its weights are then filled in by `distill_roberta_weights`
    (one teacher layer out of two, starting with layer 0; the other parts are copied).

    Returns the newly created student model; the teacher is left as it was.
    """
    # Work on a plain-dict copy of the teacher configuration
    student_settings = teacher_model.config.to_dict()
    # Halve the depth (integer division, so an odd layer count rounds down)
    student_settings['num_hidden_layers'] = student_settings['num_hidden_layers'] // 2
    student_config = RobertaConfig.from_dict(student_settings)
    # Same model class as the teacher, built from the reduced configuration
    student = type(teacher_model)(student_config)
    # Fill the student's weights from the teacher
    distill_roberta_weights(teacher=teacher_model, student=student)
    return student

In [None]:
# Sanity check: display the teacher's configuration
teacher_model.config

RobertaConfig {
  "_name_or_path": "microsoft/codebert-base",
  "architectures": [
    "RobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

In [None]:
# Sanity check: display the configuration of the pre-trained CodeBERTa student for comparison
student_model.config

RobertaConfig {
  "_name_or_path": "huggingface/CodeBERTa-small-v1",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 52000
}

In [None]:
# Create a half-depth student from the teacher via distill_roberta, then display its configuration
my_student = distill_roberta(teacher_model)
my_student.config

RobertaConfig {
  "_name_or_path": "microsoft/codebert-base",
  "architectures": [
    "RobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

In [None]:
# Download the checkpoint (pytorch_model.bin) that build_teacher_model and build_student_model load with torch.load
!wget https://code-summary.s3.amazonaws.com/pytorch_model.bin

In [None]:
# -nc (no-clobber): skip the download when pytorch_model.bin is already present,
# instead of fetching it again and saving a duplicate pytorch_model.bin.1
!wget -nc https://code-summary.s3.amazonaws.com/pytorch_model.bin

In [None]:
# We are defining all the needed functions here. 
def inference(data, model, tokenizer, device='cpu'):
    """Run the model over a dataset and decode the generated token ids to text.

    The whole dataset is fed as a single batch (batch_size == len(data)), in order.

    Args:
        data: Dataset whose items are (source_ids, source_mask) tensor pairs,
            e.g. the TensorDataset returned by get_features.
        model: Object with an ``eval()`` method that is called as
            ``model(source_ids=..., source_mask=...)`` and returns an iterable of
            predictions; the first entry of each prediction (``pred[0]``) is decoded.
            NOTE(review): presumably pred[0] is the top beam -- confirm against model.py.
        tokenizer: Object with ``decode(ids, clean_up_tokenization_spaces=False)``.
        device: Device the input tensors are moved to before the forward pass
            (defaults to 'cpu', the previous hard-coded value). The caller is
            responsible for placing ``model`` on the same device.

    Returns:
        A tuple ``(texts, length)`` where ``texts`` is the list of decoded strings
        (one per prediction) and ``length`` is the size of the last dimension of
        the source id tensor.

    Raises:
        ValueError: if ``data`` is empty.
    """
    # Fail early with a clear message; DataLoader would otherwise reject batch_size=0
    if len(data) == 0:
        raise ValueError("inference() needs at least one example, got an empty dataset")

    eval_sampler = SequentialSampler(data)
    eval_dataloader = DataLoader(data, sampler=eval_sampler, batch_size=len(data))

    model.eval()
    texts = []
    sequence_length = 0
    for batch in eval_dataloader:
        source_ids, source_mask = (t.to(device) for t in batch)
        # Remember the input width explicitly instead of relying on the loop variable afterwards
        sequence_length = source_ids.shape[-1]
        with torch.no_grad():
            preds = model(source_ids=source_ids, source_mask=source_mask)
            for pred in preds:
                token_ids = list(pred[0].cpu().numpy())
                # Cut the sequence at the first id 0, if there is one
                # (presumably the filler written after the end of the generated text -- TODO confirm)
                if 0 in token_ids:
                    token_ids = token_ids[: token_ids.index(0)]
                texts.append(tokenizer.decode(token_ids, clean_up_tokenization_spaces=False))
    return (texts, sequence_length)


def get_features(examples, tokenizer):
    """Turn examples into a TensorDataset of (source ids, source mask) tensors.

    The examples go through ``convert_examples_to_features`` with ``stage="test"``;
    each feature's ``source_ids`` and ``source_mask`` are cut to at most 256 entries
    and gathered into two ``torch.long`` tensors.
    """
    def _stack_truncated(rows):
        # Keep at most the first 256 entries of every row
        return torch.tensor([row[: 256] for row in rows], dtype=torch.long)

    features = convert_examples_to_features(examples, tokenizer, stage="test")
    ids_tensor = _stack_truncated(f.source_ids for f in features)
    mask_tensor = _stack_truncated(f.source_mask for f in features)
    return TensorDataset(ids_tensor, mask_tensor)


def build_teacher_model(
    model_class,
    config,
    tokenizer,
    checkpoint_path="pytorch_model.bin",
    num_decoder_layers=6,
    beam_size=10,
    max_length=128,
):
    """Assemble the Seq2Seq model and load its weights from a checkpoint file.

    Args:
        model_class: Called as ``model_class(config=config)`` to create the encoder.
        config: Configuration object. ``hidden_size`` and ``num_attention_heads``
            size the decoder layers; the object is also passed on to ``Seq2Seq``.
        tokenizer: Provides ``cls_token_id`` (passed as ``sos_id``) and
            ``sep_token_id`` (passed as ``eos_id``).
        checkpoint_path: File handed to ``torch.load``. Defaults to the previously
            hard-coded "pytorch_model.bin" in the working directory.
        num_decoder_layers: Number of layers in the ``nn.TransformerDecoder`` (default 6).
        beam_size: Forwarded to ``Seq2Seq`` (default 10).
        max_length: Forwarded to ``Seq2Seq`` (default 128).

    Returns:
        The ``Seq2Seq`` model after ``load_state_dict`` has been applied.
    """
    encoder = model_class(config=config)
    decoder_layer = nn.TransformerDecoderLayer(
        d_model=config.hidden_size, nhead=config.num_attention_heads
    )
    decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)
    model = Seq2Seq(
        encoder=encoder,
        decoder=decoder,
        config=config,
        beam_size=beam_size,
        max_length=max_length,
        sos_id=tokenizer.cls_token_id,
        eos_id=tokenizer.sep_token_id,
    )

    # NOTE(review): torch.load unpickles the file, so only use checkpoints from a trusted source.
    # map_location keeps every loaded tensor on the CPU.
    state_dict = torch.load(checkpoint_path, map_location=torch.device("cpu"))
    # strict=False: checkpoint keys that do not match the model (and model keys missing
    # from the checkpoint) are skipped instead of raising an error.
    model.load_state_dict(state_dict, strict=False)
    return model

In [None]:
# Build the teacher: a Seq2Seq model around a RobertaModel encoder created from the teacher's
# configuration, with weights loaded from pytorch_model.bin inside build_teacher_model
teacher = build_teacher_model(
    model_class=RobertaModel, config=teacher_model.config, tokenizer=teacher_tokenizer
).to('cpu')

In [None]:
# Teacher check: run inference on a single hand-written function and print the generated text
example = [Example(source="def add_tensors(t, t1) -> Any:\n    return t + t1", target=None)]
message, length = inference(get_features(example, teacher_tokenizer), teacher, teacher_tokenizer)
print(message)

In [None]:
# Define the builder for the student model (same construction steps as build_teacher_model)
def build_student_model(model_class, config, tokenizer):
    """Assemble the student Seq2Seq model and load weights from "pytorch_model.bin".

    The encoder is freshly created with ``model_class(config=config)``; the decoder is
    a 6-layer ``nn.TransformerDecoder`` sized from ``config.hidden_size`` and
    ``config.num_attention_heads``. The tokenizer's ``cls_token_id`` / ``sep_token_id``
    are passed to ``Seq2Seq`` as ``sos_id`` / ``eos_id``.

    The checkpoint is loaded with ``strict=False``, so keys that do not match between
    the checkpoint and this model are skipped rather than raising.

    Returns the resulting ``Seq2Seq`` model.
    """
    student_encoder = model_class(config=config)
    single_decoder_layer = nn.TransformerDecoderLayer(
        d_model=config.hidden_size,
        nhead=config.num_attention_heads,
    )
    student_decoder = nn.TransformerDecoder(single_decoder_layer, num_layers=6)

    seq2seq_settings = dict(
        encoder=student_encoder,
        decoder=student_decoder,
        config=config,
        beam_size=10,
        max_length=128,
        sos_id=tokenizer.cls_token_id,
        eos_id=tokenizer.sep_token_id,
    )
    student = Seq2Seq(**seq2seq_settings)

    # Read the checkpoint onto the CPU, then apply it non-strictly
    cpu = torch.device("cpu")
    checkpoint_state = torch.load("pytorch_model.bin", map_location=cpu)
    student.load_state_dict(checkpoint_state, strict=False)
    return student

In [None]:
# Build the student Seq2Seq model from the half-depth configuration.
# NOTE(review): only my_student.config is passed here, so the weights that distill_roberta
# copied into my_student are not carried into `student`; its parameters come from fresh
# initialization plus whatever build_student_model loads from pytorch_model.bin -- confirm this is intended.
student = build_student_model(
    model_class=RobertaModel, config=my_student.config, tokenizer=teacher_tokenizer
).to('cpu')

In [None]:
# For ease of the tutorial we have created a small GitHub example repo with a collection of files.
# Some of them come from open-source repos and some we created as example files.
!git clone https://github.com/autosoft-dev/example-files.git

In [None]:
# We are going to declare a small function that will help us go over each file in a nested directory tree
# (like the one we cloned above) and get one file at a time.
from pathlib import Path

def check_out_path(target_path: Path):
    """
    Recursively yield ``target_path`` and everything found beneath it.

    ``target_path`` itself is yielded first, exactly as given. If it is a directory,
    each sub-directory is then walked the same way (so it is yielded as well, followed
    by its contents), and every non-directory entry is yielded as an absolute path.
    If ``target_path`` is a regular file, it is the only item yielded.

    Entries come in whatever order ``Path.iterdir()`` returns them.
    """
    yield target_path
    # A regular file has no children; stop here instead of failing in iterdir()
    if target_path.is_file():
        return
    for entry in target_path.iterdir():
        if entry.is_dir():
            yield from check_out_path(entry)
        else:
            yield entry.absolute()


def is_python_file(file_path: Path):
  """
  Filter helper: True only when the path is an existing regular file whose suffix is ".py"
  """
  if not file_path.is_file():
    return False
  return file_path.suffix == ".py"

In [None]:
# We are now ready to use tree-hugger to parse all the needed files.
# We first import the PythonParser class; the parser object itself is created in the next cell.
from tree_hugger.core import PythonParser

In [None]:
# Create the parser from the compiled language library
# (the path is hard-coded to /content/my-languages.so)
pp = PythonParser(library_loc="/content/my-languages.so")

# Let's use the function we defined before to go over all the files.
for file_path in check_out_path(Path("example-files")):
  if is_python_file(file_path):
    # We use a one-line, super convenient tree-hugger API call to get the needed data
    if pp.parse_file(str(file_path)):
      # NOTE(review): temp_cache is assigned here but never read in this cell
      temp_cache = []
      # The following call returns a dict where each key is the name of a function
      # and each value is a tuple, (function_body, function_docstring)
      func_and_docstr = pp.get_all_function_bodies(strip_docstr=True)
      for func_name, (body, docstr) in func_and_docstr.items():
        # One inference call per function: wrap the body in a single-item example list
        example = [Example(source=body, target=None)]
        # The student model is used together with the teacher's tokenizer
        message, length = inference(get_features(example, teacher_tokenizer), student, teacher_tokenizer)
        print(func_name, " ".join(message))
      # TODO: add the result to the final output (nothing is collected yet; results are only printed)

  prevK = bestScoresId // numWords


add Returns a list of strings representing the inputted value .
check_even_numbers_in_a_list Returns a list of strings representing the inputted value .
open_file Returns a list of strings representing the current document .


KeyboardInterrupt: ignored