In [1]:
import torch, os, re, pandas as pd, json
from sklearn.model_selection import train_test_split
from transformers import DataCollatorForLanguageModeling, DataCollatorWithPadding, GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, AutoConfig
from datasets import Dataset
import numpy as np
import random
import csv

# Initialize seeder and randomness
seed = 123
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)

if torch.cuda.is_available():  
    dev = "cuda:0" 
else:  
    dev = "cpu"  
device = torch.device(dev)  
print(device)

cuda:0


In [2]:
model_name_or_path = 'gpt2' # 'microsoft/CodeGPT-small-py' 'gpt2' './model_logs_fromScratch'
base_tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path, do_lower_case = True)
base_model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
base_model = base_model.to(device)

base_model.num_parameters
# (wte): Embedding(50262, 768)
#     (wpe): Embedding(1024, 768)


<bound method ModuleUtilsMixin.num_parameters of GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50259, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0

In [3]:
print('Words in vocabulary: ', base_tokenizer.vocab_size)
vocabulary = base_tokenizer.get_vocab()
print(vocabulary['for'])

example_text = "for i in range(0, 10):"
print(base_tokenizer.tokenize(example_text))

text_ids = base_tokenizer.encode(example_text, return_tensors = 'pt')
print(text_ids)



Words in vocabulary:  50257
1640
['for', 'Ġi', 'Ġin', 'Ġrange', '(', '0', ',', 'Ġ10', '):']
tensor([[1640, 1312,  287, 2837,    7,   15,   11,  838, 2599]])


In [4]:
def generate_n_text_samples(model, tokenizer, input_text, device, n_samples = 5):
    text_ids = tokenizer.encode(input_text, return_tensors = 'pt')
    text_ids = text_ids.to(device)
    model = model.to(device)

    generated_text_samples = model.generate(
        text_ids, 
        max_length= 100,  
        num_return_sequences= n_samples,
        no_repeat_ngram_size= 2,
        repetition_penalty= 1.5,
        top_p= 0.92,
        temperature= .85,
        do_sample= True,
        top_k= 125,
        early_stopping= True
    )
    gen_text = []
    for t in generated_text_samples:
        text = tokenizer.decode(t, skip_special_tokens=True)
        gen_text.append(text)

    return gen_text


In [5]:
# text generation example
generated_text_samples = generate_n_text_samples(base_model, base_tokenizer, example_text, device)

generated_text_samples



['for i in range(0, 10): numid$ strid$$numid fx else # un',
 'for i in range(0, 10): strid$ # we have a list of value return self _create',
 'for i in range(0, 10): numid$ dtype',
 'for i in range(0, 10):23456789abc',
 'for i in range(0, 10): numid$ return int num']

In [6]:
# the eos and bos tokens are defined
bos = '<|endoftext|>'
eos = '<|EOS|>'
pad = '<|pad|>'

special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'pad_token': pad}

# the new token is added to the tokenizer
num_added_toks = base_tokenizer.add_special_tokens(special_tokens_dict)

# the model config to which we add the special tokens
config = AutoConfig.from_pretrained(model_name_or_path, 
                                    bos_token_id=base_tokenizer.bos_token_id,
                                    eos_token_id=base_tokenizer.eos_token_id,
                                    pad_token_id=base_tokenizer.pad_token_id,
                                    output_hidden_states=False)

# the pre-trained model is loaded with the custom configuration
base_model = GPT2LMHeadModel.from_pretrained(model_name_or_path, config=config)

# Clear the pre-trained weights of the model for from scratch training
base_model.init_weights()

# the model embedding is resized
base_model.resize_token_embeddings(len(base_tokenizer))

base_model = base_model.to(device)

In [7]:
'''max_length = 100

filepath= 'articles1.csv'
df = pd.read_csv(filepath, encoding = 'utf-8', usecols=['title', 'publication'])\
                    .rename(columns={'title': 'text'})


pd.set_option("display.max_colwidth", None)
df.head(5)

def remove_publication_headline(headline, publication):
    # publication col doesn't match exactly with newspaper in title col
    if str(publication) in str(headline):
        headline = headline.split(' - ')[0]
    return headline

def process_headlines(df, text_colname):
  
    # Remove empty and null rows
    titulo_vacio = (df['text'].str.len() == 0) | df['text'].isna()
    df = df[~titulo_vacio]

    # Remove publication name from title
    df['text'] = df.apply(lambda row: remove_publication_headline(row['text'], row['publication']), axis = 1)

    # Remove headlines with less than 8 words
    titlos_len_ge8 = (df['text'].str.split().apply(lambda x: len(x)) >= 8)
    df = df[titlos_len_ge8]

    # Drop duplicates
    text_df = df.drop_duplicates(subset = [text_colname])\
                [[text_colname]]

    return text_df
    
df = process_headlines(df, 'text')
df'''

'max_length = 100\n\nfilepath= \'articles1.csv\'\ndf = pd.read_csv(filepath, encoding = \'utf-8\', usecols=[\'title\', \'publication\'])                    .rename(columns={\'title\': \'text\'})\n\n\npd.set_option("display.max_colwidth", None)\ndf.head(5)\n\ndef remove_publication_headline(headline, publication):\n    # publication col doesn\'t match exactly with newspaper in title col\n    if str(publication) in str(headline):\n        headline = headline.split(\' - \')[0]\n    return headline\n\ndef process_headlines(df, text_colname):\n  \n    # Remove empty and null rows\n    titulo_vacio = (df[\'text\'].str.len() == 0) | df[\'text\'].isna()\n    df = df[~titulo_vacio]\n\n    # Remove publication name from title\n    df[\'text\'] = df.apply(lambda row: remove_publication_headline(row[\'text\'], row[\'publication\']), axis = 1)\n\n    # Remove headlines with less than 8 words\n    titlos_len_ge8 = (df[\'text\'].str.split().apply(lambda x: len(x)) >= 8)\n    df = df[titlos_len_ge8]\n

In [7]:
max_length = 120

def dropEmpty(tokens0):
    tokens = []
    for i in range(0, len(tokens0)):
        temp = tokens0[i]
        if temp != []:
            tokens.append(temp)
    return tokens

with open("pretraining_corpus.csv", newline='', encoding='utf-8') as f:
        reader = csv.reader(x.replace('\0', '') for x in f)
        #reader = csv.reader(f)
        data = list(reader)
data = dropEmpty(data)
#limit = int(len(data)/2)
#data = data[0:limit]
# data = data[0:1000] # sub sample for checking

# Creation of the dataset's structure
text = []
for item in data:
    text.append(' '.join([str(token) for token in item[0:]]))

# Convert to pandas
df = pd.DataFrame({'text': text})
df

Unnamed: 0,text
0,strid$strid$strid$ # need to do something here...
1,has_f_libraries self return self distribution ...
2,show_compilers from distutils ccompiler import...
3,get_headers directory_list # get h files from ...
4,get_directories list_of_sources # get unique d...
...,...
768549,to_xml self strid$strid$strid$ s strid$ encryp...
768550,ault_kms_key_name s append strid$
768551,ault_kms_key_name self
768552,ault_kms_key_name


In [8]:
len(df.iloc[21,0])

57

In [9]:
df['text'] = bos + ' ' + df['text'] + ' ' + eos

df_train, df_val = train_test_split(df, train_size = 0.9, random_state = seed)
print(f'There are {len(df_train)} components for training and {len(df_val)} for validation')
df_train

There are 691698 components for training and 76856 for validation


Unnamed: 0,text
673677,<|endoftext|> aultstrid$numid$strid$values_see...
713366,<|endoftext|> aultstrid$datetime datetime nows...
315574,<|endoftext|> check_equal self obj kwargs a ha...
683547,<|endoftext|> aultstrid$falsestrid$is_superuse...
200974,<|endoftext|> execute_cli self kwargs strid$st...
...,...
194278,<|endoftext|> ault numid$ type strid$ full_pol...
192476,<|endoftext|> main argument_spec openstack_ful...
17730,<|endoftext|> test_hermeone self assert_equal ...
28030,<|endoftext|> testexpiration self self cache s...


In [10]:
# we load the datasets directly from a pandas df
train_dataset = Dataset.from_pandas(df_train[['text']])
val_dataset = Dataset.from_pandas(df_val[['text']])
train_dataset

Dataset({
    features: ['text', '__index_level_0__'],
    num_rows: 691698
})

In [11]:
train_dataset["text"]

['<|endoftext|> aultstrid$numid$strid$values_seenstrid$sentry db models fields bounded boundedpositiveintegerfieldstrid$ <|EOS|>',
 '<|endoftext|> aultstrid$datetime datetime nowstrid$environment_idstrid$sentry db models fields bounded boundedpositiveintegerfieldstrid$nullstrid$truestrid$idstrid$sentry db models fields bounded boundedbigautofieldstrid$primary_keystrid$truestrid$labelstrid$django db models fields charfieldstrid$max_lengthstrid$numid$strid$projectstrid$sentry db models fields foreignkey flexibleforeignkeystrid$tostrid$statusstrid$sentry db models fields bounded boundedpositiveintegerfieldstrid$ <|EOS|>',
 '<|endoftext|> check_equal self obj kwargs a hash_pandas_object obj kwargs b hash_pandas_object obj kwargs tm assert_series_equal a b kwargs pop strid$ none a hash_pandas_object obj kwargs b hash_pandas_object obj kwargs tm assert_series_equal a b <|EOS|>',
 '<|endoftext|> aultstrid$falsestrid$is_superuserstrid$django db models fields booleanfieldstrid$ <|EOS|>',
 '<|en

In [12]:
def tokenize_function(examples):
        return base_tokenizer(examples['text'], padding=True, max_length=max_length, truncation=True)

#base_tokenizer.padding_side = "left"
tokenized_train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=1,
    remove_columns=['text'],
)
tokenized_val_dataset = val_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=1,
    remove_columns=['text'],
)

# Example of the result of the tokenization process with padding
base_tokenizer.decode(tokenized_train_dataset['input_ids'][0])




  0%|          | 0/692 [00:00<?, ?ba/s]

  0%|          | 0/77 [00:00<?, ?ba/s]

'<|endoftext|> aultstrid$numid$strid$values_seenstrid$sentry db models fields bounded boundedpositiveintegerfieldstrid$ <|EOS|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|>'

In [13]:
model_logs = './model_logs_fromScratch'

BATCH_SIZE = 32 #16
EPOCHS = 5

training_args = TrainingArguments(
    output_dir=model_logs,          # output directory
    num_train_epochs=EPOCHS,              # total # of training epochs
    per_device_train_batch_size=BATCH_SIZE,  # batch size per device during training
    per_device_eval_batch_size=BATCH_SIZE,   # batch size for evaluation
    warmup_steps=200,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=model_logs,            # directory for storing logs
    prediction_loss_only=True,
    save_steps=5000 
)

data_collator = DataCollatorForLanguageModeling(
        tokenizer=base_tokenizer,
        mlm=False
    )

trainer = Trainer(
    model=base_model,                         # the instantiated  Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,         # training dataset
    eval_dataset=tokenized_val_dataset            # evaluation dataset
)


In [14]:
trainer.train()

trainer.save_model()
base_tokenizer.save_pretrained(model_logs)

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: __index_level_0__. If __index_level_0__ are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 691698
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 43232
  Number of trainable parameters = 124441344


Step,Training Loss
500,2.0414
1000,2.0853
1500,2.0541
2000,2.0417
2500,2.0258
3000,1.9949
3500,1.9565
4000,1.9537
4500,1.9314
5000,1.9343


Saving model checkpoint to ./model_logs_fromScratch_v02\checkpoint-5000
Configuration saved in ./model_logs_fromScratch_v02\checkpoint-5000\config.json
Configuration saved in ./model_logs_fromScratch_v02\checkpoint-5000\generation_config.json
Model weights saved in ./model_logs_fromScratch_v02\checkpoint-5000\pytorch_model.bin
Saving model checkpoint to ./model_logs_fromScratch_v02\checkpoint-10000
Configuration saved in ./model_logs_fromScratch_v02\checkpoint-10000\config.json
Configuration saved in ./model_logs_fromScratch_v02\checkpoint-10000\generation_config.json
Model weights saved in ./model_logs_fromScratch_v02\checkpoint-10000\pytorch_model.bin
Saving model checkpoint to ./model_logs_fromScratch_v02\checkpoint-15000
Configuration saved in ./model_logs_fromScratch_v02\checkpoint-15000\config.json
Configuration saved in ./model_logs_fromScratch_v02\checkpoint-15000\generation_config.json
Model weights saved in ./model_logs_fromScratch_v02\checkpoint-15000\pytorch_model.bin
Savin

KeyboardInterrupt: 

In [None]:
trainer.evaluate()

In [None]:
# trained model loading

'''pre_model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
pre_tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)

#device = "cuda:0"

input_text = pre_tokenizer.bos_token

source_code = generate_n_text_samples(pre_model, pre_tokenizer, 
                                    input_text, device, n_samples = 10)
for h in source_code:
    print(h)
    print()
'''

In [None]:
# trained model loading
model = GPT2LMHeadModel.from_pretrained(model_logs)
tokenizer = GPT2Tokenizer.from_pretrained(model_logs)

#device = "cuda:0"

input_text = tokenizer.bos_token

source_code = generate_n_text_samples(model, tokenizer, 
                                    input_text, device, n_samples = 10)
for h in source_code:
    print(h)
    print()


In [None]:
# model_name_or_path = 'microsoft/CodeGPT-small-py'
# base_tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path, do_lower_case = True)
# base_model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
# base_model = base_model.to(device)
# tokenizer = GPT2Tokenizer.from_pretrained(model_logs)

# #base_model.init_weights()

# input_text = base_tokenizer.bos_token

# source_code = generate_n_text_samples(base_model, base_tokenizer, 
#                                     input_text, device, n_samples = 10)
# for h in source_code:
#     print(h)
#     print()


In [None]:
# model_name_or_path = 'microsoft/CodeGPT-small-py' # 'model_logs_fromScratch' # './model_logs' # 'microsoft/CodeGPT-small-py' #'gpt2'
# base_tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path, do_lower_case = True)
# base_model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
# base_model = base_model.to(device)
# tokenizer = GPT2Tokenizer.from_pretrained(model_logs)

# base_model.init_weights()

# input_text = base_tokenizer.bos_token

# source_code = generate_n_text_samples(base_model, base_tokenizer, 
#                                     input_text, device, n_samples = 10)
# for h in source_code:
#     print(h)
#     print()


In [None]:
vocabulary = base_tokenizer.get_vocab()
print(vocabulary['for'])