In [121]:
import os
import time
import datetime
import math
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from collections import defaultdict
import argparse
import logging

from sklearn.metrics import roc_auc_score, f1_score,average_precision_score
from sklearn.metrics import precision_recall_fscore_support 
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc as auc_score

import torch
print("torch version is {}".format(torch.__version__))
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler
from torch.utils.data.sampler import SubsetRandomSampler

from datasets import load_dataset, load_metric, concatenate_datasets,DatasetDict,Dataset
from datasets import load_from_disk

import transformers
print("Transformers version is {}".format(transformers.__version__))

from transformers import (
    AdamW,
    AutoConfig,
    AutoModel,
    AutoModelWithLMHead,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    default_data_collator,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    get_linear_schedule_with_warmup,
    get_scheduler
)

from accelerate import Accelerator

import utils

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

def seed_everything(seed):
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    
os.environ['CUDA_VISIBLE_DEVICES'] ="3"

torch version is 1.9.1+cu111
Transformers version is 4.6.1


In [128]:
parser = argparse.ArgumentParser(description='BERT Model')
parser.add_argument('--gpus', type=int, default=[0,1], nargs='+', help='used gpu')
parser.add_argument("--shuffle_train",  type=bool,default=True,help="shuffle data or not")
parser.add_argument("--validation_split",  type=float,default=0.2,help="The split ratio for validation dataset")
parser.add_argument("--loss_weight",  type=bool,default=False,help="weight for unbalance data")
parser.add_argument("--train_negative_positive_ratio",  type=int,default=4,help="Undersampling negative vs position ratio in training")
parser.add_argument("--test_negative_positive_ratio",  type=int,default=10,help="Undersampling negative vs position ratio in test set")
parser.add_argument("--seed",  type=int,default=101,
        help="random seed for np.random.seed, torch.manual_seed and torch.cuda.manual_seed.")

parser.add_argument("--trucation_strategy", type=str, default="tail",help="how to truncate the long length email")
parser.add_argument("--batch_size", type=int, default=2)
parser.add_argument('--num_epochs', type=int, default=10)
parser.add_argument("--gradient_accumulation_steps",type=int,default=8,
                           help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument('--lr', type=float, default=2e-5, help="learning rate")
parser.add_argument('--lr_scheduler_type', type=str, default="linear")
#     parser.add_argument('--lr_scheduler_type', type=str, default="cosine")
parser.add_argument("--fp16", action="store_true", help="If passed, will use FP16 training.")
parser.add_argument('--use_schedule',  type=bool,default=True)
parser.add_argument("--weight_decay", default=1e-4, type=float, help="Weight decay if we apply some.")
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
parser.add_argument("--warmup_ratio", default=0.4, type=float, help="Linear warmup over warmup_steps.")
parser.add_argument('--model_checkpoint', type=str, default="bert-base-uncased")
parser.add_argument("--output_dir", default=os.path.join(os.getcwd(),"bert_repo"), type=str, help="output folder name")
parser.add_argument("--model_output_name", default="bert", type=str)
parser.add_argument("--feature_name", default="Full_TextBody", type=str)

args,_ = parser.parse_known_args()

args.model_output_name=f'{args.model_output_name}_{args.feature_name}_output'
args.output_dir=f'{args.output_dir}_{args.feature_name}'

seed_everything(args.seed)

print()
print(args)
print()

data_dir=os.path.join(os.getcwd(),"dataset","email_all")
email_all=load_from_disk(data_dir)
email_all=email_all.filter(lambda x: x[args.feature_name]!=None)

email_all=email_all.map(lambda x: tokenizer(x[args.feature_name]),batched=True)

max_seq_length=tokenizer.model_max_length
def truncation_text(example):
    truncated_input_ids=tokenizer(example[args.feature_name],truncation=True,padding=False,return_tensors="pt",add_special_tokens=False)['input_ids']
    
    if args.trucation_strategy=="tail":
        truncated_input_ids=truncated_input_ids[:,-(max_seq_length - 2):].squeeze()
    elif args.trucation_strategy=="head":
        truncated_input_ids=truncated_input_ids[:,0:(max_seq_length - 2)].squeeze()
    elif args.trucation_strategy=="mixed":
        truncated_input_ids=truncated_input_ids[:(max_seq_length - 2) // 2] + truncated_input_ids[-((max_seq_length - 2) // 2):]
        truncated_input_ids=truncated_input_ids.squeeze()
    else:
        raise NotImplemented("Unknown truncation. Supported truncation: tail, head, mixed truncation")
        
    return {"truncated_text":tokenizer.decode(truncated_input_ids)}

email_all=email_all.map(truncation_text)
columns=email_all['train'].column_names
columns_to_keep=['truncated_text','churn']
columns_to_remove=set(columns)-set(columns_to_keep)
email_all=email_all.remove_columns(columns_to_remove)
email_all.rename_column("truncated_text", args.feature_name)

# train_data=email_all['train']
# test_data=email_all['test']
train_data=email_all['train'].shuffle(seed=101).select(range(1200))
test_data=email_all['test'].shuffle(seed=101).select(range(500))

# os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(str(x) for x in args.gpus)
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

# print(f"The number of GPUs is {torch.cuda.device_count()}")
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print()
    print('{:<30}{:<10}'.format("The # of availabe GPU(s): ",torch.cuda.device_count()))

    for i in range(torch.cuda.device_count()):
        print('{:<30}{:<10}'.format("GPU Name: ",torch.cuda.get_device_name(i)))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")


# main(args,train_data, test_data)

tokenizer=AutoTokenizer.from_pretrained(args.model_checkpoint)
model=AutoModelForSequenceClassification.from_pretrained(args.model_checkpoint)

print()
print(f"The maximal # input tokens : {tokenizer.model_max_length:,}")
print(f"Vocabulary size : {tokenizer.vocab_size:,}")
print(f"The # of parameters : {sum([p.nelement() for p in model.parameters()]):,}")
print()


Namespace(adam_epsilon=1e-08, batch_size=2, feature_name='Full_TextBody', fp16=False, gpus=[0, 1], gradient_accumulation_steps=8, loss_weight=False, lr=2e-05, lr_scheduler_type='linear', model_checkpoint='bert-base-uncased', model_output_name='bert_Full_TextBody_output', num_epochs=10, output_dir='/home/ec2-user/SageMaker/trident/src/bert_repo_Full_TextBody', seed=101, shuffle_train=True, test_negative_positive_ratio=10, train_negative_positive_ratio=4, trucation_strategy='tail truncation', use_schedule=True, validation_split=0.2, warmup_ratio=0.4, weight_decay=0.0001)



  0%|          | 0/100 [00:00<?, ?ba/s]

  0%|          | 0/28 [00:00<?, ?ba/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

  0%|          | 0/28 [00:00<?, ?ba/s]

  0%|          | 0/99270 [00:00<?, ?ex/s]

  0%|          | 0/27300 [00:00<?, ?ex/s]


The # of availabe GPU(s):     2         
GPU Name:                     NVIDIA A10G
GPU Name:                     NVIDIA A10G


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


The maximal # input tokens : 512
Vocabulary size : 30,522
The # of parameters : 109,483,778



In [133]:
email_all

DatasetDict({
    train: Dataset({
        features: ['churn', 'Full_TextBody'],
        num_rows: 99270
    })
    test: Dataset({
        features: ['churn', 'Full_TextBody'],
        num_rows: 27300
    })
})

In [138]:
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

# print(f"The number of GPUs is {torch.cuda.device_count()}")
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print()
    print('{:<30}{:<10}'.format("The # of availabe GPU(s): ",torch.cuda.device_count()))

    for i in range(torch.cuda.device_count()):
        print('{:<30}{:<10}'.format("GPU Name: ",torch.cuda.get_device_name(i)))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")


# main(args,train_data, test_data)

tokenizer=AutoTokenizer.from_pretrained(args.model_checkpoint)
model=AutoModelForSequenceClassification.from_pretrained(args.model_checkpoint)

print()
print(f"The maximal # input tokens : {tokenizer.model_max_length:,}")
print(f"Vocabulary size : {tokenizer.vocab_size:,}")
print(f"The # of parameters : {sum([p.nelement() for p in model.parameters()]):,}")
print()


The # of availabe GPU(s):     2         
GPU Name:                     NVIDIA A10G
GPU Name:                     NVIDIA A10G


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


The maximal # input tokens : 512
Vocabulary size : 30,522
The # of parameters : 109,483,778



In [136]:
tokenizer=AutoTokenizer.from_pretrained(args.model_checkpoint)
model=AutoModelForSequenceClassification.from_pretrained(args.model_checkpoint)

print()
print(f"The maximal # input tokens : {tokenizer.model_max_length:,}")
print(f"Vocabulary size : {tokenizer.vocab_size:,}")
print(f"The # of parameters : {sum([p.nelement() for p in model.parameters()]):,}")
print()

train_module=utils.Loader_Creation(train_data, tokenizer,args.feature_name)


test_module=utils.Loader_Creation(test_data, tokenizer,args.feature_name)

train_data.set_format(type="pandas")
df_train=train_data[:]
train_data.reset_format()


train_dataloader=DataLoader(train_module,
                            shuffle=True,
                            batch_size=args.batch_size,
                            collate_fn=train_module.collate_fn,
                            drop_last=True   # longformer model bug
                           )


test_dataloader=DataLoader(test_module,
                            shuffle=False,
                            batch_size=args.batch_size,
                            collate_fn=test_module.collate_fn
                           )

print()
print('{:<30}{:<10,} '.format("training mini-batch",len(train_dataloader)))
#     print('{:<30}{:<10,} '.format("validation mini-batch",len(valid_dataloader)))
print('{:<30}{:<10,} '.format("test mini-batch",len(test_dataloader)))

train_label=df_train['churn'].values.squeeze()
num_classes=np.unique(train_label).shape[0]
if args.loss_weight:
    train_classes_num, train_classes_weight = utils.get_class_count_and_weight(train_label,num_classes)
    loss_weight=torch.tensor(train_classes_weight).to(device)
else:
    loss_weight=None


t_total = int((len(train_dataloader) // args.batch_size)//args.gradient_accumulation_steps*float(args.num_epochs))

warmup_steps=int((len(train_dataloader) // args.batch_size)//args.gradient_accumulation_steps*args.warmup_ratio)

no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": args.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]

optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr, eps=args.adam_epsilon)
# optimizer=AdamW(model.parameters(),lr=args.lr)
#     lr_scheduler =get_linear_schedule_with_warmup(optimizer, 
#                                                   num_warmup_steps=warmup_steps, 
#                                                   num_training_steps=t_total
#                                                  )

lr_scheduler = get_scheduler(name=args.lr_scheduler_type, 
                             optimizer=optimizer,
                             num_warmup_steps=warmup_steps,
                             num_training_steps=t_total)

accelerator = Accelerator(fp16=args.fp16)
acc_state = {str(k): str(v) for k, v in accelerator.state.__dict__.items()}
if accelerator.is_main_process:
    accelerator.print("")
    logger.info(f'Accelerator Config: {acc_state}')
    accelerator.print("")

#     model, optimizer, train_dataloader, valid_dataloader, test_dataloader = accelerator.prepare(
#         model, optimizer, train_dataloader, valid_dataloader, test_dataloader
#     )

model, optimizer, train_dataloader, test_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, test_dataloader
)

best_metric = float('inf')
# best_metric = 0

iter_tput = []

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


The maximal # input tokens : 512
Vocabulary size : 30,522
The # of parameters : 109,483,778



  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

INFO:__main__:Accelerator Config: {'fork_launched': 'False', 'backend': 'None', 'deepspeed_plugin': 'None', 'distributed_type': 'DistributedType.NO', 'num_processes': '1', 'process_index': '0', 'local_process_index': '0', 'device': 'cuda', 'mixed_precision': 'no', 'initialized': 'True'}



training mini-batch           600        
test mini-batch               250        




RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [16]:
s=["Hello world, this is chuanliang working in Mobi, I joined mobi on july 5th.  So far so go df aed", "today is a nice day, I like a dog,sr"]
tokenizer(s,truncation=True,padding="max_length",return_tensors="pt")['input_ids'].shape

torch.Size([2, 512])

In [98]:
max_seq_length=6
tokenizer.decode(tokenizer(s[1],truncation=True,padding=False,return_tensors="pt",add_special_tokens=False)['input_ids'][:,-(max_seq_length - 2):].squeeze())

'a dog, sr'

In [95]:
tokenizer(s[0],truncation=True,padding=False,return_tensors="pt",add_special_tokens=False)['input_ids'][:,-(max_seq_length - 2):].squeeze()

tensor([ 1040,  2546, 29347,  2094])

In [21]:
email_all

DatasetDict({
    train: Dataset({
        features: ['Full_TextBody', 'Client_TextBody', 'Latest_TextBody', 'year', 'churn'],
        num_rows: 91550
    })
    test: Dataset({
        features: ['Full_TextBody', 'Client_TextBody', 'Latest_TextBody', 'year', 'churn'],
        num_rows: 25450
    })
})

In [104]:
train_data=email_all['train'].shuffle(seed=101).select(range(1200))
test_data=email_all['test'].shuffle(seed=101).select(range(500))
tempt=DatasetDict({"train":train_data, "test":test_data})
tempt



DatasetDict({
    train: Dataset({
        features: ['Full_TextBody', 'Client_TextBody', 'Latest_TextBody', 'year', 'churn'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['Full_TextBody', 'Client_TextBody', 'Latest_TextBody', 'year', 'churn'],
        num_rows: 500
    })
})

In [105]:
tempt=tempt.map(lambda x: tokenizer(x['Full_TextBody']),batched=True)
def compute_lenth(example):
    return {"text_length":len(example["input_ids"])}
tempt=tempt.map(compute_lenth)
np.max(tempt["train"]['text_length']), np.max(tempt["test"]['text_length'])




  0%|          | 0/1 [00:00<?, ?ba/s]



  0%|          | 0/500 [00:00<?, ?ex/s]

(29449, 10982)

In [107]:
max_seq_length=tokenizer.model_max_length
def truncation_lenth(example):
    return {"truncated_text":tokenizer.decode(tokenizer(example['Full_TextBody'],truncation=True,padding=False,return_tensors="pt",add_special_tokens=False)['input_ids'][:,-(max_seq_length - 2):].squeeze())}
tempt2=tempt.map(truncation_lenth)

  0%|          | 0/1200 [00:00<?, ?ex/s]

  0%|          | 0/500 [00:00<?, ?ex/s]

In [110]:
tempt2

DatasetDict({
    train: Dataset({
        features: ['Full_TextBody', 'Client_TextBody', 'Latest_TextBody', 'year', 'churn', 'input_ids', 'token_type_ids', 'attention_mask', 'text_length', 'truncated_text'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['Full_TextBody', 'Client_TextBody', 'Latest_TextBody', 'year', 'churn', 'input_ids', 'token_type_ids', 'attention_mask', 'text_length', 'truncated_text'],
        num_rows: 500
    })
})

In [114]:
columns=tempt2['train'].column_names
columns_to_keep=['truncated_text','churn']
columns_to_remove=set(columns)-set(columns_to_keep)
tempt2=tempt2.remove_columns(columns_to_remove)
tempt2

DatasetDict({
    train: Dataset({
        features: ['churn', 'truncated_text'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['churn', 'truncated_text'],
        num_rows: 500
    })
})

In [None]:
tempt2

In [117]:
tempt2=tempt2.map(lambda x: tokenizer(x['truncated_text']),batched=True)
def compute_lenth(example):
    return {"text_length":len(example["input_ids"])}
tempt2=tempt2.map(compute_lenth)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1200 [00:00<?, ?ex/s]

  0%|          | 0/500 [00:00<?, ?ex/s]

In [65]:
max_seq_length=tokenizer.model_max_length

tempt.set_format("pandas")
df_train=tempt["train"][:]
df_test=tempt["test"][:]

use_cols=["truncated_text","churn"]
df_train["truncated_text"]=df_train['Full_TextBody'].apply(lambda x: " ".join(tokenizer.tokenize(x)[-(max_seq_length - 2):]))
df_train=df_train.loc[:,use_cols]
df_test["truncated_text"]=df_test['Full_TextBody'].apply(lambda x: " ".join(tokenizer.tokenize(x)[-(max_seq_length - 2):]))
df_test=df_test.loc[:,use_cols]

df_train=Dataset.from_pandas(df_train)
df_test=Dataset.from_pandas(df_test)
tempt2=DatasetDict({"train":df_train, "test":df_test})
tempt2

DatasetDict({
    train: Dataset({
        features: ['truncated_text', 'churn'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['truncated_text', 'churn'],
        num_rows: 500
    })
})

In [66]:
tempt2=tempt2.map(lambda x: tokenizer(x['truncated_text']),batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [67]:
def compute_lenth(example):
    return {"text_length":len(example["input_ids"])}
tempt2=tempt2.map(compute_lenth)

  0%|          | 0/1200 [00:00<?, ?ex/s]

  0%|          | 0/500 [00:00<?, ?ex/s]

In [68]:
tempt2

DatasetDict({
    train: Dataset({
        features: ['truncated_text', 'churn', 'input_ids', 'token_type_ids', 'attention_mask', 'text_length'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['truncated_text', 'churn', 'input_ids', 'token_type_ids', 'attention_mask', 'text_length'],
        num_rows: 500
    })
})

In [118]:
np.max(tempt2["train"]['text_length']), np.max(tempt2["test"]['text_length'])

(516, 517)

In [30]:
# max_seq_length=tokenizer.model_max_length
# def truncation_lenth(example):
#     return {"truncated_text":" ".join(tokenizer.tokenize(x['Full_TextBody'])[-(max_seq_length - 2):])}
# tempt2=tempt.map(truncation_lenth)

In [31]:
# max_seq_length=tokenizer.model_max_length
# tempt2=tempt.map(lambda x: " ".join(tokenizer.tokenize(x['Full_TextBody'])[-(max_seq_length - 2):]),batched=True)

In [69]:
np.max(tempt2["train"]['text_length']), np.max(tempt2["test"]['text_length'])

(802, 789)

In [None]:
class Loader_Creation(Dataset):
    def __init__(self,
                 dataset,
                 tokenizer,
                 feature_name
                ):
        super().__init__()
        self.dataset=dataset
        self.tokenizer=tokenizer
        
        self.dataset=self.dataset.map(lambda x:tokenizer(x[feature_name],truncation=True,padding="max_length"), 
                                      batched=True)
        self.dataset.set_format(type="pandas")
        self.dataset=self.dataset[:]
    
    def __len__(self):
        return self.dataset.shape[0]
    
    def __getitem__(self,index):
        _ids = self.dataset.loc[index]["input_ids"].squeeze()
        _mask = self.dataset.loc[index]["attention_mask"].squeeze()
        _target = self.dataset.loc[index]["churn"].squeeze()
        
        return dict(
            input_ids=_ids,
            attention_mask=_mask,
            labels=_target
        )
    
    def collate_fn(self,batch):
        input_ids=torch.stack([torch.tensor(x["input_ids"]) for x in batch])
        attention_mask=torch.stack([torch.tensor(x["attention_mask"]) for x in batch])
        labels=torch.stack([torch.tensor(x["labels"]) for x in batch])
        
        pad_token_id=self.tokenizer.pad_token_id
        keep_mask = input_ids.ne(pad_token_id).any(dim=0)
        
        input_ids=input_ids[:, keep_mask]
        attention_mask=attention_mask[:, keep_mask]
        
        return dict(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )

In [None]:


train_module=utils.Loader_Creation(train_data, tokenizer,args.feature_name)


test_module=utils.Loader_Creation(test_data, tokenizer,args.feature_name)

train_data.set_format(type="pandas")
df_train=train_data[:]
train_data.reset_format()

#     train_indices, val_indices=utils.mask_creation(df_train, 'churn', args.seed, args.validation_split)



#     train_sampler = SubsetRandomSampler(train_indices)
#     valid_sampler = SubsetRandomSampler(val_indices)

train_dataloader=DataLoader(train_module,
                            shuffle=True,
                            batch_size=args.batch_size,
                            collate_fn=train_module.collate_fn,
                            drop_last=True   # longformer model bug
                           )

#     train_dataloader=DataLoader(train_module,
#                                 sampler=train_sampler,
#                                 batch_size=args.batch_size,
#                                 collate_fn=train_module.collate_fn,
#                                 drop_last=True   # longformer model bug
#                                )

#     valid_dataloader=DataLoader(train_module,
#                                 sampler=valid_sampler,
#                                 batch_size=args.batch_size,
#                                 collate_fn=train_module.collate_fn
#                                )

test_dataloader=DataLoader(test_module,
                            shuffle=False,
                            batch_size=args.batch_size,
                            collate_fn=test_module.collate_fn
                           )

# %pdb
# next(iter(train_dataloader))

print()
print('{:<30}{:<10,} '.format("training mini-batch",len(train_dataloader)))
#     print('{:<30}{:<10,} '.format("validation mini-batch",len(valid_dataloader)))
print('{:<30}{:<10,} '.format("test mini-batch",len(test_dataloader)))

train_label=df_train['churn'].values.squeeze()
num_classes=np.unique(train_label).shape[0]
if args.loss_weight:
    train_classes_num, train_classes_weight = utils.get_class_count_and_weight(train_label,num_classes)
    loss_weight=torch.tensor(train_classes_weight).to(device)
else:
    loss_weight=None


t_total = int((len(train_dataloader) // args.batch_size)//args.gradient_accumulation_steps*float(args.num_epochs))

warmup_steps=int((len(train_dataloader) // args.batch_size)//args.gradient_accumulation_steps*args.warmup_ratio)

no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": args.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]

optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr, eps=args.adam_epsilon)
# optimizer=AdamW(model.parameters(),lr=args.lr)
#     lr_scheduler =get_linear_schedule_with_warmup(optimizer, 
#                                                   num_warmup_steps=warmup_steps, 
#                                                   num_training_steps=t_total
#                                                  )

lr_scheduler = get_scheduler(name=args.lr_scheduler_type, 
                             optimizer=optimizer,
                             num_warmup_steps=warmup_steps,
                             num_training_steps=t_total)

accelerator = Accelerator(fp16=args.fp16)
acc_state = {str(k): str(v) for k, v in accelerator.state.__dict__.items()}
if accelerator.is_main_process:
    accelerator.print("")
    logger.info(f'Accelerator Config: {acc_state}')
    accelerator.print("")

#     model, optimizer, train_dataloader, valid_dataloader, test_dataloader = accelerator.prepare(
#         model, optimizer, train_dataloader, valid_dataloader, test_dataloader
#     )

model, optimizer, train_dataloader, test_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, test_dataloader
)

best_metric = float('inf')
# best_metric = 0

iter_tput = []

In [None]:
model.train()

losses=[]
for step,batch in enumerate(train_dataloader):
    t0=time.time()
    batch={k:v.to(accelerator.device) for k,v in batch.items()}
    outputs=model(**batch)
#             loss=outputs.loss
    logits=outputs.logits

    if loss_weight is None:
        loss = F.cross_entropy(logits.view(-1, num_classes).to(accelerator.device), 
                               batch["labels"])
    else:
        loss = F.cross_entropy(logits.view(-1, num_classes).to(accelerator.device), 
                               batch["labels"], weight=loss_weight.float().to(accelerator.device)) 
        
    if step==2:
        break

In [None]:
step

In [None]:
outputs

In [None]:
logits=outputs.logits
logits

In [None]:
outputs.view(-1, num_classes)

In [None]:
from accelerate.utils.dataclasses import SageMakerDistributedType

In [None]:
from accelerate import Accelerator

accelerator = Accelerator()

In [None]:
!pip install "accelerate[sagemaker]" --upgrade

In [None]:
# !accelerate config 

In [None]:
import sagemaker
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

In [None]:
role