In [1]:
from streaming import StreamingDataset
from streaming import StreamingDataLoader
from transformers import AutoTokenizer
import numpy as np
import torch

Inspired by [MosaicML's Fast Resumption Tutorial](https://docs.mosaicml.com/projects/streaming/en/latest/distributed_training/fast_resumption.html).

In [2]:
# --- Configuration ----------------------------------------------------------
data_local_path = "/fsx/yuli/cache/data/out_gojek"
data_subset = "replay-exp5-10eng-10code-80sealang"
batching_method = 'random'
batch_size = 512
tokenizer_path = "/fsx/yuli/cache/model/llama3_tokenizer"
global_seed = 17

# 1-based training step whose batch we want to inspect after resuming.
step_to_check = 102

# Number of steps before `step_to_check` at which the dataloader state is
# captured, so that resumption starts a few steps ahead of the inspected one.
# e.g. step_to_check = 61 and resumption_offset_steps = 2 -> the state is
# saved at step 61 - 2 = 59.
resumption_offset_steps = 2

# The offset must be >= 1 (the resumed loader has to yield the inspected batch
# itself) and < step_to_check (the snapshot step must exist).
if not 1 <= resumption_offset_steps < step_to_check:
    raise ValueError(
        "resumption_offset_steps must satisfy "
        f"1 <= resumption_offset_steps < step_to_check, got "
        f"resumption_offset_steps={resumption_offset_steps}, "
        f"step_to_check={step_to_check}"
    )

# --- Tokenizer and first (uninterrupted) loader -----------------------------
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

dataset = StreamingDataset(
    local=data_local_path,
    split=data_subset,
    batch_size=batch_size,
    shuffle=True,
    shuffle_seed=global_seed,
    batching_method=batching_method
)
dataloader = StreamingDataLoader(
    dataset,
    batch_size=batch_size,
)

# 0-based loop indices derived from the 1-based configuration above.
snapshot_step = step_to_check - 1 - resumption_offset_steps
inspect_step = step_to_check - 1

state_dict = None        # loader state captured at `snapshot_step`
state_dict_check = None  # loader state captured at `inspect_step`
decoded_sentences = []   # decoded text of every sample in the inspected batch
for step, batch in enumerate(dataloader):
    if step % 50 == 0:
        print(f"step completed {step}")
    if step == snapshot_step:
        state_dict = dataloader.state_dict()

    if step == inspect_step:
        # Each sample's tokens are read as a buffer of int64 token ids; the
        # copy gives the tokenizer an array that owns its data.
        decoded_sentences = [
            tokenizer.decode(np.frombuffer(input_seq, dtype=np.int64).copy())
            for input_seq in batch['tokens']
        ]
        # Captured once per batch (not once per sample).
        state_dict_check = dataloader.state_dict()
        break

# Fail loudly if the loader ran out of batches before reaching the target
# steps, instead of leaving `state_dict` as None for the resumption cell.
if state_dict is None or not decoded_sentences:
    raise RuntimeError(
        f"dataloader was exhausted before reaching step_to_check={step_to_check}"
    )

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Because `predownload` was not specified, it will default to 8*batch_size if batch_size is not None, otherwise 64. Prior to Streaming v0.7.0, `predownload` defaulted to max(batch_size, 256 * batch_size // num_canonical_nodes).
Because `num_canonical_nodes` was not specified, and `shuffle_algo` is py1e, it will default to be equal to physical nodes. Prior to Streaming v0.7.0, `num_canonical_nodes` defaulted to 64 * physical nodes.
Because `shuffle_block_size` was not specified, it will default to max(4_000_000 // num_canonical_nodes, 1 << 18) if num_canonical_nodes is not None, otherwise 262144. Prior to Streaming v0.7.0, `shuffle_block_size` defaulted to 262144.


step completed 0
step completed 50
step completed 100


In [3]:
# Resume from the saved state with a freshly constructed dataset/dataloader.
# The StreamingDataset must be re-created with the same parameters but bound
# to a different variable name; per the original author's note, the check is
# otherwise not deterministic.

# Guard against a missing snapshot or an offset that can never be reached,
# which would otherwise fail obscurely or iterate the whole dataset.
if state_dict is None:
    raise RuntimeError("state_dict is None: run the reference cell first")
if resumption_offset_steps < 1:
    raise ValueError(
        f"resumption_offset_steps must be >= 1, got {resumption_offset_steps}"
    )

dataset2 = StreamingDataset(
    local=data_local_path,
    split=data_subset,
    batch_size=batch_size,
    shuffle=True,
    shuffle_seed=global_seed,
    batching_method=batching_method
)

dataloader2 = StreamingDataLoader(
    dataset2,
    batch_size=batch_size,
)

dataloader2.load_state_dict(state_dict)

# Defined up front so later cells never hit a NameError.
decoded_sentences2 = []
for step, batch in enumerate(dataloader2):
    # The state was saved `resumption_offset_steps` steps before the inspected
    # step, so the inspected batch is the offset-th batch yielded here.
    if step == resumption_offset_steps - 1:
        decoded_sentences2 = [
            tokenizer.decode(np.frombuffer(input_seq, dtype=np.int64).copy())
            for input_seq in batch['tokens']
        ]
        break

if not decoded_sentences2:
    raise RuntimeError(
        "resumed dataloader was exhausted before yielding the batch to inspect"
    )

Because `predownload` was not specified, it will default to 8*batch_size if batch_size is not None, otherwise 64. Prior to Streaming v0.7.0, `predownload` defaulted to max(batch_size, 256 * batch_size // num_canonical_nodes).
Because `shuffle_block_size` was not specified, it will default to max(4_000_000 // num_canonical_nodes, 1 << 18) if num_canonical_nodes is not None, otherwise 262144. Prior to Streaming v0.7.0, `shuffle_block_size` defaulted to 262144.


In [4]:
# Determinism check: the batch decoded in the uninterrupted run must match the
# batch decoded after resuming. This cell should display True.

decoded_sentences == decoded_sentences2

True

In [5]:
# Determinism check: both loaders stopped on the same batch, so their state
# dicts must be identical. This cell should display True.

dataloader.state_dict() == dataloader2.state_dict()

True

In [8]:
# Display the resumed dataloader's current state dict for inspection.
dataloader2.state_dict()

{'epoch': 0,
 'sample_in_epoch': 52224,
 'num_canonical_nodes': 1,
 'shuffle_seed': 17,
 'initial_physical_nodes': 1}

In [6]:
# Print the first decoded sample of the batch obtained after resumption.
print(decoded_sentences2[0])

['multi_option_number'])?$data['multi_option_number']:""
  ,array(
     'class'=>"extra_small_input numeric_only"
   ));
  ?>
   </div>
   
   <?php $multi_data= isset($data['multi_id'])?json_decode($data['multi_id']):false;?>
   <?php $multi_opt=yii::app()->functions->getMultiOptionList();?>     
   <?php
   if (is_array($multi_opt) && count($multi_opt)>=1){
   	  foreach ($multi_opt as $multi_id=>$val_multi) { 
   	  	$chk=false;   	  	
   	  	if (in_array($multi_id,(array)$multi_data)){   	  		
   	  		$chk=true;
   	  	}
   	  	echo "<li>";
   	  	echo CHtml::checkBox('multi_id[]',$chk,array('value'=>$multi_id))."<span>$val_multi</span>";
   	  	echo "</li>";
   	  }
   }
  ?>
   </div> <!--MULTI OPTIONS-->

   
</div><!-- END RIGHT-->
<div class="clear"></div>


<?php if (isset($_GET['id'])):?>
<input type="submit" value="<?php echo Yii::t("default","Update")?>" class="uk-button uk-button-success" >
<?php else:?>
<input type="submit" value="<?php echo Yii::t("default","Submit")?>"

In [7]:
# Print the first decoded sample of the batch from the uninterrupted run.
print(decoded_sentences[0])

['multi_option_number'])?$data['multi_option_number']:""
  ,array(
     'class'=>"extra_small_input numeric_only"
   ));
  ?>
   </div>
   
   <?php $multi_data= isset($data['multi_id'])?json_decode($data['multi_id']):false;?>
   <?php $multi_opt=yii::app()->functions->getMultiOptionList();?>     
   <?php
   if (is_array($multi_opt) && count($multi_opt)>=1){
   	  foreach ($multi_opt as $multi_id=>$val_multi) { 
   	  	$chk=false;   	  	
   	  	if (in_array($multi_id,(array)$multi_data)){   	  		
   	  		$chk=true;
   	  	}
   	  	echo "<li>";
   	  	echo CHtml::checkBox('multi_id[]',$chk,array('value'=>$multi_id))."<span>$val_multi</span>";
   	  	echo "</li>";
   	  }
   }
  ?>
   </div> <!--MULTI OPTIONS-->

   
</div><!-- END RIGHT-->
<div class="clear"></div>


<?php if (isset($_GET['id'])):?>
<input type="submit" value="<?php echo Yii::t("default","Update")?>" class="uk-button uk-button-success" >
<?php else:?>
<input type="submit" value="<?php echo Yii::t("default","Submit")?>"