Merge pull request #9 from iamgroot42/michael/default_load_hf
Add OLMo Support
iamgroot42 committed Feb 20, 2024
2 parents c413117 + 94aaa97 commit 3bc8ff9
Showing 7 changed files with 74 additions and 15 deletions.
9 changes: 4 additions & 5 deletions configs/cache_data.json
@@ -1,17 +1,15 @@
{
"experiment_name": "cache_data",
"base_model": "EleutherAI/pythia-160m",
"dataset_member": "the_pile",
"presampled_dataset_member": "/gscratch/h2lab/micdun/mimir/data/pile_subsets/pile_cc/train.jsonl",
"dataset_nonmember": "the_pile",
"presampled_dataset_nonmember": "/gscratch/h2lab/micdun/mimir/data/pile_subsets/pile_cc/test.jsonl",
"dataset_key": "text",
"min_words": 100,
"max_words": 200,
"max_tokens": 512,
"output_name": "cache_data",
"specific_source": "pile_cc",
"n_samples": 1000,
"baselines_only": true,
"blackbox_attacks": [],
"neighborhood_config": {
"model": "bert",
"n_perturbation_list": [
@@ -25,5 +23,6 @@
"cache_dir": "cache_dir"
},
"dump_cache": true,
"load_from_cache": false
"load_from_cache": false,
"load_from_hf": false
}
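Note on the new "load_from_hf" flag introduced by this PR's michael/default_load_hf branch: when true, member/nonmember data is pulled from the Hugging Face Hub instead of the local cache_dir. A minimal sketch of that Hub-side load; the dataset repo id, configuration name, and split names below are illustrative assumptions, not values taken from this diff:

    from datasets import load_dataset

    # Assumed identifiers -- the real ones are defined in the mimir codebase, not in this diff.
    member = load_dataset("iamgroot42/mimir", "pile_cc", split="member")
    nonmember = load_dataset("iamgroot42/mimir", "pile_cc", split="nonmember")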
38 changes: 38 additions & 0 deletions configs/olmo.json
@@ -0,0 +1,38 @@
{
"experiment_name": "neo125_github_experiment",
"base_model": "EleutherAI/gpt-neo-125m",
"dataset_member": "the_pile",
"dataset_nonmember": "the_pile",
"min_words": 100,
"max_words": 200,
"max_tokens": 512,
"max_data": 100000,
"output_name": "unified_mia",
"specific_source": "Github",
"n_samples": 1000,
"blackbox_attacks": ["loss", "ref", "min_k", "zlib"],
"ref_config": {
"models": [
"stabilityai/stablelm-base-alpha-3b-v2"
]
},
"neighborhood_config": {
"model": "bert",
"n_perturbation_list": [
25
],
"pct_words_masked": 0.3,
"span_length": 2,
"dump_cache": false,
"load_from_cache": false,
"neighbor_strategy": "random"
},
"env_config": {
"results": "results_new",
"device": "cuda:0",
"device_aux": "cuda:0"
},
"dump_cache": false,
"load_from_cache": false,
"load_from_hf": true
}
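For reference, the scores behind the attacks listed in "blackbox_attacks" follow the standard membership-inference formulations; the sketch below is illustrative only and is not code from this repository (the function names and signatures are made up):

    import zlib

    def ref_score(target_nll: float, ref_nll: float) -> float:
        # "ref" attack: target-model loss minus the loss of the reference model
        # named in ref_config (here stabilityai/stablelm-base-alpha-3b-v2).
        return target_nll - ref_nll

    def zlib_score(target_nll: float, text: str) -> float:
        # "zlib" attack: target loss normalised by the zlib-compressed length of
        # the text; lower scores suggest membership, as with plain loss.
        return target_nll / len(zlib.compress(text.encode("utf-8")))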
11 changes: 5 additions & 6 deletions data/cache_data.sh
@@ -1,8 +1,8 @@
#!/bin/bash
ngram=7
for date in "arxiv_2020-08" #"arxiv_2021-01" "arxiv_2021-06" "arxiv_2022-06" "arxiv_2023-06" #"full_pile"
for subset in "wikipedia" "s2" #date in "arxiv_2020-08"
do
# echo caching data for $subset
echo caching data for $subset
# python run.py \
# --config configs/cache_data.json \
# --presampled_dataset_member "/gscratch/h2lab/micdun/mimir/data/full_pile/full_pile_10000/train_raw.jsonl" \
@@ -12,10 +12,9 @@ do
# --n_samples 10000
python run.py \
--config configs/cache_data.json \
--presampled_dataset_member "/gscratch/h2lab/micdun/mimir/data/pile_subsets/arxiv/train_raw.jsonl" \
--presampled_dataset_nonmember "/mmfs1/gscratch/h2lab/micdun/mimir/data/temporal_arxiv/${date}/${date}/test_raw.jsonl" \
--specific_source $date \
--n_samples 1000
--presampled_dataset_member "/mmfs1/gscratch/h2lab/micdun/mimir/data/dolma/member/$subset/train_raw.jsonl" \
--presampled_dataset_nonmember "/mmfs1/gscratch/h2lab/micdun/mimir/data/dolma/nonmember/$subset/test_raw.jsonl" \
--specific_source dolma_$subset
done
#"/gscratch/h2lab/micdun/mimir/data/ngram_overlap_thresholded_pile_subsets/truncated+ngram_$ngram/0.0-0.2/$subset/test_raw.jsonl"

2 changes: 1 addition & 1 deletion mimir/data_utils.py
@@ -231,7 +231,7 @@ def load(self, train: bool, mask_tokenizer=None, specific_source: str = None):
new_len = len(mask_tokenizer(x)["input_ids"])
assert new_len <= self.config.max_tokens
new_data.append(x)
data = new_data
data = new_data

# print stats about remainining data
print(f"Total number of samples: {len(data)}")
7 changes: 5 additions & 2 deletions mimir/models.py
Expand Up @@ -14,6 +14,7 @@
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import zlib
from hf_olmo import *

from mimir.config import ExperimentConfig
from mimir.custom_datasets import SEPARATOR
@@ -142,10 +143,12 @@ def load_base_model_and_tokenizer(self, model_kwargs):
# llama is too big, gotta use device map
model = transformers.AutoModelForCausalLM.from_pretrained(self.name, **model_kwargs, device_map="balanced_low_0", cache_dir=self.cache_dir)
self.device = 'cuda:1'
elif "stablelm" in self.name:
# stablelm requires confirmation of running custom code
elif "stablelm" in self.name.lower(): # models requiring custom code
model = transformers.AutoModelForCausalLM.from_pretrained(
self.name, **model_kwargs, trust_remote_code=True, device_map=device_map, cache_dir=self.cache_dir)
elif "olmo" in self.name.lower():
model = transformers.AutoModelForCausalLM.from_pretrained(
self.name, **model_kwargs, trust_remote_code=True, cache_dir=self.cache_dir)
else:
model = transformers.AutoModelForCausalLM.from_pretrained(
self.name, **model_kwargs, device_map=device_map, cache_dir=self.cache_dir)
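The new elif branch above is what enables OLMo: hf_olmo appears to be imported for its side effect of registering the OLMo architecture with transformers' Auto classes, and models whose name contains "olmo" are loaded with trust_remote_code=True and no device_map. A standalone sketch of the same loading path, using the "allenai/OLMo-1B" checkpoint referenced in scripts/run_mia_suite_olmo.sh later in this commit:

    from hf_olmo import *  # side-effect import: registers OLMo with transformers (requires ai2-olmo)
    import transformers

    name = "allenai/OLMo-1B"
    tokenizer = transformers.AutoTokenizer.from_pretrained(name, trust_remote_code=True)
    model = transformers.AutoModelForCausalLM.from_pretrained(name, trust_remote_code=True)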
4 changes: 3 additions & 1 deletion requirements.txt
@@ -14,4 +14,6 @@ simple-parsing
nltk
scipy>=1.11.2
pytest
huggingface_hub
huggingface_hub
ai2-olmo
accelerate
18 changes: 18 additions & 0 deletions scripts/run_mia_suite_olmo.sh
@@ -0,0 +1,18 @@
#!/bin/bash
version=unified_mia_v5_olmo_test_only

for model in "OLMo-1B"
do

for subset in "dolma_wikipedia" "dolma_s2" # "c4"
do
python3.9 run.py \
--experiment_name $version \
--config configs/olmo.json \
--base_model "allenai/$model" \
--specific_source ${subset} \
--load_from_cache true \
--load_from_hf false \
--n_samples 1000
done
done
