Merge pull request #9 from iamgroot42/michael/default_load_hf
Add OLMo Support
iamgroot42 committed Feb 20, 2024
2 parents c413117 + 94aaa97 commit 3bc8ff9
Showing 7 changed files with 74 additions and 15 deletions.
9 changes: 4 additions & 5 deletions configs/cache_data.json
@@ -1,17 +1,15 @@
{
"experiment_name": "cache_data",
"base_model": "EleutherAI/pythia-160m",
"dataset_member": "the_pile",
"presampled_dataset_member": "/gscratch/h2lab/micdun/mimir/data/pile_subsets/pile_cc/train.jsonl",
"dataset_nonmember": "the_pile",
"presampled_dataset_nonmember": "/gscratch/h2lab/micdun/mimir/data/pile_subsets/pile_cc/test.jsonl",
"dataset_key": "text",
"min_words": 100,
"max_words": 200,
"max_tokens": 512,
"output_name": "cache_data",
"specific_source": "pile_cc",
"n_samples": 1000,
"baselines_only": true,
"blackbox_attacks": [],
"neighborhood_config": {
"model": "bert",
"n_perturbation_list": [
@@ -25,5 +23,6 @@
"cache_dir": "cache_dir"
},
"dump_cache": true,
"load_from_cache": false
"load_from_cache": false,
"load_from_hf": false
}
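Note on the new "load_from_hf" flag introduced by this PR's michael/default_load_hf branch: when true, member/nonmember data is pulled from the Hugging Face Hub instead of the local cache_dir. A minimal sketch of that Hub-side load; the dataset repo id, configuration name, and split names below are illustrative assumptions, not values taken from this diff:

    from datasets import load_dataset

    # Assumed identifiers -- the real ones are defined in the mimir codebase, not in this diff.
    member = load_dataset("iamgroot42/mimir", "pile_cc", split="member")
    nonmember = load_dataset("iamgroot42/mimir", "pile_cc", split="nonmember")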
38 changes: 38 additions & 0 deletions configs/olmo.json
@@ -0,0 +1,38 @@
{
"experiment_name": "neo125_github_experiment",
"base_model": "EleutherAI/gpt-neo-125m",
"dataset_member": "the_pile",
"dataset_nonmember": "the_pile",
"min_words": 100,
"max_words": 200,
"max_tokens": 512,
"max_data": 100000,
"output_name": "unified_mia",
"specific_source": "Github",
"n_samples": 1000,
"blackbox_attacks": ["loss", "ref", "min_k", "zlib"],
"ref_config": {
"models": [
"stabilityai/stablelm-base-alpha-3b-v2"
]
},
"neighborhood_config": {
"model": "bert",
"n_perturbation_list": [
25
],
"pct_words_masked": 0.3,
"span_length": 2,
"dump_cache": false,
"load_from_cache": false,
"neighbor_strategy": "random"
},
"env_config": {
"results": "results_new",
"device": "cuda:0",
"device_aux": "cuda:0"
},
"dump_cache": false,
"load_from_cache": false,
"load_from_hf": true
}
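For reference, the scores behind the attacks listed in "blackbox_attacks" follow the standard membership-inference formulations; the sketch below is illustrative only and is not code from this repository (the function names and signatures are made up):

    import zlib

    def ref_score(target_nll: float, ref_nll: float) -> float:
        # "ref" attack: target-model loss minus the loss of the reference model
        # named in ref_config (here stabilityai/stablelm-base-alpha-3b-v2).
        return target_nll - ref_nll

    def zlib_score(target_nll: float, text: str) -> float:
        # "zlib" attack: target loss normalised by the zlib-compressed length of
        # the text; lower scores suggest membership, as with plain loss.
        return target_nll / len(zlib.compress(text.encode("utf-8")))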
11 changes: 5 additions & 6 deletions data/cache_data.sh
@@ -1,8 +1,8 @@
#!/bin/bash
ngram=7
for date in "arxiv_2020-08" #"arxiv_2021-01" "arxiv_2021-06" "arxiv_2022-06" "arxiv_2023-06" #"full_pile"
for subset in "wikipedia" "s2" #date in "arxiv_2020-08"
do
# echo caching data for $subset
echo caching data for $subset
# python run.py \
# --config configs/cache_data.json \
# --presampled_dataset_member "/gscratch/h2lab/micdun/mimir/data/full_pile/full_pile_10000/train_raw.jsonl" \
@@ -12,10 +12,9 @@ do
# --n_samples 10000
python run.py \
--config configs/cache_data.json \
--presampled_dataset_member "/gscratch/h2lab/micdun/mimir/data/pile_subsets/arxiv/train_raw.jsonl" \
--presampled_dataset_nonmember "/mmfs1/gscratch/h2lab/micdun/mimir/data/temporal_arxiv/${date}/${date}/test_raw.jsonl" \
--specific_source $date \
--n_samples 1000
--presampled_dataset_member "/mmfs1/gscratch/h2lab/micdun/mimir/data/dolma/member/$subset/train_raw.jsonl" \
--presampled_dataset_nonmember "/mmfs1/gscratch/h2lab/micdun/mimir/data/dolma/nonmember/$subset/test_raw.jsonl" \
--specific_source dolma_$subset
done
#"/gscratch/h2lab/micdun/mimir/data/ngram_overlap_thresholded_pile_subsets/truncated+ngram_$ngram/0.0-0.2/$subset/test_raw.jsonl"

2 changes: 1 addition & 1 deletion mimir/data_utils.py
@@ -231,7 +231,7 @@ def load(self, train: bool, mask_tokenizer=None, specific_source: str = None):
new_len = len(mask_tokenizer(x)["input_ids"])
assert new_len <= self.config.max_tokens
new_data.append(x)
data = new_data
data = new_data

# print stats about remainining data
print(f"Total number of samples: {len(data)}")
7 changes: 5 additions & 2 deletions mimir/models.py
Expand Up @@ -14,6 +14,7 @@
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import zlib
from hf_olmo import *

from mimir.config import ExperimentConfig
from mimir.custom_datasets import SEPARATOR
@@ -142,10 +143,12 @@ def load_base_model_and_tokenizer(self, model_kwargs):
# llama is too big, gotta use device map
model = transformers.AutoModelForCausalLM.from_pretrained(self.name, **model_kwargs, device_map="balanced_low_0", cache_dir=self.cache_dir)
self.device = 'cuda:1'
elif "stablelm" in self.name:
# stablelm requires confirmation of running custom code
elif "stablelm" in self.name.lower(): # models requiring custom code
model = transformers.AutoModelForCausalLM.from_pretrained(
self.name, **model_kwargs, trust_remote_code=True, device_map=device_map, cache_dir=self.cache_dir)
elif "olmo" in self.name.lower():
model = transformers.AutoModelForCausalLM.from_pretrained(
self.name, **model_kwargs, trust_remote_code=True, cache_dir=self.cache_dir)
else:
model = transformers.AutoModelForCausalLM.from_pretrained(
self.name, **model_kwargs, device_map=device_map, cache_dir=self.cache_dir)
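The new elif branch above is what enables OLMo: hf_olmo appears to be imported for its side effect of registering the OLMo architecture with transformers' Auto classes, and models whose name contains "olmo" are loaded with trust_remote_code=True and no device_map. A standalone sketch of the same loading path, using the "allenai/OLMo-1B" checkpoint referenced in scripts/run_mia_suite_olmo.sh later in this commit:

    from hf_olmo import *  # side-effect import: registers OLMo with transformers (requires ai2-olmo)
    import transformers

    name = "allenai/OLMo-1B"
    tokenizer = transformers.AutoTokenizer.from_pretrained(name, trust_remote_code=True)
    model = transformers.AutoModelForCausalLM.from_pretrained(name, trust_remote_code=True)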
4 changes: 3 additions & 1 deletion requirements.txt
@@ -14,4 +14,6 @@ simple-parsing
nltk
scipy>=1.11.2
pytest
huggingface_hub
huggingface_hub
ai2-olmo
accelerate
18 changes: 18 additions & 0 deletions scripts/run_mia_suite_olmo.sh
@@ -0,0 +1,18 @@
#!/bin/bash
version=unified_mia_v5_olmo_test_only

for model in "OLMo-1B"
do

for subset in "dolma_wikipedia" "dolma_s2" # "c4"
do
python3.9 run.py \
--experiment_name $version \
--config configs/olmo.json \
--base_model "allenai/$model" \
--specific_source ${subset} \
--load_from_cache true \
--load_from_hf false \
--n_samples 1000
done
done
