<a href="https://colab.research.google.com/github/jared-ni/6.8610-project/blob/main/new_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install datasets
!pip install spacy
!pip install scispacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz
!pip install googletrans==4.0.0-rc1
!pip install deep-translator
!pip install transformers
!pip install torch

Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz
  Using cached https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz (14.8 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [3]:
from datasets import load_dataset
import pandas as pd
import spacy
from deep_translator import GoogleTranslator
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Flags selecting which LLM backends the pipeline uses. Only USE_LLAMA is
# consulted by the pipeline cell in this notebook; USE_MISTRAL and USE_FALCON
# are placeholders for backends that have not been added yet.
USE_LLAMA = True
USE_MISTRAL = False
USE_FALCON = False

# Multiplier applied to the prompt length to obtain the `max_length` budget
# that is handed to the LLM generation call.
MAX_LENGTH_MULTIPLIER = 2

In [4]:
# Dataset loaders: each reads a Hugging Face dataset into pandas and returns its text column as a Series
def load_law_dataset():
    """Return every CaseHOLD citing prompt as one pandas Series.

    Loads the "all" configuration of ``casehold/casehold``, turns the train,
    test and validation splits into DataFrames, stacks them in that order
    under a fresh 0..n-1 index, and keeps only the ``citing_prompt`` column.

    Returns:
        pandas.Series: the ``citing_prompt`` values of all three splits.
    """
    casehold = load_dataset("casehold/casehold", "all")
    split_frames = [
        pd.DataFrame(casehold[split_name])
        for split_name in ("train", "test", "validation")
    ]
    all_cases = pd.concat(split_frames, ignore_index=True)
    return all_cases['citing_prompt']

def load_medical_dataset():
    """Return the ``patient`` column of the PMC-Patients train split.

    Loads ``zhengyun21/PMC-Patients``, converts its ``train`` split to a
    DataFrame and selects the ``patient`` column.

    Returns:
        pandas.Series: the ``patient`` values of the train split.
    """
    pmc_patients = load_dataset("zhengyun21/PMC-Patients")
    return pd.DataFrame(pmc_patients['train'])['patient']

# Combine datasets
def load_all_datasets():
    """Load both corpora and return them as a two-element list.

    Returns:
        list: ``[law_dataset, medical_dataset]`` — the result of
        ``load_law_dataset()`` followed by that of ``load_medical_dataset()``.
    """
    # List-display elements are evaluated left to right, so the law dataset
    # is still loaded before the medical one.
    return [load_law_dataset(), load_medical_dataset()]


In [5]:
# Load SpaCy model
def load_spacy_model(model_path='en_core_sci_sm'):
    """Load a spaCy pipeline by name or path.

    Args:
        model_path: Value forwarded to ``spacy.load``; defaults to
            ``'en_core_sci_sm'``.

    Returns:
        Whatever ``spacy.load(model_path)`` returns (the loaded pipeline).
    """
    nlp = spacy.load(model_path)
    return nlp

# Extract entities from text
def extract_entities(nlp, text):
    """Run ``nlp`` over ``text`` and collect the text of every entity found.

    Args:
        nlp: Callable that takes the text and returns an object exposing an
            iterable ``ents`` attribute whose items have a ``text`` attribute.
        text: The string to analyse.

    Returns:
        list: ``ent.text`` for each item of ``ents``, in order; repeated
        entities are kept.
    """
    entity_texts = []
    for span in nlp(text).ents:
        entity_texts.append(span.text)
    return entity_texts

# Translate entities to a target language
def translate_entities(entities, target_lang):
    """Translate each entity string into ``target_lang``.

    A single ``GoogleTranslator`` (source language auto-detected) is built per
    call and reused for every entity, and each distinct entity string is sent
    to the translator only once, so repeated entities do not trigger repeated
    requests.

    Args:
        entities: Iterable of entity strings to translate.
        target_lang: Target language code passed to ``GoogleTranslator``
            (the pipeline uses ``'zh-CN'`` and ``'fr'``).

    Returns:
        list: One translation per input entity, in input order; duplicates in
        the input yield duplicates in the output. An empty input returns
        ``[]`` without constructing a translator.
    """
    entities = list(entities)
    if not entities:
        return []

    # Building the translator is loop-invariant: do it once, not per entity.
    translator = GoogleTranslator(source='auto', target=target_lang)

    # Per-call memo so each distinct string is translated a single time.
    translated_by_entity = {}
    translations = []
    for entity in entities:
        if entity not in translated_by_entity:
            translated_by_entity[entity] = translator.translate(entity)
        translations.append(translated_by_entity[entity])
    return translations


In [9]:
import os
from getpass import getpass

from huggingface_hub import login

# Hugging Face access token used to authenticate before the Llama model is
# loaded. It is read from the HF_TOKEN environment variable, falling back to
# an interactive prompt, so the secret is never stored in the notebook.
# SECURITY: a token was previously hard-coded in this cell; treat it as leaked
# and revoke it in the Hugging Face account settings.
llama_token = (os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")).strip()
if not llama_token:
    raise ValueError("No Hugging Face token provided; set HF_TOKEN or enter one when prompted.")
login(llama_token)

# Load Llama model
def load_llama_model():
    """Load the tokenizer and causal-LM for ``meta-llama/Llama-2-7b-chat-hf``.

    Both objects are created with ``from_pretrained``; the model is requested
    with ``torch_dtype="auto"`` and ``device_map="auto"``.

    Returns:
        tuple: ``(tokenizer, model)``.
    """
    checkpoint = "meta-llama/Llama-2-7b-chat-hf"
    model_kwargs = {"torch_dtype": "auto", "device_map": "auto"}
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForCausalLM.from_pretrained(checkpoint, **model_kwargs)
    return tokenizer, model

# Generate text with Llama
def llama_generate_text(tokenizer, model, prompt, max_length):
    """Sample a completion for ``prompt`` and return the decoded text.

    Args:
        tokenizer: Tokenizer callable returning a mapping with ``input_ids``
            (and normally ``attention_mask``) that supports ``.to(device)``;
            must also provide ``decode``, ``eos_token_id`` and
            ``pad_token_id``.
        model: Model exposing ``device`` and ``generate``.
        prompt: Prompt string.
        max_length: Forwarded unchanged as ``max_length`` to
            ``model.generate``.

    Returns:
        str: The first sequence returned by ``generate``, decoded with special
        tokens skipped. NOTE(review): for decoder-only models the returned
        sequence is expected to start with the prompt itself — confirm callers
        want the prompt echoed.
    """
    # Move the inputs to wherever the model actually lives instead of assuming
    # CUDA: the model is loaded with device_map="auto", so it may be on CPU.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Fall back to EOS for padding when the tokenizer defines no pad token, so
    # generate() receives an explicit pad id.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    outputs = model.generate(
        inputs["input_ids"],
        # Pass the tokenizer's mask through rather than leaving it to be inferred.
        attention_mask=inputs.get("attention_mask"),
        max_length=max_length,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=pad_token_id,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        do_sample=True
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Add similar functions for Mistral and Falcon if flags are enabled


In [10]:
def run_pipeline(num_entries=10):
    """Run entity extraction, translation and (optionally) Llama prompting.

    Loads every dataset, the spaCy pipeline and — when ``USE_LLAMA`` is set —
    the Llama tokenizer and model. Then, for the leading entries of each
    dataset, prints the original text, the extracted entities, their Chinese
    and French translations and, if enabled, the Llama response to a
    translation prompt.

    Args:
        num_entries: How many leading entries of each dataset to process.
            Defaults to 10 (a small sample for testing); ``None`` processes
            every entry. Must not be negative.

    Returns:
        None. All results are printed.
    """
    # Local import keeps this cell self-contained; islice takes the first
    # `num_entries` items of any iterable without copying it.
    from itertools import islice

    # Load datasets
    datasets = load_all_datasets()

    # Load NLP model
    nlp = load_spacy_model()

    # Load LLMs (skipped entirely when Llama is disabled)
    llama_tokenizer, llama_model = load_llama_model() if USE_LLAMA else (None, None)

    for dataset in datasets:
        # Entry numbering restarts at 1 for each dataset.
        for entry_number, text in enumerate(islice(dataset, num_entries), start=1):
            print(f"\nProcessing Entry {entry_number}")
            print(f"Original Text: {text}")

            # Extract entities
            entities = extract_entities(nlp, text)
            print("Extracted Entities:", entities)

            # Translate entities
            chinese_translations = translate_entities(entities, 'zh-CN')
            french_translations = translate_entities(entities, 'fr')
            print("Chinese Translations:", chinese_translations)
            print("French Translations:", french_translations)

            # Generate text with Llama
            if USE_LLAMA:
                prompt = f"Translate the following entities to Chinese: {entities}"
                # NOTE(review): len(prompt) counts characters, whereas the
                # value is passed on as the generation max_length — confirm
                # the intended unit.
                max_length = len(prompt) * MAX_LENGTH_MULTIPLIER
                llama_response = llama_generate_text(llama_tokenizer, llama_model, prompt, max_length)
                print("Llama Response:", llama_response)

            # Add similar blocks for Mistral and Falcon if needed

            print("-" * 40)


In [11]:
# Execute the end-to-end pipeline; it prints its progress and results per entry.
run_pipeline()



tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]


Processing Entry 1
Original Text: Drapeau’s cohorts, the cohort would be a “victim” of making the bomb. Further, firebombs are inherently dangerous. There is no peaceful purpose for making a bomb. Felony offenses that involve explosives qualify as “violent crimes” for purposes of enhancing the sentences of career offenders. See 18 U.S.C. § 924(e)(2)(B)(ii) (defining a “violent felony” as: “any crime punishable by imprisonment for a term exceeding one year ... that ... involves use of explosives”). Courts have found possession of a'bomb to be a crime of violence based on the lack of a nonviolent purpose for a bomb and the fact that, by its very nature, there is a substantial risk that the bomb would be used against the person or property of another. See United States v. Newman, 125 F.3d 863 (10th Cir.1997) (unpublished) (<HOLDING>); United States v. Dodge, 846 F.Supp. 181,
Extracted Entities: ['Drapeau’s cohorts', 'cohort', 'victim', 'bomb', 'firebombs', 'bomb', 'Felony', 'violent crim

KeyboardInterrupt: 