# Package installation & data loading


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go

In [None]:
# Read the portfolio CSV into `df`, decoding it as windows-1252.
# NOTE(review): 'Methdos' is spelled this way in the original path; confirm it
# matches the file name on disk before "correcting" it.
portfolio_csv = '/content/drive/My Drive/coding/data/Methdos_portfolio_data_trim.csv'
df = pd.read_csv(portfolio_csv, encoding='windows-1252')

# Data cleaning

In [None]:
# Break 'Cycle' on single spaces and keep the 2nd and 3rd pieces as the cycle
# number and the year. Both are still strings here.
cycle_parts = df['Cycle'].str.split(' ', expand=True)[[1, 2]]
df['Cycle Number'] = cycle_parts[1]
df['Year'] = cycle_parts[2]

In [None]:
# Break 'Project Owner' on single spaces; the 1st piece becomes the first name
# and the 2nd piece the last name.
# NOTE(review): for owners with more than two space-separated tokens, any token
# after the second is dropped here — confirm that is intended.
owner_tokens = df['Project Owner'].str.split(' ', expand=True)[[0, 1]]
df['First name'] = owner_tokens[0]
df['Last name'] = owner_tokens[1]

In [None]:
# Break 'Programmatic Priority' on ';' and keep the first two pieces as the
# primary and secondary areas.
priority_parts = df['Programmatic Priority'].str.split(';', expand=True)[[0, 1]]
df['Primary area'] = priority_parts[0]
df['Secondary area'] = priority_parts[1]

In [None]:
# Turn the textual year into a number; entries that cannot be parsed are
# coerced to NaN rather than raising.
year_as_number = pd.to_numeric(df['Year'], errors='coerce')
df['Year'] = year_as_number

In [None]:
# Use the shorter label 'Project Title' from here on.
# Reassigning the result (instead of inplace=True) keeps the cell re-runnable
# and chain-friendly; the frame contents are the same either way.
df = df.rename(columns={'Short Project Title': 'Project Title'})

In [None]:
# Columns whose text is fed to the keyword extractor. Each value is rewritten
# as "<column name>: <value>" so the model can tell the sections apart.
selected_columns = ['Project Title', 'Technical Abstract', 'Methods used']

for col_name in selected_columns:
    label = f"{col_name}: "
    as_text = df[col_name].astype(str)
    # Only add the label where it is missing, so re-running this cell does not
    # stack it ("Project Title: Project Title: ...").
    already_labelled = as_text.str.startswith(label)
    df[col_name] = as_text.where(already_labelled, label + as_text)

In [None]:
# Build 'Project Content': for every row, the stringified values of the
# selected columns joined with single spaces, in `selected_columns` order.
content_columns = [df[col].map(str) for col in selected_columns]
df['Project Content'] = [' '.join(parts) for parts in zip(*content_columns)]

# Apply KeyBERT/KeyLLM for keyword extraction

In [None]:
!pip install keybert
!pip install transformers
!pip install bitsandbytes
!pip install accelerate

Collecting keybert
  Downloading keybert-0.8.3.tar.gz (29 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.3.8 (from keybert)
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence-transformers>=0.3.8->keybert)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: keybert, sentence-transformers
  Building wheel for keybert (setup.py) ... [?25l[?25hdone
  Created wheel for keybert: filename=keybert-0.8.3-py3-none-any.whl size=39126 sha256=ae4dd07e0430a56d1de810ccd01e0a83e063f3ae47e71b7afe3543f5b27ff014
  Stored in direct

In [None]:
import bitsandbytes
from torch import cuda, bfloat16
import transformers
from huggingface_hub.hf_api import HfFolder
import accelerate

from keybert.llm import TextGeneration
from keybert import KeyLLM, KeyBERT

In [None]:
# Quantisation settings for loading Llama 2 in 4 bits, which lowers the GPU
# memory needed to hold the model.
bnb_config = transformers.BitsAndBytesConfig(
    bnb_4bit_compute_dtype=bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

In [None]:
import os
from getpass import getpass

# Hub identifier of the checkpoint loaded in the next cell.
model_id = 'meta-llama/Llama-2-7b-chat-hf'

# Never store an access token in the notebook itself: take it from the
# HF_TOKEN environment variable, or ask for it interactively (input hidden).
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
HfFolder.save_token(hf_token)

In [None]:
# Tokenizer and 4-bit quantised model for `model_id`.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

# trust_remote_code is intentionally not set: the checkpoint loads as the
# library's own LlamaForCausalLM, so there is no need to allow code shipped
# with the repository to run.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map='auto',
)

# Switch to inference mode; the trailing ';' keeps the full module repr out of
# the cell output.
model.eval();

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm

In [None]:
# Text-generation pipeline built from the model and tokenizer loaded above.
# NOTE(review): temperature is passed without do_sample; confirm whether it has
# any effect under the pipeline's default decoding settings.
generator = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=500,
    repetition_penalty=1.1,
    temperature=0.1,
)

In [None]:
# Prompt template passed to TextGeneration below.
# Layout: a <<SYS>> instruction block, one worked example (a project-description
# skeleton followed by an example comma-separated keyword list), then the real
# request, which contains the literal placeholder [DOCUMENT].
# NOTE(review): presumably TextGeneration substitutes each input text for
# [DOCUMENT] — confirm against the keybert.llm documentation.
# NOTE(review): the wording inside the string (e.g. "Make sure you to only
# return", and the first "For example, don't say:" sentence) is left untouched
# on purpose; editing it changes what the model receives.
prompt = """
<s>[INST] <<SYS>>

You are a helpful assistant specialized in extracting comma-separated keywords.
Please identify important topics, for example clinical condition, research objective, analytical methods used, research gap addressed from
the description of a research proposal. Please read and comprehend the description, and identify suitable terms to annotate that proposal, so
that by looking at just the topical terms, reader can get a general sense of what this proposal is about.

<</SYS>>
I have the following project description:
Project Title: [input project title here].
[input project abstract here]
Our Specific Aims are to:[input specific aims here].
Methods used: [input methods used here]

Please give me the keywords that are present in this document and separate them with commas.
The keywords can be more than one word. For example, don't say:
"Here are the keywords present in the document"
[/INST] observational comparative effectiveness research, longitudinal phenotypes extraction, real-world data, electronic health records, confounder ascertainment, patient-reported outcomes, machine learning,
generalizability, natural language processing, propensity scores, cancer research [INST]

I have the following document:
- [DOCUMENT]

Please give me the keywords that are present in this document and separate them with commas.
Make sure you to only return the keywords and say nothing else. For example, don't say:
"Here are the keywords present in the document"
[/INST]
"""

# Wrap the generation pipeline and the prompt in KeyBERT's TextGeneration
# object, then hand that object to KeyLLM. `llm` is reused by a later cell.
llm = TextGeneration(generator, prompt=prompt)
kw_model = KeyLLM(llm)
# Alternative kept for reference: build a KeyBERT model around the same llm.
#kw_model = KeyBERT(llm=llm)

In [None]:
# Let's look at one example output.
# NOTE(review): `text_51` was never assigned anywhere in the notebook, so this
# cell failed with a NameError on a fresh kernel. It is presumably the
# 'Project Content' of the row at position 51 — confirm, and adjust the index
# if a different project was meant.
text_51 = df['Project Content'].iloc[51]
keywords = kw_model.extract_keywords(text_51, check_vocab=True)
keywords

[['patient linkage',
  'de-identified linkage',
  'probabilistic linkage',
  'key-based deterministic linkage',
  'frequency-based weight scaling',
  'set-based similarity measures',
  'dependency relationships',
  'accommodations for missing data',
  'adding more linkage fields',
  'standardizing data',
  'improving data quality',
  'sensitivity',
  'specificity',
  'PPV',
  'AUC',
  'latent class model',
  'marginal logistic regression']]

In [None]:
# Run the KeyLLM extractor once per 'Project Content' value, in row order.
# Each call's return value is stored as-is, one list entry per row of df.
extracted_keywords = [
    kw_model.extract_keywords(project_text, check_vocab=True)
    for project_text in df['Project Content']
]

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (4096). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


# Test KeyBERT

In [None]:
# KeyBERT model that is handed the same `llm` object built for KeyLLM.
kw_model_2 = KeyBERT(llm=llm)

# One extract_keywords call per 'Project Content' value of subset_df, in row
# order, using 1- and 2-word keyphrases and no stop-word list.
# NOTE(review): subset_df is not created anywhere in the visible notebook;
# confirm where it comes from so this cell survives Restart & Run All.
extracted_keywords_1 = [
    kw_model_2.extract_keywords(project_text, keyphrase_ngram_range=(1, 2), stop_words=None)
    for project_text in subset_df['Project Content']
]

In [None]:
# Take an explicit copy before adding the column: the recorded
# SettingWithCopyWarning shows subset_df was a slice of another DataFrame, so
# the write could otherwise land on a temporary object.
subset_df = subset_df.copy()
subset_df['Extracted Keywords_KeyLLM'] = extracted_keywords_1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_df['Extracted Keywords_KeyLLM'] = extracted_keywords_1
