In [28]:
# from huggingface_hub import hf_hub_download 
import torch
from datasets import load_dataset, load_from_disk, Dataset
import evaluate
import transformers
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM
from transformers import MistralForCausalLM
from transformers import TrainingArguments, Trainer
from peft import LoraConfig, TaskType, get_peft_model
from peft import PeftConfig, PeftModel
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate
from huggingface_hub import notebook_login
from accelerate import init_empty_weights, load_checkpoint_and_dispatch, \
                        infer_auto_device_map, dispatch_model, load_checkpoint_in_model

import pandas as pd
import sqlite3
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import re
import os
import glob

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import string

In [2]:
try:
    conn = sqlite3.connect("../data/PeaTMOSS.db")    
except Exception as e:
    print(e)

In [3]:
#Now in order to read in pandas dataframe we need to know table name
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
table_name_list = cursor.fetchall()
table_name = []
for name in table_name_list:
    table_name.append(name[0])

In [4]:
query = """
WITH model_tb AS (
    SELECT * FROM model 
    WHERE LOWER(repo_url) LIKE '%huggingface%'
),

paper_tb AS (
    SELECT mtp.model_id, p.title AS paper_title
    FROM model_to_paper mtp
    LEFT JOIN paper p
    ON mtp.paper_id = p.id    
),

model_task_tb AS (
    SELECT mtmt.model_id, mt.name AS model_task
    FROM model_to_model_task mtmt
    LEFT JOIN model_task mt
    ON mtmt.model_task_id = mt.id
)

SELECT m.context_id, m.downloads, 
lb.value AS limitation, 
d.title AS discussion_title,
hp.value AS hyperparam, 
em.test AS test_metric, em.result AS test_result,
p.paper_title,
mt.model_task
FROM model_tb m
LEFT JOIN limitation_and_bias lb
ON m.id = lb.model_id
LEFT JOIN discussion d
ON m.id = d.model_id
LEFT JOIN hyper_parameters hp
ON m.id = hp.model_id
LEFT JOIN evaluation_metric em
ON m.id = em.model_id
LEFT JOIN paper_tb p
ON m.id = p.model_id
LEFT JOIN model_task_tb mt
ON m.id = mt.model_id
"""
df_model_discussion = pd.read_sql_query(query, conn)
df_model_discussion = df_model_discussion.iloc[6:]

In [5]:
pd.read_sql_query("SELECT * FROM evaluation_metric LIMIT 100", conn)

Unnamed: 0,id,test,dataset_id,result,model_id
0,1,SQUAD 1.1 F1/EM,10875,91.0/84.3,118085
1,2,Multi NLI Accuracy,10876,86.05,118085
2,3,Average,10877,82.3,105564
3,4,SQuAD1.1,6493,90.2/83.2,105564
4,5,SQuAD2.0,7473,82.1/79.3,105564
...,...,...,...,...,...
95,96,recall,10906,0.946809,37006
96,97,f1,10906,0.941799,37006
97,98,precision,10907,0.959818,37006
98,99,recall,10907,0.957278,37006


In [6]:
print(transformers.__version__)
print(torch.__version__)
print(torch.cuda.is_available())

4.35.2
2.3.0+cu118
True


In [7]:
! nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:41:10_Pacific_Daylight_Time_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [32]:
llm_repo_dir = 'D:/projects/LLM'
cache_dir = '/cygdrive/d/projects/LLM/.cache'
os.environ['TRANSFORMERS_CACHE'] = cache_dir
os.environ['HF_HOME'] = cache_dir + '/huggingface'
os.environ['XDG_CACHE_HOME'] = cache_dir
os.environ['HF_DATASETS_CACHE'] = cache_dir

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [33]:
# Mistral 7B
model_name = 'mistralai/Mistral-7B-v0.1'
model_path = cache_dir + '/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24'
offload_folder = cache_dir + '/models--mistralai--Mistral-7B-v0.1/offload_folder'

In [10]:
%%time

mistral_tokenizer = AutoTokenizer.from_pretrained(model_path, 
                                          cache_dir=cache_dir, 
                                          local_flies_only=True,
                                          padding_side="left")
mistral_tokenizer.pad_token = mistral_tokenizer.eos_token 

# load empty model to save memory
with init_empty_weights():
    mistral_model = AutoModelForCausalLM.from_pretrained(model_path, 
                                             cache_dir=cache_dir,
                                             local_files_only=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



CPU times: total: 5.28 s
Wall time: 5.95 s




In [11]:
# 0: cuda. cpu: RAM
device_map = infer_auto_device_map(
    mistral_model, 
    max_memory={0: "6GB", 'cpu': "30GB"},
    no_split_module_classes=["OPTDecoderLayer"], 
    dtype='float16'
)

device_map



OrderedDict([('model.embed_tokens', 0),
             ('model.layers.0', 0),
             ('model.layers.1', 0),
             ('model.layers.2', 0),
             ('model.layers.3', 0),
             ('model.layers.4', 0),
             ('model.layers.5', 0),
             ('model.layers.6', 0),
             ('model.layers.7', 0),
             ('model.layers.8', 0),
             ('model.layers.9', 0),
             ('model.layers.10', 0),
             ('model.layers.11', 0),
             ('model.layers.12.self_attn.q_proj', 0),
             ('model.layers.12.self_attn.k_proj', 'cpu'),
             ('model.layers.12.self_attn.v_proj', 'cpu'),
             ('model.layers.12.self_attn.o_proj', 'cpu'),
             ('model.layers.12.self_attn.rotary_emb', 'cpu'),
             ('model.layers.12.mlp', 'cpu'),
             ('model.layers.12.input_layernorm', 'cpu'),
             ('model.layers.12.post_attention_layernorm', 'cpu'),
             ('model.layers.13', 'cpu'),
             ('model.layers

In [12]:
mistral_model = load_checkpoint_and_dispatch(mistral_model, 
                         model_path,
                         device_map=device_map,
                         dtype='float16',
                         offload_folder=offload_folder,
                         offload_state_dict=True)

  0%|          | 0/203 [00:00<?, ?w/s]

  0%|          | 0/88 [00:00<?, ?w/s]



In [17]:
df_model_limitation = df_model_discussion[~df_model_discussion['limitation'].isnull()]\
                        [['context_id','limitation','model_task']].drop_duplicates()
df_model_limitation

Unnamed: 0,context_id,limitation,model_task
76,0Tick/e621TagAutocomplete,Since DistilGPT2 is a distilled version of GPT...,text-generation
174,0xDEADBEA7/DialoGPT-small-rick,The model may reflect biases present in the da...,conversational
175,0xDEADBEA7/DialoGPT-small-rick,The model may reflect biases present in the da...,text-generation
267,1-800-BAD-CODE/punctuation_fullstop_truecase_e...,"This model was trained on news data, and may n...",text2text-generation
269,1-800-BAD-CODE/sentence_boundary_detection_mul...,"This model was trained on `OpenSubtitles`, dat...",token-classification
...,...,...,...
323742,zayedupal/movie-genre-prediction_distilbert-ba...,More information needed,text-classification
323901,zekun-li/geolm-base-toponym-recognition,Significant research has explored bias and fai...,token-classification
323991,zenham/khemx_m_e4_16h,The model may inherit biases from the training...,conversational
323992,zenham/khemx_m_e4_16h,The model may inherit biases from the training...,text-generation


In [18]:
df_model_limitation['limitation'].iloc[0]

'Since DistilGPT2 is a distilled version of GPT-2, it is intended to be used for similar use cases with the increased functionality of being smaller and easier to run than the base model. The developers of GPT-2 state in their model card that they envisioned GPT-2 would be used by researchers to better understand large-scale generative language models, with possible secondary use cases including writing assistance, creative writing and art, and entertainment. However, because large-scale language models like GPT-2 do not distinguish fact from fiction, they are not recommended for use-cases that require the generated text to be true. Additionally, language models like GPT-2 reflect the biases inherent to the systems they were trained on, so they should not be deployed into systems that interact with humans unless the deployers first carry out a study of biases relevant to the intended use-case.'

In [19]:
comment_id = 0
task = "I want to use a prompt of 10000 words"

prompt = """
Will the following description cause an issue if {0}?
Description: {1} The model can only take input less than 11000 tokens.
Response (Yes/No/insufficient information):
""".format(task.strip(), df_model_limitation['limitation'].iloc[comment_id].strip())

print(prompt)


Will the following description cause an issue if I want to use a prompt of 10000 words?
Description: Since DistilGPT2 is a distilled version of GPT-2, it is intended to be used for similar use cases with the increased functionality of being smaller and easier to run than the base model. The developers of GPT-2 state in their model card that they envisioned GPT-2 would be used by researchers to better understand large-scale generative language models, with possible secondary use cases including writing assistance, creative writing and art, and entertainment. However, because large-scale language models like GPT-2 do not distinguish fact from fiction, they are not recommended for use-cases that require the generated text to be true. Additionally, language models like GPT-2 reflect the biases inherent to the systems they were trained on, so they should not be deployed into systems that interact with humans unless the deployers first carry out a study of biases relevant to the intended us

In [20]:
%%time

input_ids = mistral_tokenizer([prompt], return_tensors="pt").input_ids.to(device)
with torch.no_grad():
    generated_ids = mistral_model.generate(input_ids, 
                                      do_sample=True,
                                      temperature=0.1,
                                      top_k=3,
                                      top_p=3,
                                      max_new_tokens=2)
    generated_text = mistral_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



Will the following description cause an issue if I want to use a prompt of 10000 words?
Description: Since DistilGPT2 is a distilled version of GPT-2, it is intended to be used for similar use cases with the increased functionality of being smaller and easier to run than the base model. The developers of GPT-2 state in their model card that they envisioned GPT-2 would be used by researchers to better understand large-scale generative language models, with possible secondary use cases including writing assistance, creative writing and art, and entertainment. However, because large-scale language models like GPT-2 do not distinguish fact from fiction, they are not recommended for use-cases that require the generated text to be true. Additionally, language models like GPT-2 reflect the biases inherent to the systems they were trained on, so they should not be deployed into systems that interact with humans unless the deployers first carry out a study of biases relevant to the intended us

In [21]:
comment_id = 0
task = "I want to use a prompt of 10000 words"

prompt = """
Will the following description cause an issue if {0}?
Description: {1} The model can only take input less than 8000 tokens.
Response (Yes/No/insufficient information):
""".format(task.strip(), df_model_limitation['limitation'].iloc[comment_id].strip())

print(prompt)


Will the following description cause an issue if I want to use a prompt of 10000 words?
Description: Since DistilGPT2 is a distilled version of GPT-2, it is intended to be used for similar use cases with the increased functionality of being smaller and easier to run than the base model. The developers of GPT-2 state in their model card that they envisioned GPT-2 would be used by researchers to better understand large-scale generative language models, with possible secondary use cases including writing assistance, creative writing and art, and entertainment. However, because large-scale language models like GPT-2 do not distinguish fact from fiction, they are not recommended for use-cases that require the generated text to be true. Additionally, language models like GPT-2 reflect the biases inherent to the systems they were trained on, so they should not be deployed into systems that interact with humans unless the deployers first carry out a study of biases relevant to the intended us

In [22]:
%%time

input_ids = mistral_tokenizer([prompt], return_tensors="pt").input_ids.to(device)
with torch.no_grad():
    generated_ids = mistral_model.generate(input_ids, 
                                      do_sample=True,
                                      temperature=0.1,
                                      top_k=3,
                                      top_p=3,
                                      max_new_tokens=2)
    generated_text = mistral_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



Will the following description cause an issue if I want to use a prompt of 10000 words?
Description: Since DistilGPT2 is a distilled version of GPT-2, it is intended to be used for similar use cases with the increased functionality of being smaller and easier to run than the base model. The developers of GPT-2 state in their model card that they envisioned GPT-2 would be used by researchers to better understand large-scale generative language models, with possible secondary use cases including writing assistance, creative writing and art, and entertainment. However, because large-scale language models like GPT-2 do not distinguish fact from fiction, they are not recommended for use-cases that require the generated text to be true. Additionally, language models like GPT-2 reflect the biases inherent to the systems they were trained on, so they should not be deployed into systems that interact with humans unless the deployers first carry out a study of biases relevant to the intended us

In [44]:
df_model_discussion[~df_model_discussion['test_result'].isnull()]\
[['context_id','test_metric','test_result']].drop_duplicates().iloc[:50]

Unnamed: 0,context_id,test_metric,test_result
12,0-hero/led-large-legal-summary,Gen Len,27.633
13,0-hero/led-large-legal-summary,Loss,2.098
14,0-hero/led-large-legal-summary,Rouge1,36.855
15,0-hero/led-large-legal-summary,Rouge2,22.050
16,0-hero/led-large-legal-summary,RougeL,33.547
17,0-hero/led-large-legal-summary,RougeLsum,34.607
63,09panesara/distilbert-base-uncased-finetuned-cola,Matthews Correlation,0.5406
161,0x7194633/roberta-base-spam-detector,eval_accuracy,0.9979
162,0x7194633/roberta-base-spam-detector,eval_f1,0.9980
163,0x7194633/roberta-base-spam-detector,eval_loss,0.0211


In [40]:
df_model_discussion

Unnamed: 0,context_id,downloads,limitation,discussion_title,hyperparam,test_metric,test_result,paper_title,model_task
6,0-hero/flan-OIG-base,1.0,,Adding `safetensors` variant of this model,,,,,
7,0-hero/flan-OIG-base,1.0,,Model,,,,,
8,0-hero/flan-OIG-small,1.0,,Adding `safetensors` variant of this model,,,,,
9,0-hero/flan-OIG-ul2,1.0,,,,,,,
10,0-hero/flan-OIG-xl,1.0,,,,,,,
...,...,...,...,...,...,...,...,...,...
325858,zzzzzy/ttttp,0.0,,,,,,,
325859,zzzzzz1q/z,0.0,,,,,,,
325860,zzzzzzttt/swin-tiny-patch4-window7-224-finetun...,8.0,,Adding `safetensors` variant of this model,,,,,
325861,zzzzzzttt/vit-base-patch16-224-finetuned-eurosat,8.0,,,,,,,


In [24]:
df_toy = df_model_discussion[~df_model_discussion['limitation'].isnull()][['context_id','limitation']]
df_toy

Unnamed: 0,context_id,limitation
76,0Tick/e621TagAutocomplete,Since DistilGPT2 is a distilled version of GPT...
77,0Tick/e621TagAutocomplete,Since DistilGPT2 is a distilled version of GPT...
174,0xDEADBEA7/DialoGPT-small-rick,The model may reflect biases present in the da...
175,0xDEADBEA7/DialoGPT-small-rick,The model may reflect biases present in the da...
267,1-800-BAD-CODE/punctuation_fullstop_truecase_e...,"This model was trained on news data, and may n..."
...,...,...
323742,zayedupal/movie-genre-prediction_distilbert-ba...,More information needed
323901,zekun-li/geolm-base-toponym-recognition,Significant research has explored bias and fai...
323991,zenham/khemx_m_e4_16h,The model may inherit biases from the training...
323992,zenham/khemx_m_e4_16h,The model may inherit biases from the training...
