In [36]:
import pandas as pd
import re

from transformers import pipeline, LlamaForCausalLM
from accelerate import Accelerator
import torch
import datasets

from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain
from langchain.chains.base import Chain
from langchain import PromptTemplate
from langchain.output_parsers.regex_dict import RegexDictParser

In [2]:
# Directory holding the local Vicuna-7B checkpoint.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
model_location = '/home/jovyan/project-archive/vicuna-7b'

# Load the checkpoint with 8-bit weights and float16 as the torch dtype.
# The device_map sends the whole model (the '' key) to the device index that
# Accelerator reports for this local process.
model = LlamaForCausalLM.from_pretrained(
    model_location,
    max_length=4096,
    device_map={'': Accelerator().local_process_index},
    torch_dtype=torch.float16,
    load_in_8bit=True,
)


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/jovyan/conda_envs/peft/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /home/jovyan/conda_envs/peft/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /home/jovyan/conda_envs/peft/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [40]:
# Text-generation pipeline built around the already-loaded `model`.
#
# - The tokenizer is loaded from the same checkpoint directory, using the
#   slow (non-fast) implementation.
# - `do_sample=True` is set explicitly: `temperature` and `top_p` only affect
#   decoding when sampling is enabled; without it they are ignored.
# - No `model_kwargs` are passed: `model` is an instantiated object, so the
#   pipeline does not load it again and load-time options such as
#   `load_in_8bit` have nothing to apply to.
# - `max_length` caps the combined length of prompt and generated tokens.
pipe = pipeline(
    task='text-generation',
    model=model,
    tokenizer=model_location,
    use_fast=False,
    max_length=4096,
    do_sample=True,
    temperature=0.9,
    top_p=0.95,
    repetition_penalty=1.1,
)

In [41]:
# Wrap the transformers pipeline so LangChain chains can use it as their LLM.
llm = HuggingFacePipeline(pipeline=pipe)

In [42]:
# Load the question/answer table (presumably ChatGPT-generated, going by the
# file name); the first CSV column becomes the DataFrame index.
df = pd.read_csv('../data/aqag-chatgpt.csv', index_col=0)
# Show the available column names as the cell output.
df.columns

Index(['index', 'module', 'chapter', 'section', 'subsection', 'heading',
       'raw_text', 'clean_text', 'slug', 'question', 'correct_answer',
       'incorrect_answer', 'type'],
      dtype='object')

In [43]:
# Prompt text: task instructions followed by three labelled sections.
# `{source}` and `{question}` are the placeholders filled in per example;
# the text ends with an open "Response:" section.
correct_response_template = ''.join([
    'The following is a passage from a macroeconomics textbook. Use the passage to generate a correct response to the question. ',
    'The response should fully and directly address the question with no conceptual or factual errors. ',
    'The response should be written in the voice of a student who has carefully read and understood the passage. ',
    'The response should be written in 1-2 complete sentences.\n\n',
    'Passage:\n{source}\n\n',
    'Question:\n{question}\n\n',
    'Response:\n',
])

# Template object declaring the two variables the prompt expects.
correct_response_prompt = PromptTemplate(
    template=correct_response_template,
    input_variables=['source', 'question'],
)

# Chain that formats the prompt and sends it to the wrapped Vicuna pipeline.
correct_response_chain = LLMChain(prompt=correct_response_prompt, llm=llm)

In [44]:
def correct_response(example):
    """Generate a model answer for one row of the question table.

    Args:
        example: Mapping-like row that provides 'raw_text' (used as the
            passage) and 'question'.

    Returns:
        dict with a single key, 'correct_answer_vicuna', holding the text
        returned by ``correct_response_chain.run``. The dict form is what
        ``Dataset.map`` needs in order to add the value as a new column.
    """
    answer = correct_response_chain.run(source=example['raw_text'], question=example['question'])
    return {'correct_answer_vicuna': answer}


# Smoke-test on a single row before mapping over the full table.
# `example` is bound explicitly here so the cell runs on a fresh kernel.
example = df.iloc[0]
correct_response(example)

{'correct_answer_vicuna': 'Scarcity refers to the limited availability of resources, including labor, tools, land, and raw materials, which are necessary to produce goods and services but exist in limited supply.'}

In [46]:
# Run the answer generator over every row and bring the result back to pandas.
#
# preserve_index=False keeps the pandas index out of the Arrow table, so no
# auto-generated '__index_level_0__' column appears and nothing has to be
# dropped by name afterwards (a by-name drop raises KeyError whenever that
# column was never created, e.g. for a default RangeIndex).
ds = datasets.Dataset.from_pandas(df, preserve_index=False)
ds1 = ds.map(correct_response)  # one chain call per row; adds the returned dict's keys as columns
df1 = ds1.to_pandas()

Map:   0%|          | 0/1569 [00:00<?, ? examples/s]

In [48]:
# Persist the table including the generated answers. The index is written as
# the first CSV column (to_csv default).
df1.to_csv('../data/aqag-chatgpt-vicuna.csv')

# Sample the questions for evaluation

In [23]:
import pandas as pd

# Reload the table with the generated answers (first CSV column is the index).
df = pd.read_csv('../data/aqag-chatgpt-vicuna.csv', index_col=0)

# Build the evaluation table:
#   1. id = "<index>-<type>", computed column-wise so a missing `type` yields
#      NaN (removed by dropna below) instead of raising inside a row-wise apply.
#   2. keep only the columns needed for evaluation, in this order;
#   3. drop rows with any missing value in those columns;
#   4. rename by explicit mapping so each new name is tied to its source
#      column rather than to its position in the selection list.
df = (
    df.assign(id=df['index'].astype(str) + '-' + df['type'])
    [['id', 'slug', 'clean_text', 'type', 'question', 'correct_answer', 'correct_answer_vicuna', 'incorrect_answer']]
    .dropna()
    .rename(columns={
        'question': 'question_gpt',
        'correct_answer': 'correct_answer_gpt',
        'incorrect_answer': 'incorrect_answer_gpt',
    })
)

In [26]:
# Draw a fixed-size random sample of rows for each question type.
SAMPLES_PER_TYPE = 20
SAMPLE_SEED = 42  # fixed so re-running the cell selects the same rows

# One sampled frame per distinct `type` value, in order of first appearance.
# DataFrame.sample raises ValueError if a type has fewer than
# SAMPLES_PER_TYPE rows (sampling is without replacement).
df_types = [
    df[df['type'] == question_type].sample(SAMPLES_PER_TYPE, random_state=SAMPLE_SEED)
    for question_type in df['type'].drop_duplicates()
]
sampled_df = pd.concat(df_types)
print(len(sampled_df))

60


In [28]:
# Save the sampled rows; index=False leaves the row index out of the file.
sampled_df.to_csv('../data/AQAG_chatgpt_vicuna_sample.csv', index=False)