In [1]:
!pip install transformers torch einops accelerate langchain

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Collecting einops
  Downloading einops-0.6.1-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain
  Downloading langchain-0.0.242-py3-none-any.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8

In [1]:
# Attach Google Drive to the Colab filesystem; every data path used later in
# this notebook lives underneath this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch

# Hub identifier shared by the tokenizer and the generation pipeline.
model_id = "tiiuae/falcon-7b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# The pipeline receives the hub id (not a model object), so it loads the
# checkpoint itself; no separate AutoModelForCausalLM.from_pretrained call is made.
# trust_remote_code=True is passed so the repository's own model code may run.
pipeline = transformers.pipeline(
    task="text-generation",
    model=model_id,
    tokenizer=tokenizer,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [46]:
import pandas as pd
import sqlite3
import pickle

# Load every row of the `metaculus` table from the SQLite file on Drive.
conn = sqlite3.connect('/content/drive/MyDrive/metaculus/metaculus.db')
try:
    data = pd.read_sql('select * from metaculus', conn)
finally:
    # read_sql has already materialised the frame, so release the database
    # handle now instead of leaving it open for the lifetime of the kernel.
    conn.close()

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# keep this only if the pickle is known to come from a trusted source.
with open("/content/drive/MyDrive/metaculus/metaculus_links.pkl", "rb") as f:
    q_data = pickle.load(f)

# Second table, read from CSV; its first column is used as the index.
data2 = pd.read_csv('/content/drive/MyDrive/metaculus/metaculus_sep2021.csv', index_col=0)

In [52]:
# Derive an integer `id` column from each URL: the first run of 3-5 digits that
# directly follows a slash. The pattern is a raw string (the previous non-raw
# literal relied on the invalid escapes '\/' and '\d'), and only the digits are
# captured, so no separate strip of the leading slash is needed.
# Rows whose URL has no such segment are dropped before the integer cast.
# The whole step is one chained expression, so nothing is assigned onto a
# filtered frame.
# NOTE(review): an id longer than 5 digits would be truncated to its first 5
# digits by `{3,5}` - confirm that ids never exceed 5 digits.
data = (
    data
    .assign(id=data.url.str.extract(r'/(\d{3,5})', expand=False))
    .dropna(subset=['id'])
    .astype({'id': int})
)

# Rename the second table's `title` so it does not collide with `data.title`
# in the merged frame.
data2 = data2.rename(columns={"title": 'title2'})

# Keep only the ids present in both tables.
df = pd.merge(
    data,
    data2,
    how="inner",
    on="id"
)

In [94]:
from langchain.prompts import PromptTemplate
from langchain import HuggingFaceHub
from langchain.chains import SequentialChain, LLMChain

# Read the Hugging Face API token from Drive rather than hard-coding it.
# strip() removes the trailing newline (or other surrounding whitespace) that
# text files usually carry; left in place it would become part of the token.
with open("/content/drive/MyDrive/metaculus/assets/HF_API_TOKEN.txt", "r") as f:
    hf_token = f.read().strip()

# Hosted Falcon-7B-Instruct endpoint used by the chains in this notebook.
# NOTE(review): the sample `agent_descr` recorded later in this notebook is cut
# off mid-sentence, which suggests `max_length` may not control the generation
# length on the hosted endpoint - confirm whether `max_new_tokens` is the
# parameter that is actually honoured.
llm = HuggingFaceHub(repo_id="tiiuae/falcon-7b-instruct",
                     huggingfacehub_api_token=hf_token,
                     model_kwargs={"pad_token_id": 11,
                                   "max_length": 1500,
                                   "do_sample": True,
                                   "top_k": 10,
                                   "num_return_sequences": 1,
                                   "trust_remote_code": True})

In [95]:
# Stage 1 prompt: two worked examples followed by the real question. The model
# is asked to describe, in second person, the agent best suited to answer the
# instruction.
template = """
For each instruction, write a high-quality description about the most capable and suitable agent to answer the instruction. In second person perspective.
[Instruction]: Make a list of 5 possible effects of deforestation.
[Agent Description]: You are an environmental scientist with a specialization in the study of ecosystems and their interactions with human activities. You have extensive knowledge about the effects of deforestation on the environment, including the impact on biodiversity, climate change, soil quality, water resources, and human health. Your work has been widely recognized and has contributed to the development of policies and regulations aimed at promoting sustainable forest management practices. You are equipped with the latest research findings, and you can provide a detailed and comprehensive list of the possible effects of deforestation, including but not limited to the loss of habitat for countless species, increased greenhouse gas emissions, reduced water quality and quantity, soil erosion, and the emergence of diseases. Your expertise and insights are highly valuable in understanding the complex interactions between human actions and the environment.
[Instruction]: Identify a descriptive phrase for an eclipse.
[Agent Description]: You are an astronomer with a deep understanding of celestial events and phenomena. Your vast knowledge and experience make you an expert in describing the unique and captivating features of an eclipse. You have witnessed and studied many eclipses throughout your career, and you have a keen eye for detail and nuance. Your descriptive phrase for an eclipse would be vivid, poetic, and scientifically accurate. You can capture the awe-inspiring beauty of the celestial event while also explaining the science behind it. You can draw on your deep knowledge of astronomy, including the movement of the sun, moon, and earth, to create a phrase that accurately and elegantly captures the essence of an eclipse. Your descriptive phrase will help others appreciate the wonder of this natural phenomenon.
[Instruction]: {question}
[Agent Description]:
"""

# `{question}` is the only placeholder in the stage 1 prompt.
prompt_template = PromptTemplate(template=template, input_variables=["question"])

# The generated description is published under `agent_descr`.
agent_chain = LLMChain(prompt=prompt_template, llm=llm, output_key="agent_descr")

# Stage 2 prompt: prepend an agent description, then ask the question and
# instruct the model to reply with Yes or No only.
template = """
{agent_descr}
Now given above identity background, please answer the following instruction. You are required to answer either Yes or No, nothing else.
{question}
[Answer]:
"""

# Two placeholders: the agent description and the question itself.
prompt_template = PromptTemplate(template=template, input_variables=["agent_descr", "question"])

# The model's reply is published under `answer`.
binary_chain = LLMChain(prompt=prompt_template, llm=llm, output_key="answer")

# Run agent_chain and then binary_chain in sequence. The caller supplies only
# `question`; both `agent_descr` and `answer` are returned.
overall_binary_chain = SequentialChain(
    input_variables=["question"],
    chains=[agent_chain, binary_chain],
    output_variables=["agent_descr", "answer"],
)

In [96]:
# Smoke test: run the full two-stage chain on a single question title (row 10).
sample_question = df['title'].iloc[10]
overall_binary_chain({'question': sample_question})

{'question': 'Will Abdel Fattah al-Burhan be removed from power in Sudan before June 15, 2023?',
 'agent_descr': 'You are a seasoned political analyst with a deep understanding of the complexities of politics and power. Your extensive',
 'answer': '- No'}

In [None]:
# Restrict to questions whose forecast_type is 'binary'. The explicit .copy()
# makes df_binary an independent frame, so adding the `pred` column below does
# not assign onto a slice of `df`.
# NOTE(review): the recorded output of `df_binary.pred` further down is an
# empty Series, which suggests this filter matched no rows - confirm the
# actual values stored in `forecast_type`.
df_binary = df.loc[df['forecast_type'] == 'binary'].copy()

# One chain invocation per question title; only the `answer` text is kept.
df_binary['pred'] = [overall_binary_chain({'question': q})['answer'] for q in df_binary.title]

In [101]:
# Display the answers collected for the binary questions.
df_binary['pred']

Series([], Name: pred, dtype: float64)