# Generate Prompts with Poynter Context to feed to LLM Model

In [1]:
import os
from os import listdir
from os.path import isfile, join

import operator
import json

import pandas as pd
import numpy as np

import gensim
import gensim.downloader as gensim_api
import transformers

from transformers import AutoTokenizer, AutoModel
import torch
from datasets import Dataset

import src.text_preprocess as tp
from src.create_vdb import IndexTextEmbeddings
from src.prompt_generator import TweetPromptGenerator

In [2]:
this_file_path = os.path.abspath(os.getcwd()) # parse this out so that it works
# get project root location
project_root = os.path.split(os.path.split(this_file_path)[0])[0]
data_dir = os.path.join(project_root, 'data/')
tfidf_data_dir = os.path.join(project_root, 'data/tfidf/')

poynter_raw = pd.read_csv(os.path.join(data_dir, 'poynter_coded_breon_tab.csv'), encoding='utf8')

poynter_text = poynter_raw['story_copy']
text = poynter_text.apply(tp.clean_text)
poynter_raw['clean_text'] = np.array(text)

In [3]:
create_vector_df = IndexTextEmbeddings(model_name='sentence-transformers/multi-qa-mpnet-base-dot-v1')
dataset = create_vector_df.create_dataset(poynter_raw, 'clean_text')
dataset_with_index = create_vector_df.add_faiss_index(dataset, 'embeddings')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
tweet_prompt_generator = TweetPromptGenerator(dataset_with_index, 'sentence-transformers/multi-qa-mpnet-base-dot-v1', project_root)

misinformed_tweets = [
    "Drinking lemon juice cures COVID in 24 hours.",
    "COVID-19 was created as a form of population control.",
    "CDC says masks increase your chances of getting COVID.",
    "Vaccines don’t protect you; all my vaccinated friends got sick.",
    "My neighbor got the vaccine and turned into a zombie.",
    "You can catch COVID-19 from talking on the phone with an infected person.",
    "COVID-19 was brought to Earth by aliens.",
    "Once you recover from COVID, you turn invisible to viruses.",
    "Government says COVID can be defeated by singing the national anthem."
]

In [5]:
# clean up first
def clean_text_list(text_list):
    return [tp.clean_text(text) for text in text_list]

misinformed_tweets = clean_text_list(misinformed_tweets)

prompts, stories, themes, similarity_scores = tweet_prompt_generator.generate_prompts_for_tweets(misinformed_tweets, clean_tweets=False)

In [6]:
for prompt in prompts:
    print(prompt)


Query: drinking lemon juice cures covid in COUNT hours 

I have found several relevant pieces of information based on your query. Here they are:

Document ID 1272 (Similarity Score: 31.69):
lemon juice and bicarbonate mixture prevents and cures covid-19 in israel.

Document ID 1109 (Similarity Score: 43.93):
the drug diamox, often used as treatment for high-altitude sickness, can cure coronavirus.

Document ID 1232 (Similarity Score: 44.09):
israeli recipe for lemon and bicarbonate drink is a coronavirus cure.



Based on this information, [Your specific question or task for the LLM].

Query: covid_19 was created as  form of population control 

I have found several relevant pieces of information based on your query. Here they are:

Document ID 1171 (Similarity Score: 27.90):
the number of deaths from covid-19 are exaggerated.

Document ID 1164 (Similarity Score: 30.04):
us arrests the man who created covid-19.

Document ID 1096 (Similarity Score: 31.13):
“this is covid-19, not covid-1