# Annotation Pipeline
## User interface for the full workflow, from running queries on Elasticsearch through to producing annotation forms

# 1. Query Elastic

In [6]:
from querytweets import queries

# Rumour categories to sample from; each one is queried on its own below.
categories = [
    "Public authority actions, policy, and communications",
    "Community spread and impact",
    "Medical advice and self-treatments",
    "Claims about prominent actors",
    "Conspiracy theories",
    "Virus transmission",
    "Virus origin and properties",
    "Public preparedness, protests, and civil disobedience",
    "Vaccines, medical treatments, and tests",
]

# Passed to queries.random_rumours once per category.
amount = 10

# Accumulates one {"id", "text", "category"} record per sampled tweet.
tweets = []
for category in categories:
    tweet_ids = queries.random_rumours(amount, [category])
    tweet_texts = queries.text_from_id(tweet_ids)
    # Texts are looked up by position, in step with tweet_ids.
    tweets.extend([
        {"id": sampled_id, "text": tweet_texts[position], "category": category}
        for position, sampled_id in enumerate(tweet_ids)
    ])

print(tweets)

TransportError: TransportError(504, '<html>\r\n<head><title>504 Gateway Time-out</title></head>\r\n<body>\r\n<center><h1>504 Gateway Time-out</h1></center>\r\n<hr><center>nginx/1.17.10</center>\r\n</body>\r\n</html>\r\n')

# 2. Turn tweets into term-frequency dictionaries

In [None]:
from claimretrieval import tweet_query

def _to_query_record(tweet):
    """Return the tweet's id, text and category together with its constructed query."""
    return {
        "id": tweet["id"],
        "query": tweet_query.construct_query(tweet["text"]),
        "text": tweet["text"],
        "category": tweet["category"],
    }


# One query record per sampled tweet, in the same order as `tweets`.
tweet_queries = [_to_query_record(tweet) for tweet in tweets]

print(tweet_queries)

# 3. Create claim retrievers

In [None]:
from claimretrieval import claim_index
from claimretrieval.retriever import Retriever

# construct_indices yields a dictionary of { category: index },
# where each index is { word: {doc_id: frequency} }.
category_indices = claim_index.construct_indices(
    categories, "./claimretrieval/IndexClaimCategory.csv"
)

# One Retriever per category, each built from that category's index.
category_retrievers = {}
for category in categories:
    category_retrievers[category] = Retriever(category_indices[category])

print(category_retrievers)

# 4. Retrieve a claim shortlist for each tweet, across every category

In [None]:
import pandas as pd

# Load the claim table and prune it: first discard columns that are entirely
# N/A, then discard every row that still contains at least one N/A value.
claim_df = (
    pd.read_csv("./claimretrieval/IndexClaimCategory.csv")
    .dropna(axis='columns', how='all')
    .dropna(axis='index', how='any')
)

def claim_text_from_ids(claim_ids):
    """Return the claim texts for the given claim ids, in the order of the ids.

    Each id is looked up in the module-level ``claim_df`` table, matching the
    integer value of its ``Index`` column and returning the ``Claim`` column.
    Results follow the order of ``claim_ids`` (not the row order of the
    table) so that callers can pair ids and texts positionally.

    Args:
        claim_ids: Iterable of integer claim ids.

    Returns:
        A list of claim texts, one per id found in the table. Ids with no
        matching row are skipped, so the result may be shorter than
        ``claim_ids``.
    """
    text_by_id = {}
    for index, claim in zip(claim_df['Index'], claim_df['Claim']):
        # Keep the first row if the same index appears more than once.
        text_by_id.setdefault(int(index), claim)
    return [text_by_id[claim_id] for claim_id in claim_ids if claim_id in text_by_id]

In [None]:
# Builds, for every tweet, a shortlist of candidate claims gathered from the
# retriever of every category.
tweet_shortlists = []

# NOTE: the loop variable is deliberately not named `tweet_query`. That name is
# the module imported from claimretrieval; rebinding it to a dict here would
# break any later call to tweet_query.construct_query.
for tweet_entry in tweet_queries:
    tweet_id = tweet_entry["id"]
    query = tweet_entry["query"]
    text = tweet_entry["text"]
    cat = tweet_entry["category"]

    # Most relevant claims shortlist
    shortlist = []
    for category in categories:
        claim_indices = category_retrievers[category].shortlist(query)
        claim_texts = claim_text_from_ids(claim_indices)
        # Ids and texts are paired by position.
        shortlist.extend(
            {"rumourID": claim_id, "category": category, "description": claim_text}
            for (claim_id, claim_text) in zip(claim_indices, claim_texts)
        )

    tweet_shortlists.append({"tweet_id": tweet_id, "content": text, "shortlist": shortlist, "category": cat})

print(tweet_shortlists)

# 5. Produce JSONs for the annotation form generator

In [None]:
import json

# Payload for the annotation form generator.
# NOTE(review): the original line had no value after "tweetSample" (a syntax
# error). tweet_shortlists is assumed to be the intended sample - confirm
# against the schema the annotation form generator expects.
tweetsToAnnotate = {"tweetSample": tweet_shortlists}

# 6. Use the annotation form generator
