# Annotation Pipeline
## User interface, from running queries on elastic, all the way through to producing annotation forms

# 1. Query Elastic

In [19]:
from querytweets import queries

categories = [
    "Public authority actions, policy, and communications",
    "Community spread and impact",
    "Medical advice and self-treatments",
    "Claims about prominent actors",
    "Conspiracy theories",
    "Virus transmission",
    "Virus origin and properties",
    "Public preparedness, protests, and civil disobedience",
    "Vaccines, medical treatments, and tests"
]

amount_per_category = 2

tweets = []
for category in categories:
    tweet_ids = queries.random_rumours(amount_per_category, [category])
    tweet_texts = queries.text_from_id(tweet_ids)
    tweets = tweets + [{"id": tweet_id, "text": tweet_texts[i], "category": category} for i, tweet_id in enumerate(tweet_ids)]

print(tweets)

[{'id': '1281648596266348546', 'text': '#plandemic? Coronavirus conspiracy? #Covid-19 was ‘adapted to infect humans’ shock claim https://t.co/3aJH6yuViP', 'category': 'Public authority actions, policy, and communications'}, {'id': '1264481945905516544', 'text': 'People Are Growing Weary of Lockdowns: Serbians Riot Against Curfew\n#Covid #CCPVirus #Lockdown #Curfew \nhttps://t.co/FWjaZRvyuw', 'category': 'Public authority actions, policy, and communications'}, {'id': '1264481945905516544', 'text': '#Gaza in Lockdown to Try Contain Its First #COVID-19 Outbreak: https://t.co/0rvFN4d1y3 #COVID19 #coronavirus #pandemic #ChinaVirus #WuhanVirus', 'category': 'Community spread and impact'}, {'id': '1298268240011907072', 'text': '#plandemic? Coronavirus conspiracy? #Covid-19 was ‘adapted to infect humans’ shock claim https://t.co/3aJH6yuViP', 'category': 'Community spread and impact'}, {'id': '1248744406305341442', 'text': 'Fact Check: President Trump Did NOT Suggest Americans Drink Disinfectan

In [27]:
print(len(tweets))

18


# 2. Turns tweets into term-frequency dictionaries

In [20]:
from claimretrieval import tweet_query

tweet_queries = [{
    "id": tweet["id"],
    "query": tweet_query.construct_query(tweet["text"]), 
    "text": tweet["text"],
    "category": tweet["category"]
} for tweet in tweets]

print(tweet_queries)

[{'id': '1281648596266348546', 'query': {'#plandemic': 1, 'coronavirus': 1, 'conspiracy': 1, '#covid-19': 1, '‘adapted': 1, 'infect': 1, 'human': 1, 'shock': 1, 'claim': 1, 'https://t.co/3ajh6yuvip': 1}, 'text': '#plandemic? Coronavirus conspiracy? #Covid-19 was ‘adapted to infect humans’ shock claim https://t.co/3aJH6yuViP', 'category': 'Public authority actions, policy, and communications'}, {'id': '1264481945905516544', 'query': {'people': 1, 'growing': 1, 'weary': 1, 'lockdown': 1, 'serbian': 1, 'riot': 1, 'against': 1, 'curfew': 1, '#covid': 1, '#ccpvirus': 1, '#lockdown': 1, '#curfew': 1, 'https://t.co/fwjazrvyuw': 1}, 'text': 'People Are Growing Weary of Lockdowns: Serbians Riot Against Curfew\n#Covid #CCPVirus #Lockdown #Curfew \nhttps://t.co/FWjaZRvyuw', 'category': 'Public authority actions, policy, and communications'}, {'id': '1264481945905516544', 'query': {'#gaza': 1, 'lockdown': 1, 'contain': 1, 'first': 1, '#covid-19': 1, 'outbreak': 1, 'https://t.co/0rvfn4d1y3': 1, '#c

In [28]:
print(len(tweet_queries))

18


# 3. Create claim retrievers

In [21]:
from claimretrieval import claim_index
from claimretrieval.retriever import Retriever

# Dictionary of { category: index } where each index is { word: {doc_id: frequency} }
category_indices = claim_index.construct_indices(categories, "./claimretrieval/IndexClaimCategory.csv")

category_retrievers = {category: Retriever(category_indices[category]) for category in categories}

print(category_retrievers)

{'Public authority actions, policy, and communications': <claimretrieval.retriever.Retriever object at 0x000001C1548E95B0>, 'Community spread and impact': <claimretrieval.retriever.Retriever object at 0x000001C1548E96A0>, 'Medical advice and self-treatments': <claimretrieval.retriever.Retriever object at 0x000001C1548E96D0>, 'Claims about prominent actors': <claimretrieval.retriever.Retriever object at 0x000001C14AC35490>, 'Conspiracy theories': <claimretrieval.retriever.Retriever object at 0x000001C154D7DA60>, 'Virus transmission': <claimretrieval.retriever.Retriever object at 0x000001C154D7D2B0>, 'Virus origin and properties': <claimretrieval.retriever.Retriever object at 0x000001C14AC2C190>, 'Public preparedness, protests, and civil disobedience': <claimretrieval.retriever.Retriever object at 0x000001C14AC5E040>, 'Vaccines, medical treatments, and tests': <claimretrieval.retriever.Retriever object at 0x000001C14AC5E0A0>}


# 4. Retrieve claim shortlist for each tweet, for each category

In [22]:
import pandas as pd

claim_df = pd.read_csv("./claimretrieval/IndexClaimCategory.csv")
claim_df.dropna(axis='columns', how='all', inplace=True)  # Drop any columns that are all N/A
claim_df.dropna(axis='index', how='any', inplace=True)  # Drop any rows that have at least one N/A

def claim_text_from_ids(claim_ids):
    return [row['Claim'] for i, row in claim_df.iterrows() if int(row['Index']) in claim_ids]

In [23]:
def shortlist_claims(query):
    shortlist = []
    for category in categories:
        claim_indices = category_retrievers[category].shortlist(query)
        claim_texts = claim_text_from_ids(claim_indices)
        shortlist = shortlist + [{"rumourID": claim_id, "category": category, "description": claim_text}
                                for (claim_id, claim_text) in zip(claim_indices, claim_texts)]
    return shortlist

In [24]:
known_claim_ids = []
known_claims = {}

for tweet_query in tweet_queries:
    shortlist = shortlist_claims(tweet_query["query"])
    
    for claim in shortlist:
        if claim["rumourID"] not in known_claim_ids:
            known_claim_ids.append(claim["rumourID"])
            known_claims[str(claim["rumourID"])] = claim
    
    tweet_query["shortlist"] = [claim["rumourID"] for claim in shortlist]

In [29]:
print(len(known_claims))

654


# 5. Produce JSONs for the annotation form generator

In [25]:
import json

tweet_sample = [{"tweetID": str(tweet_query["id"]), "text": tweet_query["text"], "rumourShortlist": tweet_query["shortlist"]} 
                for tweet_query in tweet_queries]
print(json.dumps({"tweetSample": tweet_sample}))

{"tweetSample": [{"tweetID": "1281648596266348546", "text": "#plandemic? Coronavirus conspiracy? #Covid-19 was \u2018adapted to infect humans\u2019 shock claim https://t.co/3aJH6yuViP", "rumourShortlist": [407, 417, 315, 544, 232, 611, 394, 994, 829, 1008, 684, 1164, 1156, 1428, 78, 1432, 648, 660, 1210, 412, 498, 593, 1100, 843, 443, 853, 865, 916, 220, 991, 233, 466, 1357, 1175, 958, 802, 295, 1165, 1294, 815, 147, 1301, 563, 252, 846, 1123, 400, 225, 680, 1091, 897, 939, 1366, 134, 969, 319, 1248, 210, 335, 136, 138, 178, 364, 1442, 1431, 311, 25, 270, 885, 366, 173, 174, 816, 1379, 1335, 987, 847, 1232, 399, 356, 572, 302, 143, 378, 639, 128, 1169, 1021, 540, 787, 833, 376, 259, 557, 504, 505, 562, 235, 159]}, {"tweetID": "1264481945905516544", "text": "People Are Growing Weary of Lockdowns: Serbians Riot Against Curfew\n#Covid #CCPVirus #Lockdown #Curfew \nhttps://t.co/FWjaZRvyuw", "rumourShortlist": [845, 426, 544, 547, 1362, 521, 618, 95, 201, 627, 634, 133, 68, 886, 1238, 78, 3

In [26]:
print(json.dumps(known_claims))

{"407": {"rumourID": 407, "category": "Public authority actions, policy, and communications", "description": "A post on Facebook and Twitter claims an official advisory was issued by Singapore\u2019s Ministry of Health about the first symptoms of the novel coronavirus.\t\t"}, "417": {"rumourID": 417, "category": "Public authority actions, policy, and communications", "description": "Trump\u2019s claim that he imposed the first travel ban on China.\t\t"}, "315": {"rumourID": 315, "category": "Public authority actions, policy, and communications", "description": "Images of the medical facility shared with the claim that the Indian Army set up a 1,000-bed hospital in Rajasthan during the coronavirus pandemic.\t\t"}, "544": {"rumourID": 544, "category": "Public authority actions, policy, and communications", "description": "An image with human body figurines in a bowl that claims China is serving human baby soup in its restaurants.\t\t"}, "232": {"rumourID": 232, "category": "Public author

# 6. Use the annotation form generator


Copy the JSONs from the above cell outputs into the form generator here: https://script.google.com/a/macros/sheffield.ac.uk/s/AKfycbwms3pmFMBCbTJlf_FzdSv_RjsbI6aulesUKMZiRI7nxeFqet7EbCxfSPVR01sSRzT1/exec