# Annotation Pipeline
## User interface, from running queries on elastic, all the way through to producing annotation forms

# 1. Query Elastic

In [1]:
from querytweets import queries

categories = [
    "Public authority actions, policy, and communications",
    "Community spread and impact",
    "Medical advice and self-treatments",
    "Claims about prominent actors",
    "Conspiracy theories",
    "Virus transmission",
    "Virus origin and properties",
    "Public preparedness, protests, and civil disobedience",
    "Vaccines, medical treatments, and tests"
]

amount = 10

tweets = []
for category in categories:
    tweet_ids = queries.random_rumours(amount, [category])
    tweet_texts = queries.text_from_id(tweet_ids)
    tweets = tweets + [{"id": tweet_id, "text": tweet_texts[i], "category": category} for i, tweet_id in enumerate(tweet_ids)]

print(tweets)

[{'id': '1281648596266348546', 'text': 'Is there any other country who throws a party on the street after a new law comes in - 10pm curfew #London #lockdown #COVIDIOTS', 'category': 'Public authority actions, policy, and communications'}, {'id': '1264481945905516544', 'text': 'Covid Toes among kids: New symptom of novel coronavirus infection - Coronavirus Outbreak News https://t.co/pru83qS46Q\n\n#Covid_19 #coronaupdatesindia #FightAgainstCoronavirus #coronasymptoms', 'category': 'Public authority actions, policy, and communications'}, {'id': '1247820227469029377', 'text': '🇱🇰 🕌Misleading claim circulates that Muslims ignored COVID-19 curfew at Sri Lankan mosque https://t.co/0d9QkUWNTz\n\n#CoronavirusFacts', 'category': 'Public authority actions, policy, and communications'}, {'id': '1267005377553543168', 'text': '#Pakistan need strick 15 days curfew to stop ⛔ #Covid_19 spread #Lockdown will not help #CoronainPakistan #CoronaPandemic #COVID19Pakistan #COVIDIOTS #COVID19Pakistan', 'categ

## For testing

# 2. Turns tweets into term-frequency dictionaries

In [3]:
from claimretrieval import tweet_query

tweet_queries = [{
    "id": tweet["id"],
    "query": tweet_query.construct_query(tweet["text"]), 
    "text": tweet["text"],
    "category": tweet["category"]
} for tweet in tweets]

print(tweet_queries)

[{'id': 0, 'query': {'coronaviru': 1, 'like': 1, 'smoronaviru': 1, 'amirit': 1}, 'text': 'coronavirus more like smoronavirus amirite guys lol', 'category': 'Public authority actions, policy, and communications'}, {'id': 1, 'query': {'coronaviru': 1, 'like': 1, 'smoronaviru': 1, 'amirit': 1}, 'text': 'coronavirus more like smoronavirus amirite guys lol', 'category': 'Community spread and impact'}, {'id': 2, 'query': {'coronaviru': 1, 'like': 1, 'smoronaviru': 1, 'amirit': 1}, 'text': 'coronavirus more like smoronavirus amirite guys lol', 'category': 'Medical advice and self-treatments'}, {'id': 3, 'query': {'coronaviru': 1, 'like': 1, 'smoronaviru': 1, 'amirit': 1}, 'text': 'coronavirus more like smoronavirus amirite guys lol', 'category': 'Claims about prominent actors'}, {'id': 4, 'query': {'coronaviru': 1, 'like': 1, 'smoronaviru': 1, 'amirit': 1}, 'text': 'coronavirus more like smoronavirus amirite guys lol', 'category': 'Conspiracy theories'}, {'id': 5, 'query': {'coronaviru': 1, '

# 3. Create claim retrievers

In [4]:
from claimretrieval import claim_index
from claimretrieval.retriever import Retriever

# Dictionary of { category: index } where each index is { word: {doc_id: frequency} }
category_indices = claim_index.construct_indices(categories, "./claimretrieval/IndexClaimCategory.csv")

category_retrievers = {category: Retriever(category_indices[category]) for category in categories}

print(category_retrievers)

{'Public authority actions, policy, and communications': <claimretrieval.retriever.Retriever object at 0x000002094EE19A90>, 'Community spread and impact': <claimretrieval.retriever.Retriever object at 0x000002094EE19A00>, 'Medical advice and self-treatments': <claimretrieval.retriever.Retriever object at 0x0000020956A5BAF0>, 'Claims about prominent actors': <claimretrieval.retriever.Retriever object at 0x0000020956A3C3D0>, 'Conspiracy theories': <claimretrieval.retriever.Retriever object at 0x0000020956A3C370>, 'Virus transmission': <claimretrieval.retriever.Retriever object at 0x0000020956A3C880>, 'Virus origin and properties': <claimretrieval.retriever.Retriever object at 0x0000020956A63FA0>, 'Public preparedness, protests, and civil disobedience': <claimretrieval.retriever.Retriever object at 0x0000020956A63E80>, 'Vaccines, medical treatments, and tests': <claimretrieval.retriever.Retriever object at 0x0000020956A63F40>}


# 4. Retrieve claim shortlist for each tweet, for each category

In [8]:
import pandas as pd

claim_df = pd.read_csv("./claimretrieval/IndexClaimCategory.csv")
claim_df.dropna(axis='columns', how='all', inplace=True)  # Drop any columns that are all N/A
claim_df.dropna(axis='index', how='any', inplace=True)  # Drop any rows that have at least one N/A

def claim_text_from_ids(claim_ids):
    return [row['Claim'] for i, row in claim_df.iterrows() if int(row['Index']) in claim_ids]

In [9]:
def shortlist_claims(query):
    shortlist = []
    for category in categories:
        claim_indices = category_retrievers[category].shortlist(query)
        claim_texts = claim_text_from_ids(claim_indices)
        shortlist = shortlist + [{"rumourID": claim_id, "category": category, "description": claim_text}
                                for (claim_id, claim_text) in zip(claim_indices, claim_texts)]
    return shortlist

In [13]:
known_claim_ids = []
known_claims = {}

for tweet_query in tweet_queries:
    shortlist = shortlist_claims(tweet_query["query"])
    
    for claim in shortlist:
        if claim["rumourID"] not in known_claim_ids:
            known_claim_ids.append(claim["rumourID"])
            known_claims[str(claim["rumourID"])] = claim
    
    tweet_query["shortlist"] = [claim["rumourID"] for claim in shortlist]

print(known_claims)

{'98': {'rumourID': 98, 'category': 'Public authority actions, policy, and communications', 'description': 'A Dutch minister said that “giving money to Portuguese politicians is like putting a cat in a sardines basket”.\t\t'}, '981': {'rumourID': 981, 'category': 'Public authority actions, policy, and communications', 'description': 'If Florida banned alcohol sales due to coronavirus.\t\t'}, '1291': {'rumourID': 1291, 'category': 'Public authority actions, policy, and communications', 'description': 'NY banned cigarette sales because of the coronavirus.\t\t'}, '994': {'rumourID': 994, 'category': 'Public authority actions, policy, and communications', 'description': 'Coronavirus patients were being arrested in Punjab, India.\t\t'}, '757': {'rumourID': 757, 'category': 'Public authority actions, policy, and communications', 'description': 'Video shows Chinese police crackdown on coronavirus patients.\t\t'}, '389': {'rumourID': 389, 'category': 'Public authority actions, policy, and comm

# 5. Produce JSONs for the annotation form generator

In [19]:
import json

tweet_sample = [{"tweetID": str(tweet_query["id"]), "text": tweet_query["text"], "rumourShortlist": tweet_query["shortlist"]} 
                for tweet_query in tweet_queries]
print(json.dumps({"tweetSample": tweet_sample}))

{"tweetSample": [{"tweetID": "0", "text": "coronavirus more like smoronavirus amirite guys lol", "rumourShortlist": [98, 981, 1291, 994, 757, 389, 1158, 1272, 735, 750, 382, 1359, 983, 1465, 1038, 682, 599, 1289, 342, 1435, 1409, 1421, 466, 1357, 1094, 1271, 1270, 1344, 1239, 468, 742, 1312, 1206, 986, 464, 1353, 597, 274, 1119, 1297, 662, 340, 388, 1019, 347, 1233, 1173, 710, 664, 1190, 1027, 1355, 1356, 1159, 703, 1146, 1431, 311, 516, 818, 598, 765, 119, 319, 840, 209, 717, 816, 1379, 1335, 987, 836, 941, 1480, 1311, 847, 1348, 1331, 1148, 1021, 1169, 1231, 1232, 356, 572, 327, 83, 302, 740, 888, 1199, 575, 486, 1207, 1035, 497, 918, 238, 202]}, {"tweetID": "1", "text": "coronavirus more like smoronavirus amirite guys lol", "rumourShortlist": [98, 981, 1291, 994, 757, 389, 1158, 1272, 735, 750, 382, 1359, 983, 1465, 1038, 682, 599, 1289, 342, 1435, 1409, 1421, 466, 1357, 1094, 1271, 1270, 1344, 1239, 468, 742, 1312, 1206, 986, 464, 1353, 597, 274, 1119, 1297, 662, 340, 388, 1019, 34

In [18]:
print(json.dumps(known_claims))

{"98": {"rumourID": 98, "category": "Public authority actions, policy, and communications", "description": "A Dutch minister said that \u201cgiving money to Portuguese politicians is like putting a cat in a sardines basket\u201d.\t\t"}, "981": {"rumourID": 981, "category": "Public authority actions, policy, and communications", "description": "If Florida banned alcohol sales due to coronavirus.\t\t"}, "1291": {"rumourID": 1291, "category": "Public authority actions, policy, and communications", "description": "NY banned cigarette sales because of the coronavirus.\t\t"}, "994": {"rumourID": 994, "category": "Public authority actions, policy, and communications", "description": "Coronavirus patients were being arrested in Punjab, India.\t\t"}, "757": {"rumourID": 757, "category": "Public authority actions, policy, and communications", "description": "Video shows Chinese police crackdown on coronavirus patients.\t\t"}, "389": {"rumourID": 389, "category": "Public authority actions, policy

# 6. Use the annotation form generator


Copy the JSONs from the above cell outputs into the form generator here: https://script.google.com/a/macros/sheffield.ac.uk/s/AKfycbwms3pmFMBCbTJlf_FzdSv_RjsbI6aulesUKMZiRI7nxeFqet7EbCxfSPVR01sSRzT1/exec