# Annotation Pipeline
## User interface, from running queries on elastic, all the way through to producing annotation forms

# Construct queries from claims

In [20]:
import pandas as pd
from querytweets import queries
from claimretrieval import tweet_query

claim_df = pd.read_csv("./claimretrieval/IndexClaimCategory.csv")
claim_df.dropna(axis='columns', how='all', inplace=True)  # Drop any columns that are all N/A
claim_df.dropna(axis='index', how='any', inplace=True)  # Drop any rows that have at least one N/A

# ToDo: For random sample of claims of different categories?
row_index = 111

# Get query
claim_id = claim_df.iloc[row_index]['Index']
claim_category = claim_df.iloc[row_index]['Category']
claim_text = claim_df.iloc[row_index]['Claim']
claim_query = tweet_query.construct_query(claim_text) # Term frequency dict
print(claim_query)

{'disaster': 1, 'management': 1, 'implemented': 1, 'across': 1, 'india.': 1, 'apart': 1, 'government': 1, 'department': 1, 'citizen': 1, 'allowed': 1, 'post': 1, 'share': 1, 'forward': 1, 'related': 1, 'novel': 1, 'coronavirus': 1}


# 1. Query Elastic

In [37]:
# Query elastic for tweets based on claims
# ToDo: Multiple claims
tweet_ids = queries.search_for_terms(10, claim_query)
tweet_texts = queries.text_from_id(tweet_ids)
all_tweets = [{"id": tweet_id, "text": tweet_text} for tweet_id, tweet_text in zip(tweet_ids, tweet_texts)]

In [38]:
# Filter repeats
# ToDo: Account for high similarity...
unique_texts = []
tweets = []
for tweet in all_tweets:
    if tweet["text"] not in unique_texts:
        unique_texts.append(tweet["text"])
        tweets.append(tweet)

print(tweets)

[{'id': '1266330966047555585', 'text': '#CCPVirus_JOKE\n\n3 people in the morgue. 1 black, 1 white, 1 mexican\n\n1 confirmed to have diabetes\n1 confirmed HIV/AIDS\n1 has a bullet hole in the head\n\nCounty public health official states, ALL are #CoronaVirus related\n \n#WuhanVirus #COVID19\n\nqt-covid19-jokes-005 #BorderObserver'}, {'id': '1278796143917518849', 'text': 'Trump orange tan formula: 1 part #disinfectant, 1 part sunlight &amp; 1 part #Hydroxychloroquine.\n@TitusNation'}, {'id': '1283899724706263042', 'text': '#CCPVirus Joke\n\n3 people in the morgue. 1 black, 1 white, 1 mexican\n\n1 confirmed to have diabetes\n1 confirmed HIV/AIDS\n1 has a bullet hole in the head\n\nCounty public health official states, ALL are #CoronaVirus related'}, {'id': '1289697929909936133', 'text': 'Generally, society is in 1/3rds. 1 questions gov, 1 follows the consensus, 1 is under the bed. \n\nI do hope the consensus 1/3 come to their senses soon. \n\n#endthelockdown'}, {'id': '129298961907682508

# 2. Turns tweets into term-frequency dictionaries

In [39]:
from claimretrieval import tweet_query

tweet_queries = [{
    "id": tweet["id"],
    "query": tweet_query.construct_query(tweet["text"]), 
    "text": tweet["text"]
} for tweet in tweets]

print(tweet_queries)

[{'id': '1266330966047555585', 'query': {'#ccpvirus_joke': 1, 'people': 1, 'morgue.': 1, 'black': 1, 'white': 1, 'mexican': 1, 'confirmed': 2, 'diabetes': 1, 'hiv/aids': 1, 'bullet': 1, 'hole': 1, 'head': 1, 'county': 1, 'public': 1, 'health': 1, 'official': 1, 'state': 1, '#coronavirus': 1, 'related': 1, '#wuhanvirus': 1, '#covid19': 1, 'qt-covid19-jokes-005': 1, '#borderobserver': 1}, 'text': '#CCPVirus_JOKE\n\n3 people in the morgue. 1 black, 1 white, 1 mexican\n\n1 confirmed to have diabetes\n1 confirmed HIV/AIDS\n1 has a bullet hole in the head\n\nCounty public health official states, ALL are #CoronaVirus related\n \n#WuhanVirus #COVID19\n\nqt-covid19-jokes-005 #BorderObserver'}, {'id': '1278796143917518849', 'query': {'trump': 1, 'orange': 1, 'formula': 1, 'part': 3, '#disinfectant': 1, 'sunlight': 1, '&amp': 1, '#hydroxychloroquine.': 1, '@titusnation': 1}, 'text': 'Trump orange tan formula: 1 part #disinfectant, 1 part sunlight &amp; 1 part #Hydroxychloroquine.\n@TitusNation'},

In [40]:
print(len(tweet_queries))

7


# 3. Create claim retrievers

In [41]:
from claimretrieval import claim_index
from claimretrieval.retriever import Retriever

index = claim_index.construct_index(file_path="./claimretrieval/IndexClaimCategory.csv")

retriever = Retriever(category_index)

print(index)



# 4. Retrieve claim shortlist for each tweet, for each category

In [42]:
import pandas as pd

claim_df = pd.read_csv("./claimretrieval/IndexClaimCategory.csv")
claim_df.dropna(axis='columns', how='all', inplace=True)  # Drop any columns that are all N/A
claim_df.dropna(axis='index', how='any', inplace=True)  # Drop any rows that have at least one N/A

In [43]:
known_claim_ids = []
known_claims = {}

for tweet_query in tweet_queries:
    claim_indices = category_retriever.shortlist(tweet_query["query"])
    shortlist = [{"rumourID": int(row['Index']), "category": row['Category'], "description": row['Claim']}
        for i, row in claim_df.iterrows() if int(row['Index']) in claim_indices]
    
    # Add original claim to front of shortlist
    shortlist.insert(0, {"rumourID": claim_id, "category": claim_category, "description": claim_text})
    
    for claim in shortlist:
        if claim["rumourID"] not in known_claim_ids:
            known_claim_ids.append(claim["rumourID"])
            known_claims[str(claim["rumourID"])] = claim
    
    tweet_query["shortlist"] = [claim["rumourID"] for claim in shortlist]

In [44]:
print(len(known_claims))

41


# 5. Produce JSONs for the annotation form generator

In [45]:
import json

tweet_sample = [{"tweetID": str(tweet_query["id"]), "text": tweet_query["text"], "rumourShortlist": tweet_query["shortlist"]} 
                for tweet_query in tweet_queries]
print(json.dumps({"tweetSample": tweet_sample}))

{"tweetSample": [{"tweetID": "1266330966047555585", "text": "#CCPVirus_JOKE\n\n3 people in the morgue. 1 black, 1 white, 1 mexican\n\n1 confirmed to have diabetes\n1 confirmed HIV/AIDS\n1 has a bullet hole in the head\n\nCounty public health official states, ALL are #CoronaVirus related\n \n#WuhanVirus #COVID19\n\nqt-covid19-jokes-005 #BorderObserver", "rumourShortlist": [113.0, 423, 473, 583, 963, 1038, 1184, 1220, 1364, 1369, 1448, 1467]}, {"tweetID": "1278796143917518849", "text": "Trump orange tan formula: 1 part #disinfectant, 1 part sunlight &amp; 1 part #Hydroxychloroquine.\n@TitusNation", "rumourShortlist": [113.0, 69, 193, 560, 584, 639, 671, 789, 838, 887, 967, 1008]}, {"tweetID": "1283899724706263042", "text": "#CCPVirus Joke\n\n3 people in the morgue. 1 black, 1 white, 1 mexican\n\n1 confirmed to have diabetes\n1 confirmed HIV/AIDS\n1 has a bullet hole in the head\n\nCounty public health official states, ALL are #CoronaVirus related", "rumourShortlist": [113.0, 423, 473, 58

In [46]:
print(json.dumps(known_claims))

{"113.0": {"rumourID": 113.0, "category": "Public authority actions, policy, and communications", "description": "The Disaster Management Act has been implemented across India. As per the act, apart from the government departments, no other citizen is allowed to post or share any forward related to the novel coronavirus.\t\t"}, "423": {"rumourID": 423, "category": "Public authority actions, policy, and communications", "description": "Paraguay\u2019s Ministry of Health is not following up people who are related to confirmed cases of coronavirus.\t\t"}, "473": {"rumourID": 473, "category": "Community spread and impact", "description": "23 new COVID-19 cases confirmed in Mombasa.\t\t"}, "583": {"rumourID": 583, "category": "Community spread and impact", "description": "Black people are immune to the coronavirus.\t\t"}, "963": {"rumourID": 963, "category": "Community spread and impact", "description": "Does Mexico have confirmed cases of COVID-19?\t\t"}, "1038": {"rumourID": 1038, "catego

# 6. Use the annotation form generator


Copy the JSONs from the above cell outputs into the form generator here: https://script.google.com/a/macros/sheffield.ac.uk/s/AKfycbyVFaLRlZrgQYsZTGPbBirRA6maY5WD1CGSZOHPS31l5ThHxQxoDtgIQssYXUSTVl3r/exec