# Annotation Pipeline
## User interface: from running queries on Elastic all the way through to producing annotation forms

# Construct queries from claims

In [1]:
import pandas as pd
from querytweets import queries
from claimretrieval import tweet_query

# Load the claim table, discarding columns that are entirely N/A and then
# any row that still has at least one N/A value.
claim_df = (
    pd.read_csv("./claimretrieval/IndexClaimCategory.csv")
    .dropna(axis='columns', how='all')
    .dropna(axis='index', how='any')
)

# ToDo: For random sample of claims of different categories?
row_index = 5

# Read the selected claim's fields from one positional row lookup
claim_row = claim_df.iloc[row_index]
claim_id = claim_row['Index']
claim_category = claim_row['Category']
claim_text = claim_row['Claim']

# NOTE(review): the hard-coded text below replaces the claim text read from
# the CSV, while claim_id and claim_category still come from row `row_index`
# -- confirm this mismatch is intended.
#claim_text = "president Donald Trump white house drink hydroxychloroquine inject bleach treat coronavirus signed bill vaccine 5G"
claim_text = "Queen Elizabeth tests positive for COVID-19"

# Build the query from the claim text (term frequency dict, per the original note)
claim_query = tweet_query.construct_query(claim_text)
print(claim_query)

[{'id': '1281648596266348546', 'text': 'Is there any other country who throws a party on the street after a new law comes in - 10pm curfew #London #lockdown #COVIDIOTS', 'category': 'Public authority actions, policy, and communications'}, {'id': '1264481945905516544', 'text': 'Covid Toes among kids: New symptom of novel coronavirus infection - Coronavirus Outbreak News https://t.co/pru83qS46Q\n\n#Covid_19 #coronaupdatesindia #FightAgainstCoronavirus #coronasymptoms', 'category': 'Public authority actions, policy, and communications'}, {'id': '1247820227469029377', 'text': '🇱🇰 🕌Misleading claim circulates that Muslims ignored COVID-19 curfew at Sri Lankan mosque https://t.co/0d9QkUWNTz\n\n#CoronavirusFacts', 'category': 'Public authority actions, policy, and communications'}, {'id': '1267005377553543168', 'text': '#Pakistan need strick 15 days curfew to stop ⛔ #Covid_19 spread #Lockdown will not help #CoronainPakistan #CoronaPandemic #COVID19Pakistan #COVIDIOTS #COVID19Pakistan', 'categ

In [None]:
# Display the claim text currently bound to `claim_text` (bare expression, so the notebook echoes it)
claim_text

# 1. Query Elastic

In [None]:
# Query elastic for tweets based on claims
# ToDo: Multiple claims
# NOTE(review): the first argument (100) is presumably the maximum number of hits -- confirm against queries.search_for_terms
ids_and_scores = queries.search_for_terms(100, claim_query)

# Split the (tweet id, score) pairs into two parallel lists
tweet_ids = []
scores = []
for hit_id, hit_score in ids_and_scores:
    tweet_ids.append(hit_id)
    scores.append(hit_score)

# Look up the text of every retrieved tweet id
tweet_texts = queries.text_from_id(tweet_ids)

# Pair each id with its text, one record per tweet
all_tweets = []
for hit_id, hit_text in zip(tweet_ids, tweet_texts):
    all_tweets.append({"id": hit_id, "text": hit_text})

In [None]:
from matplotlib import pyplot as plt

# Plot every retrieved tweet's score against its position in the result list.
# Uses the explicit figure/axes interface and labels the figure so it can be
# read on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.plot(range(len(scores)), scores)
ax.set(
    title="Scores of retrieved tweets",
    xlabel="Result position",
    ylabel="Score",
)
fig.tight_layout()
plt.show()

In [None]:
# Filter repeats: keep only the first tweet seen for each exact text.
# ToDo: Account for high similarity...
unique_texts = []    # distinct texts, in first-seen order
tweets = []          # first tweet record for each distinct text
seen_texts = set()   # same texts as unique_texts, for constant-time membership tests
for tweet in all_tweets:
    text = tweet["text"]
    if text in seen_texts:
        continue
    seen_texts.add(text)
    unique_texts.append(text)
    tweets.append(tweet)

print(tweets)

# 2. Turn tweets into term-frequency dictionaries

In [2]:
from claimretrieval import tweet_query

def _to_query_record(tweet):
    """Return the id, constructed query and text of one tweet as a dict."""
    return {
        "id": tweet["id"],
        "query": tweet_query.construct_query(tweet["text"]),
        "text": tweet["text"],
    }


# One query record per de-duplicated tweet, in the same order as `tweets`
tweet_queries = list(map(_to_query_record, tweets))

print(tweet_queries)

FileNotFoundError: File b'../RumourDatabase.csv' does not exist

In [None]:
# Report how many tweet query records were built
print(len(tweet_queries))

# 3. Create claim retrievers

In [None]:
from claimretrieval import claim_index
from claimretrieval.retriever import Retriever

# Build the claim index from the claim CSV file
index = claim_index.construct_index(file_path="./claimretrieval/IndexClaimCategory.csv")

# Create the retriever from the index built above. The previous argument,
# `category_index`, is not defined anywhere in this notebook and raised a
# NameError on a fresh kernel.
retriever = Retriever(index)

# The shortlist cell refers to the retriever as `category_retriever`;
# bind that name to the same object so both names work.
category_retriever = retriever

print(index)

# 4. Retrieve claim shortlist for each tweet, for each category

In [None]:
import pandas as pd

# Reload the claim table, dropping columns that are entirely N/A and then
# any row that has at least one N/A value.
claim_df = (
    pd.read_csv("./claimretrieval/IndexClaimCategory.csv")
    .dropna(axis='columns', how='all')
    .dropna(axis='index', how='any')
)

In [None]:
# Build a claim shortlist for every tweet and collect each distinct claim once.
known_claim_ids = []   # rumour IDs in first-seen order
known_claims = {}      # str(rumour ID) -> claim record

# Loop-invariant work: convert the claim table to plain records once, rather
# than re-walking the DataFrame for every tweet.
all_claims = [
    {"rumourID": int(row['Index']), "category": row['Category'], "description": row['Claim']}
    for _, row in claim_df.iterrows()
]

# The claim the tweets were retrieved for; it always heads each shortlist.
original_claim = {"rumourID": int(claim_id), "category": claim_category, "description": claim_text}

# The loop variable is deliberately NOT called `tweet_query`: that name is the
# imported claimretrieval module, and rebinding it here would break re-running
# the cell that calls tweet_query.construct_query.
for tweet_entry in tweet_queries:
    claim_indices = category_retriever.shortlist(tweet_entry["query"])

    # Original claim first, then every retrieved claim in table order. The
    # original claim's ID is skipped in the retrieved part so that it cannot
    # appear twice in the same shortlist.
    shortlist = [original_claim] + [
        claim for claim in all_claims
        if claim["rumourID"] in claim_indices
        and claim["rumourID"] != original_claim["rumourID"]
    ]

    for claim in shortlist:
        claim_key = str(claim["rumourID"])
        # Dict lookup instead of scanning the known_claim_ids list
        if claim_key not in known_claims:
            known_claim_ids.append(claim["rumourID"])
            known_claims[claim_key] = claim

    # Attach the shortlist (rumour IDs only) to the tweet's record
    tweet_entry["shortlist"] = [claim["rumourID"] for claim in shortlist]

In [None]:
# Report how many distinct claims were collected across all shortlists
print(len(known_claims))

# 5. Produce JSONs for the annotation form generator

In [None]:
import json

# Build one form entry per tweet: string tweet ID, tweet text, and shortlist of rumour IDs
tweet_sample = []
for entry in tweet_queries:
    tweet_sample.append({
        "tweetID": str(entry["id"]),
        "text": entry["text"],
        "rumourShortlist": entry["shortlist"],
    })

# Emit the sample as a single JSON object under the "tweetSample" key
sample_payload = {"tweetSample": tweet_sample}
print(json.dumps(sample_payload))

In [None]:
# Emit the collected claims as a JSON object keyed by rumour ID string
print(json.dumps(known_claims))

# 6. Use the annotation form generator


Copy the JSONs from the above cell outputs into the form generator here: https://script.google.com/a/macros/sheffield.ac.uk/s/AKfycbyVFaLRlZrgQYsZTGPbBirRA6maY5WD1CGSZOHPS31l5ThHxQxoDtgIQssYXUSTVl3r/exec