In [1]:
import nltk
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
import json
import re
import string

In [2]:
path = 'dataset_tweets_WHO.txt'

#convert the text to json
with open(path) as f:
    tweets_json = json.load(f)

In [3]:
print(json.dumps(tweets_json['50'], indent=4, sort_keys=True))

{
    "contributors": null,
    "coordinates": null,
    "created_at": "Mon Oct 11 04:43:20 +0000 2021",
    "display_text_range": [
        0,
        140
    ],
    "entities": {
        "hashtags": [],
        "symbols": [],
        "urls": [],
        "user_mentions": [
            {
                "id": 3794682452,
                "id_str": "3794682452",
                "indices": [
                    3,
                    11
                ],
                "name": "World Health Organization (WHO) Western Pacific",
                "screen_name": "WHOWPRO"
            }
        ]
    },
    "favorite_count": 0,
    "favorited": false,
    "full_text": "RT @WHOWPRO: \u201cMy patients are no different to my grandmother and grandfather.\u201d \n\nLoyal to their oath, health workers like Dr Gantsengel Pur\u2026",
    "geo": null,
    "id": 1447422540335484932,
    "id_str": "1447422540335484932",
    "in_reply_to_screen_name": null,
    "in_reply_to_status_id": null,
    "in_repl

In [4]:
def remove_punct(line):
    """
    Helper function to remove punctuation EXCEPT for '#''
    
    Arugment:
    line -- string of text
    
    Returns:
    line -- string of text without punctuation
    """
    return line.translate(str.maketrans('', '', string.punctuation.replace('#', '')))

def build_terms(line):
    """
    Preprocess the Tweet text by removing stop words, emojis, and punctuation and
    stemming, transforming to lowercase and returning the tokens of the text.
    
    Argument:
    line -- string (text) to be preprocessed
    
    Returns:
    line -- a list of tokens corresponding to the input text after the preprocessing
    """

    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    
    # transform to lowercase 
    line =  line.lower() 
    
    # remove non-ASCII terms like emojis and symbols
    line = "".join(c for c in line if c in string.printable) 
    
    # remove punctuation EXCEPT for hashtags (see remove_punct())
    line = remove_punct(line)
    
    # tokenize the text to get a list of terms
    line = line.split() 
    
    # remove html tags, blank spaces like '', and urls
    line = [word for word in line if not (re.match("^qampa$" , word) or re.match("^amp$" , word) or re.match("^http" , word)) 
    and word] 
    
    # remove standalone numbers e.x. '19' but not the 19 from 'covid19'
    line = [word for word in line if not word.isnumeric()]
    
    # remove stopwords
    line = [word for word in line if word not in stop_words] 
    
    # perform stemming
    line = [stemmer.stem(word) for word in line]
    
    # add unhashtagged word if it's hashtag is present 
    # e.x. if #covid is present, we also add covid as a token
    line = line + [word.replace('#', '') for word in line if word[0] == '#' ] 
    
    return line

In [5]:
# tweet_dict is our output data structure that maps Tweet IDs to their text
# note we need to keep the following information
# Tweet | Username | Date | Hashtags | Likes | Retweets | Url

def create_tweets(tweets_json):
    tweet_dict = defaultdict()
    tweets = []

    for key in tweets_json:
        tweet_data = {
            'id': tweets_json[key]['id'],
            'full_text': tweets_json[key]['full_text'],
            'tokens': build_terms(tweets_json[key]['full_text']),
            'username': tweets_json[key]['user']['name'],
            'date': tweets_json[key]['created_at'],
            'hashtags': [key['text'] for key in tweets_json[key]['entities']['hashtags']],
            'likes': tweets_json[key]['favorite_count'],
            'retweets': tweets_json[key]['retweet_count'], 
        }

        #sometimes the tweet url doesn't exist
        try:
            tweet_data['url'] = tweets_json[key]['entities']['media'][0]['url']
        except:
            tweet_data['url'] = None
        
        tweets.append(tweet_data)
    return tweets


In [6]:
# create index
def create_index(tweets_json):
    tweets = create_tweets(tweets_json)
    index = defaultdict(list)
    title_index = defaultdict()

    for tweet in tweets:
        title_index[tweet['id']] = tweet
        
        #current page index keeps track of postision of each word in tweet
        #e.x. if our tweet #50 has tokens "covid health world covid", our current_page_index looks like:
        # {covid -> [50, [0, 3]], health -> [50, [1]], world [50, [2]]}
        current_page_index = {}
        for position, word in enumerate(tweet['tokens']):
            
            try:
                # if the term is already in the index for the current page (current_page_index)
                # append the position to the corresponding list
                current_page_index[word][1].append(position)  
            except:
                # Add the new term as dict key and initialize the array of positions and add the position
                current_page_index[word]=[tweet['id'], array('I', [position])] #'I' indicates unsigned int (int in Python)
        

        for term_page, posting_page in current_page_index.items():
            index[term_page].append(posting_page)
    
    return index, title_index


In [32]:
# apply tf-idf
# tweets is a list of tokens
def create_tfidf_index(tweets):
    index = defaultdict(list)
    tf = defaultdict(list)  #term frequencies of terms in documents (documents in the same order as in the main index)
    df = defaultdict(int)  #document frequencies of terms in the corpus
    title_index = defaultdict()
    idf = defaultdict(float)


    for tweet in tweets:
        
        title_index[tweet['id']] = tweet
        current_page_index = {}

        for position, term in enumerate(tweet['tokens']):  ## terms contains page_title + page_text
            try:
                # if the term is already in the dict append the position to the corresponding list
                current_page_index[term][1].append(position)
            except:
                # Add the new term as dict key and initialize the array of positions and add the position
                current_page_index[term]=[tweet['id'], array('I',[position])] #'I' indicates unsigned int (int in Python)

        #normalize term frequencies
        # Compute the denominator to normalize term frequencies (formula 2 above)
        # norm is the same for all terms of a document.
        norm = 0
        for term, posting in current_page_index.items():
            # posting will contain the list of positions for current term in current document. 
            # posting ==> [current_doc, [list of positions]] 
            # you can use it to infer the frequency of current term.
            
            #CHECK THIS!
            norm += len(posting[1]) ** 2
        norm = math.sqrt(norm)

        #calculate the tf(dividing the term frequency by the above computed norm) and df weights
        for term, posting in current_page_index.items():
            # append the tf for current term (tf = term frequency in current doc/norm)
            tf[term].append(np.round(len(posting[1])/norm, 4)) ## SEE formula (1) above
            #increment the document frequency of current term (number of documents containing the current term)
            df[term] += 1 # increment DF for current term

        #merge the current page index with the main index
        for term_page, posting_page in current_page_index.items():
            index[term_page].append(posting_page)

        # Compute IDF following the formula (3) above. HINT: use np.log
        for term in df:
            idf[term] = 1 + np.round(np.log(float(len(tweets)/df[term])), 4)

    return index, tf, df, idf, title_index

def test():
    tweet1 = {'tokens' : ['covid', 'health', 'world', 'covid'], 'id' : 50}
    tweet2 = {'tokens' : ['covid', 'medicine', 'dog', 'world'], 'id' : 60}
    tweet3 = {'tokens' : ['covid', 'health', 'dog', 'ugh', 'huh'], 'id' : 2}
    tweets = [tweet1, tweet2, tweet3]
    index, tf, df, idf, title_index = create_tfidf_index(tweets)
    print(index)
    print(tf)
    print(df)
    print(idf)
    print(title_index)
test()

defaultdict(<class 'list'>, {'covid': [[50, array('I', [0, 3])], [60, array('I', [0])], [2, array('I', [0])]], 'health': [[50, array('I', [1])], [2, array('I', [1])]], 'world': [[50, array('I', [2])], [60, array('I', [3])]], 'medicine': [[60, array('I', [1])]], 'dog': [[60, array('I', [2])], [2, array('I', [2])]], 'ugh': [[2, array('I', [3])]], 'huh': [[2, array('I', [4])]]})
defaultdict(<class 'list'>, {'covid': [0.8165, 0.5, 0.4472], 'health': [0.4082, 0.4472], 'world': [0.4082, 0.5], 'medicine': [0.5], 'dog': [0.5, 0.4472], 'ugh': [0.4472], 'huh': [0.4472]})
defaultdict(<class 'int'>, {'covid': 3, 'health': 2, 'world': 2, 'medicine': 1, 'dog': 2, 'ugh': 1, 'huh': 1})
defaultdict(<class 'float'>, {'covid': 1.0, 'health': 1.4055, 'world': 1.4055, 'medicine': 2.0986000000000002, 'dog': 1.4055, 'ugh': 2.0986000000000002, 'huh': 2.0986000000000002})
defaultdict(None, {50: {'tokens': ['covid', 'health', 'world', 'covid'], 'id': 50}, 60: {'tokens': ['covid', 'medicine', 'dog', 'world'], 'i

In [8]:
def rank_documents(terms, docs, index, idf, tf, title_index):
    """
    Perform the ranking of the results of a search based on the tf-idf weights
    
    Argument:
    terms -- list of query terms
    docs -- list of documents, to rank, matching the query
    index -- inverted index data structure
    idf -- inverted document frequencies
    tf -- term frequencies
    title_index -- mapping between page id and page title
    
    Returns:
    Print the list of ranked documents
    """

    # I'm interested only on the element of the docVector corresponding to the query terms 
    # The remaining elements would became 0 when multiplied to the query_vector
    doc_vectors = defaultdict(lambda: [0] * len(terms)) # I call doc_vectors[k] for a nonexistent key k, the key-value pair (k,[0]*len(terms)) will be automatically added to the dictionary
    query_vector = [0] * len(terms)

    # compute the norm for the query tf
    query_terms_count = collections.Counter(terms)  # get the frequency of each term in the query. 
    # Example: collections.Counter(["hello","hello","world"]) --> Counter({'hello': 2, 'world': 1})
    #HINT: use when computing tf for query_vector

    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):  #termIndex is the index of the term in the query
        if term not in index:
            continue

        ## Compute tf*idf(normalize TF as done with documents)
        query_vector[termIndex] = query_terms_count[term]/query_norm * idf[term] 

        # Generate doc_vectors for matching docs
        for doc_index, (doc, postings) in enumerate(index[term]):
            # Example of [doc_index, (doc, postings)]
            # 0 (26, array('I', [1, 4, 12, 15, 22, 28, 32, 43, 51, 68, 333, 337]))
            # 1 (33, array('I', [26, 33, 57, 71, 87, 104, 109]))
            # term is in doc 26 in positions 1,4, .....
            # term is in doc 33 in positions 26,33, .....

            #tf[term][0] will contain the tf of the term "term" in the doc 26            
            if doc in docs:
                doc_vectors[doc][termIndex] = tf[term][doc_index] * idf[term]  # TODO: check if multiply for idf

    # Calculate the score of each doc 
    # compute the cosine similarity between queyVector and each docVector:
    # HINT: you can use the dot product because in case of normalized vectors it corresponds to the cosine similarity
    # see np.dot
    
    doc_scores=[[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items() ]
    doc_scores.sort(reverse=True)
    result_docs = [x[1] for x in doc_scores]
    #print document titles instead if document id's
    #result_docs=[ title_index[x] for x in result_docs ]
    if len(result_docs) == 0:
        print("No results found, try again")
        query = input()
        docs = search_tf_idf(query, index)
    #print ('\n'.join(result_docs), '\n')
    return result_docs

In [9]:
def search_tf_idf(query, index):
    """
    output is the list of documents that contain any of the query terms. 
    So, we will get the list of documents for each query term, and take the union of them.
    """
    query = build_terms(query)
    docs = set()
    for term in query:
        try:
            # store in term_docs the ids of the docs that contain "term"                        
            term_docs=[posting[0] for posting in index[term]]
            
            # docs = docs Union term_docs
            docs = set(term_docs)
        except:
            #term is not in index
            pass
    docs = list(docs)
    ranked_docs = rank_documents(query, docs, index, idf, tf, title_index)
    return ranked_docs

In [10]:
import time
tweets = create_tweets(tweets_json)
start_time = time.time()
index, tf, df, idf, title_index = create_tfidf_index(tweets)
print("Total time to create the index: {} seconds".format(np.round(time.time() - start_time, 2)))

Total time to create the index: 85.14 seconds


In [30]:
def pretty_print_tweet(tweet):
    print(
    """id: {}
    username: {}
    text: {}
    date: {}
    hashtags: {}
    likes: {}
    retweets: {}
    url: {}\n""".format(tweet['id'], tweet['username'], tweet['full_text'], tweet['date'], tweet['hashtags'], tweet['likes'],
                        tweet['retweets'], tweet['url']))

In [31]:
print("Insert your query (i.e.: Computer Science):\n")
query = input()
ranked_docs = search_tf_idf(query, index)
top = 10

print("\n======================\nTop {} results out of {} for the searched query:\n".format(top, len(ranked_docs)))
count = 1
for d_id in ranked_docs[:top]:
    print("rank: {}".format(count))
    pretty_print_tweet(title_index[d_id])
    count += 1

Insert your query (i.e.: Computer Science):

sexual

Top 10 results out of 45 for the searched query:

rank: 1
id: 1434063654568222720
    username: World Health Organization (WHO)
    text: ✅A positive and respectful approach to sexuality &amp; sexual relationships 
✅The possibility of pleasurable &amp; safe sexual experiences
✅Being free of sexual coercion, discrimination &amp; violence 

All are 🔑 to sexual health and well-being 
https://t.co/W4RjDBdwl7 https://t.co/M9KuIYBObM
    date: Sat Sep 04 07:59:53 +0000 2021
    hashtags: []
    likes: 303
    retweets: 73
    url: https://t.co/M9KuIYBObM

rank: 2
id: 1447584608657317889
    username: World Health Organization (WHO)
    text: Digital media can contribute to education, counselling and care related to sexuality, sexual identity, and sexual relationships among adolescent girls.

Learn more about WHO's work on out-of-school comprehensive sexuality education 👉https://t.co/79fBeDZFOx

#DayOfTheGirl https://t.co/SlbVrSdWFC
    dat

# Evaluation

In [26]:
# we'll print terms with highest df, tf, and idf
print(sorted(df.items(), key=lambda x: x[1], reverse=True)[:50], "\n")
print(sorted(tf.items(), key=lambda x: x[1], reverse=True)[:50], "\n")
print(sorted(idf.items(), key=lambda x: x[1], reverse=True)[:50], "\n")

[('drtedro', 942), ('covid19', 735), ('#covid19', 723), ('health', 598), ('rt', 549), ('vaccin', 433), ('countri', 338), ('peopl', 305), ('support', 246), ('pandem', 233), ('global', 229), ('need', 227), ('live', 201), ('#vaccinequ', 195), ('vaccinequ', 195), ('world', 154), ('help', 151), ('care', 151), ('access', 138), ('year', 138), ('work', 125), ('use', 125), ('new', 117), ('today', 116), ('diseas', 115), ('emerg', 114), ('death', 112), ('risk', 111), ('provid', 109), ('servic', 108), ('includ', 108), ('continu', 106), ('call', 105), ('million', 105), ('prevent', 105), ('actacceler', 103), ('protect', 101), ('everi', 98), ('one', 98), ('safe', 96), ('must', 94), ('end', 93), ('also', 93), ('make', 92), ('respons', 91), ('suppli', 91), ('share', 89), ('around', 88), ('develop', 83), ('time', 82)] 

[('pill', [0.7071, 0.1961, 0.1857]), ('sexual', [0.6489, 0.1961, 0.1768, 0.25, 0.2236, 0.2, 0.1796, 0.1961, 0.189, 0.2357, 0.1857, 0.2, 0.3482, 0.2182, 0.2582, 0.1857, 0.2, 0.2085, 0.267