**LAB 3: Ranking**

Harrison Lian U196989

Hugo Da Silva U191838

Brayan González U172820

**IMPORTS**

In [1]:
import nltk
nltk.download('stopwords')
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
import json
import re
import string

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Harrison\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**AUX FUNCTIONS**

In [2]:
def pretty_print_tweet(tweet):
    print(
    """id: {}
    username: {}
    text: {}
    date: {}
    hashtags: {}
    likes: {}
    retweets: {}
    url: {}\n""".format(tweet['id'], tweet['username'], tweet['full_text'], tweet['date'], tweet['hashtags'], tweet['likes'],
                        tweet['retweets'], tweet['url']))

# **LOAD DATA AND CLEAN UP**

In [3]:
path = 'dataset_tweets_WHO.txt'

#convert the text to json
with open(path) as f:
    tweets_json = json.load(f)

In [4]:
print(json.dumps(tweets_json['50'], indent=4, sort_keys=True))

{
    "contributors": null,
    "coordinates": null,
    "created_at": "Mon Oct 11 04:43:20 +0000 2021",
    "display_text_range": [
        0,
        140
    ],
    "entities": {
        "hashtags": [],
        "symbols": [],
        "urls": [],
        "user_mentions": [
            {
                "id": 3794682452,
                "id_str": "3794682452",
                "indices": [
                    3,
                    11
                ],
                "name": "World Health Organization (WHO) Western Pacific",
                "screen_name": "WHOWPRO"
            }
        ]
    },
    "favorite_count": 0,
    "favorited": false,
    "full_text": "RT @WHOWPRO: \u201cMy patients are no different to my grandmother and grandfather.\u201d \n\nLoyal to their oath, health workers like Dr Gantsengel Pur\u2026",
    "geo": null,
    "id": 1447422540335484932,
    "id_str": "1447422540335484932",
    "in_reply_to_screen_name": null,
    "in_reply_to_status_id": null,
    "in_repl

In [5]:
def remove_punct(line):
    """
    Helper function to remove punctuation EXCEPT for '#''
    
    Arugment:
    line -- string of text
    
    Returns:
    line -- string of text without punctuation
    """
    return line.translate(str.maketrans('', '', string.punctuation.replace('#', '')))

def build_terms(line):
    """
    Preprocess the Tweet text by removing stop words, emojis, and punctuation and
    stemming, transforming to lowercase and returning the tokens of the text.
    
    Argument:
    line -- string (text) to be preprocessed
    
    Returns:
    line -- a list of tokens corresponding to the input text after the preprocessing
    """

    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    
    # transform to lowercase 
    line =  line.lower() 
    
    # remove non-ASCII terms like emojis and symbols
    line = "".join(c for c in line if c in string.printable) 
    
    # remove punctuation EXCEPT for hashtags (see remove_punct())
    line = remove_punct(line)
    
    # tokenize the text to get a list of terms
    line = line.split() 
    
    # remove html tags, blank spaces like '', and urls
    line = [word for word in line if not (re.match("^qampa$" , word) or re.match("^amp$" , word) or re.match("^http" , word)) 
    and word] 
    
    # remove standalone numbers e.x. '19' but not the 19 from 'covid19'
    line = [word for word in line if not word.isnumeric()]
    
    # add standalone word as token too if it has number e.x. 'covid19' gets tokenized as 'covid19' and 'covid'
    line = line + [word.rstrip(string.digits) for word in line if sum([c.isdigit() for c in word]) != 0]
    
    # remove stopwords
    line = [word for word in line if word not in stop_words] 
    
    # perform stemming
    line = [stemmer.stem(word) for word in line]
    
    # add unhashtagged word if it's hashtag is present 
    # e.x. if #covid is present, we also add covid as a token
    line = line + [word.replace('#', '') for word in line if word[0] == '#' ] 
    
    return line

In [6]:
# tweet_dict is our output data structure that maps Tweet IDs to their text
# note we need to keep the following information
# Tweet | Username | Date | Hashtags | Likes | Retweets | Url

def create_tweets(tweets_json):
    tweet_dict = defaultdict()
    tweets = []

    for key in tweets_json:
        tweet_data = {
            'id': tweets_json[key]['id'],
            'full_text': tweets_json[key]['full_text'],
            'tokens': build_terms(tweets_json[key]['full_text']),
            'username': tweets_json[key]['user']['name'],
            'date': tweets_json[key]['created_at'],
            'hashtags': [key['text'] for key in tweets_json[key]['entities']['hashtags']],
            'likes': tweets_json[key]['favorite_count'],
            'retweets': tweets_json[key]['retweet_count'], 
        }

        #sometimes the tweet url doesn't exist
        try:
            tweet_data['url'] = tweets_json[key]['entities']['media'][0]['url']
        except:
            tweet_data['url'] = None
        
        tweets.append(tweet_data)
    return tweets

# **CREATE INDEXES**

In [7]:
# create index
def create_index(tweets_json):
    tweets = create_tweets(tweets_json)
    index = defaultdict(list)
    title_index = defaultdict()

    for tweet in tweets:
        title_index[tweet['id']] = tweet
        
        #current page index keeps track of postision of each word in tweet
        #e.x. if our tweet #50 has tokens "covid health world covid", our current_page_index looks like:
        # {covid -> [50, [0, 3]], health -> [50, [1]], world [50, [2]]}
        current_page_index = {}
        for position, word in enumerate(tweet['tokens']):
            
            try:
                # if the term is already in the index for the current page (current_page_index)
                # append the position to the corresponding list
                current_page_index[word][1].append(position)  
            except:
                # Add the new term as dict key and initialize the array of positions and add the position
                current_page_index[word]=[tweet['id'], array('I', [position])] #'I' indicates unsigned int (int in Python)
        

        for term_page, posting_page in current_page_index.items():
            index[term_page].append(posting_page)
    
    return index, title_index

In [8]:
# apply tf-idf
# tweets is a list of tokens
def create_tfidf_index(tweets):
    index = defaultdict(list)
    tf = defaultdict(list)  #term frequencies of terms in documents (documents in the same order as in the main index)
    df = defaultdict(int)  #document frequencies of terms in the corpus
    title_index = defaultdict()
    idf = defaultdict(float)


    for tweet in tweets:
        
        title_index[tweet['id']] = tweet
        current_page_index = {}

        for position, term in enumerate(tweet['tokens']):  ## terms contains page_title + page_text
            try:
                # if the term is already in the dict append the position to the corresponding list
                current_page_index[term][1].append(position)
            except:
                # Add the new term as dict key and initialize the array of positions and add the position
                current_page_index[term]=[tweet['id'], array('I',[position])] #'I' indicates unsigned int (int in Python)

        #normalize term frequencies
        # Compute the denominator to normalize term frequencies (formula 2 above)
        # norm is the same for all terms of a document.
        norm = 0
        for term, posting in current_page_index.items():
            # posting will contain the list of positions for current term in current document. 
            # posting ==> [current_doc, [list of positions]] 
            # you can use it to infer the frequency of current term.
            
            #CHECK THIS!
            norm += len(posting[1]) ** 2
        norm = math.sqrt(norm)

        #calculate the tf(dividing the term frequency by the above computed norm) and df weights
        for term, posting in current_page_index.items():
            # append the tf for current term (tf = term frequency in current doc/norm)
            tf[term].append(np.round(len(posting[1])/norm, 4)) ## SEE formula (1) above
            #increment the document frequency of current term (number of documents containing the current term)
            df[term] += 1 # increment DF for current term

        #merge the current page index with the main index
        for term_page, posting_page in current_page_index.items():
            index[term_page].append(posting_page)

        # Compute IDF following the formula (3) above. HINT: use np.log
        for term in df:
            idf[term] = 1 + np.round(np.log(float(len(tweets)/df[term])), 4)

    return index, tf, df, idf, title_index

# **RANK TF-IDF + COS SIMILARITY; CUSTOM RANKING + TF-COS SIMILARITY**

In [9]:
def search_ranking(query, index, mode = 'TF-IDF'):
    """
    output is the list of documents that contain any of the query terms. 
    So, we will get the list of documents for each query term, and take the union of them.
    """
    query = build_terms(query)
    docs = set()
    for term in query:
        try:
            # store in term_docs the ids of the docs that contain "term"                        
            term_docs=[posting[0] for posting in index[term]]
            
            # docs = docs Union term_docs
            docs = docs.union(set(term_docs))
            #docs = set(term_docs)
            #print(docs)
        except:
            #term is not in index
            pass
        

    docs = list(docs)
    if mode == 'TF-IDF':
        ranked_docs, pred_score = rank_documents_tfidf_cos(query, docs, index, idf, tf, title_index)
    else:
        ranked_docs, pred_score = rank_documents_bm25_custom(query, docs, index, tf, title_index)
    return ranked_docs, pred_score

In [10]:
def rank_documents_tfidf_cos(terms, docs, index, idf, tf, title_index):
    """
    Perform the ranking of the results of a search based on the tf-idf weights
    
    Argument:
    terms -- list of query terms
    docs -- list of documents, to rank, matching the query
    index -- inverted index data structure
    idf -- inverted document frequencies
    tf -- term frequencies
    title_index -- mapping between page id and page title
    
    Returns:
    Print the list of ranked documents
    """

    # I'm interested only on the element of the docVector corresponding to the query terms 
    # The remaining elements would became 0 when multiplied to the query_vector
    # I call doc_vectors[k] for a nonexistent key k, the key-value pair (k,[0]*len(terms)) will be automatically added to the dictionary
    doc_vectors = defaultdict(lambda: [0] * len(terms)) 
    query_vector = [0] * len(terms)

    # compute the norm for the query tf
    query_terms_count = collections.Counter(terms)  # get the frequency of each term in the query. 
    # Example: collections.Counter(["hello","hello","world"]) --> Counter({'hello': 2, 'world': 1})
    #HINT: use when computing tf for query_vector

    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):  #termIndex is the index of the term in the query
        if term not in index:
            continue

        ## Compute tf*idf(normalize TF as done with documents)
        query_vector[termIndex] = query_terms_count[term]/query_norm * idf[term] 

        # Generate doc_vectors for matching docs
        for doc_index, (doc, postings) in enumerate(index[term]):
            # Example of [doc_index, (doc, postings)]
            # 0 (26, array('I', [1, 4, 12, 15, 22, 28, 32, 43, 51, 68, 333, 337]))
            # 1 (33, array('I', [26, 33, 57, 71, 87, 104, 109]))
            # term is in doc 26 in positions 1,4, .....
            # term is in doc 33 in positions 26,33, .....

            #tf[term][0] will contain the tf of the term "term" in the doc 26            
            if doc in docs:
                doc_vectors[doc][termIndex] = tf[term][doc_index] * idf[term]  

    # Calculate the score of each doc 
    # compute the cosine similarity between queyVector and each docVector:
    
    doc_scores=[[np.dot(curDocVec, query_vector) / (np.linalg.norm(curDocVec) * np.linalg.norm(query_vector)), doc] for doc, curDocVec in doc_vectors.items() ]
    doc_scores.sort(reverse=True)
    result_docs = [x[1] for x in doc_scores]
    result_pred_score = [x[0] for x in doc_scores]

    #print document titles instead if document id's
    #result_docs=[ title_index[x] for x in result_docs ]
    if len(result_docs) == 0:
        print("No results found, try again")
        query = input()
        docs = search_ranking(query, index)
    #print ('\n'.join(result_docs), '\n')
    return result_docs, result_pred_score

In [11]:
def rank_documents_custom(terms, docs, index, tf, title_index):    

    doc_vectors = defaultdict(lambda: [0] * len(terms)) 
    query_vector = [0] * len(terms)

    query_terms_count = collections.Counter(terms)
    #query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):  #termIndex is the index of the term in the query
        if term not in index:
            continue

        ## Compute tf*idf(normalize TF as done with documents)
        query_vector[termIndex] = query_terms_count[term] #/ query_norm 

        for doc_index, (doc, postings) in enumerate(index[term]):
            #tf[term][0] will contain the tf of the term "term" in the doc 26            
            if doc in docs:
                doc_vectors[doc][termIndex] = tf[term][doc_index] 
    

    # Calculate the score of each doc 
    # compute the cosine similarity between queyVector and each docVector
    # weight the score based on likes and retweets

    doc_scores=[[(np.dot(curDocVec, query_vector) / (np.linalg.norm(curDocVec) * np.linalg.norm(query_vector)))*0.7 + ((int(title_index[doc]['likes']) * 0.4) + int(title_index[doc]['retweets']) * 0.6)*0.3, doc] for doc, curDocVec in doc_vectors.items() ]
    doc_scores.sort(reverse=True)
    result_docs = [x[1] for x in doc_scores]
    result_pred_score = [x[0] for x in doc_scores]



    return result_docs, result_pred_score

In [12]:
from rank_bm25 import BM25Okapi
def rank_documents_bm25_custom(terms, docs, index, tf, title_index):
    
    #we use external library to calculate the bm25
    #all algorithms are in this paper http://www.cs.otago.ac.nz/homepages/andrew/papers/2014-2.pdf
    try:
        tokenized_corpus = [title_index[doc_id]['tokens'] for doc_id in docs]
    except:
        print(docs)
    bm25_model = BM25Okapi(tokenized_corpus)
    
    #bm25_score calculates the bm25 score for each documents, given the query vector 'terms'
    bm25_score = bm25_model.get_scores(terms)
    
    #we will now calculate our custom score
    updated_results = []
    for i in range(len(docs)):
        curr_bm25 = bm25_score[i]
        tweet = docs[i]
        
        #initialize the metrics that we will use to calculate custom score
        #explanations and motivations are in the writeup
        
        #curr_length_hashtag_ratio is 1 + log(len(tweet)/(num of hashtags))
        curr_length_hashtag_ratio = 1
        
        #curr_num_likes is 1 + log(len(likes))
        curr_num_likes = 1
        
        #curr_num_retweets is 1 + log(len(retweets))
        curr_retweets = 1
        
        #multiple try clauses in case we divide by 0 or take the logarithm of 0
        try:
            curr_length_hashtag_ratio = 1 + np.log(len(tweet['tokens'])/len(tweet['hashtags']))
        except:
            pass
        
        try:
            curr_num_likes = 1 + np.log(int(tweet['likes']))
        except:
            pass
        
        try:
            curr_retweets = 1 + np.log(int(tweet['retweets']))
            score = curr_bm25 * curr_length_hashtag_ratio * curr_num_likes * curr_retweets
            updated_results.append(score)
        except:
            pass
        return docs, updated_results
    
        
"""
def test():
    terms = ['windy', 'london']
    tweet1 = {'tokens': ['windy', 'city']}
    tweet2 = {'tokens': ['how','is','windy', 'london','today']}
    tweet3 = {'tokens': ['this', 'tweet','is']}
    docs = [tweet1, tweet2, tweet3]
    index = 0
    tf = 0
    title_index = 0
    return rank_documents_bm25_custom(terms, docs, index, tf, title_index)
print(test())
"""

"\ndef test():\n    terms = ['windy', 'london']\n    tweet1 = {'tokens': ['windy', 'city']}\n    tweet2 = {'tokens': ['how','is','windy', 'london','today']}\n    tweet3 = {'tokens': ['this', 'tweet','is']}\n    docs = [tweet1, tweet2, tweet3]\n    index = 0\n    tf = 0\n    title_index = 0\n    return rank_documents_bm25_custom(terms, docs, index, tf, title_index)\nprint(test())\n"

In [13]:

tweets = create_tweets(tweets_json)
index, tf, df, idf, title_index = create_tfidf_index(tweets)

In [14]:
print("Insert your query (i.e.: Computer Science):\n")
query = input()
ranked_docs, _ = search_ranking(query, index, 'custom')

top = 20

print("\n======================\nTop {} results out of {} for the searched query:\n".format(top, len(ranked_docs)))
count = 1
#print(ranked_docs[0])
for d_id in ranked_docs[:top]:
    print("rank: {}".format(count))
    #print(d_id)
    #print(title_index)
    pretty_print_tweet(title_index[d_id])
    count += 1


Insert your query (i.e.: Computer Science):

covid 19 pandemic

Top 20 results out of 814 for the searched query:

rank: 1
id: 1448208458604584960
    username: World Health Organization (WHO)
    text: #COVID19 has shown how health emergencies and disasters affect entire communities – especially those with weak health systems, and vulnerable populations like migrants, indigenous peoples, and those living in fragile humanitarian conditions. https://t.co/jpUQpnu0V1
    date: Wed Oct 13 08:46:17 +0000 2021
    hashtags: ['COVID19']
    likes: 119
    retweets: 33
    url: https://t.co/jpUQpnu0V1

rank: 2
id: 1448163383493136385
    username: World Health Organization (WHO)
    text: RT @opsoms: Si está completamente vacunado 💉💉, ¿aún puede contraer COVID-19? 

🚨 No importa si está vacunado o si todavía está esperando, s…
    date: Wed Oct 13 05:47:10 +0000 2021
    hashtags: []
    likes: 0
    retweets: 43
    url: None

rank: 3
id: 1448031156348362754
    username: World Health Organiz

In [15]:
###word2vec
import pandas as pd
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import preprocess_string
import nltk
#create a word2Vec model
words = [tweet['tokens'] for tweet in tweets]
print(words[:2])
w2v_model = Word2Vec(sentences = words, size = 100, window = 10, min_count = 10, negative = 15, sg = 0)
query = 'covid'
w2v_model.wv.most_similar(query)

[['intern', 'day', 'disast', 'risk', 'reduct', '#openwho', 'launch', 'multiti', 'core', 'curriculum', 'help', 'equip', 'compet', 'need', 'work', 'within', 'public', 'health', 'emerg', 'respons', 'start', 'learn', 'today', '#ready4respons', '#ready4respons', 'openwho', 'ready4respons', 'ready4respons'], ['#covid19', 'shown', 'health', 'emerg', 'disast', 'affect', 'entir', 'commun', 'especi', 'weak', 'health', 'system', 'vulner', 'popul', 'like', 'migrant', 'indigen', 'peopl', 'live', 'fragil', 'humanitarian', 'condit', '#covid', 'covid19', 'covid']]


[('#covid', 0.9992499947547913),
 ('covid19', 0.9991809725761414),
 ('vaccinequ', 0.9983397126197815),
 ('#vaccinequ', 0.998103141784668),
 ('askwho', 0.9964108467102051),
 ('dr', 0.9963833093643188),
 ('mvankerkhov', 0.9962170124053955),
 ('#covid19', 0.9960458874702454),
 ('variant', 0.9959359169006348),
 ('end', 0.9959125518798828)]