In [1]:
import nltk
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
import json
import re
import string

In [2]:
path = 'dataset_tweets_WHO.txt'

# Parse the JSON dump into Python objects.
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, which is not UTF-8 everywhere and can fail on
# (or garble) the non-ASCII characters found in tweet text.
with open(path, encoding='utf-8') as f:
    tweets_json = json.load(f)

In [3]:
# Pretty-print the tweet stored under key '50' (keys sorted, 4-space indent)
# to inspect which fields a tweet object carries.
print(json.dumps(tweets_json['50'], indent=4, sort_keys=True))

{
    "contributors": null,
    "coordinates": null,
    "created_at": "Mon Oct 11 04:43:20 +0000 2021",
    "display_text_range": [
        0,
        140
    ],
    "entities": {
        "hashtags": [],
        "symbols": [],
        "urls": [],
        "user_mentions": [
            {
                "id": 3794682452,
                "id_str": "3794682452",
                "indices": [
                    3,
                    11
                ],
                "name": "World Health Organization (WHO) Western Pacific",
                "screen_name": "WHOWPRO"
            }
        ]
    },
    "favorite_count": 0,
    "favorited": false,
    "full_text": "RT @WHOWPRO: \u201cMy patients are no different to my grandmother and grandfather.\u201d \n\nLoyal to their oath, health workers like Dr Gantsengel Pur\u2026",
    "geo": null,
    "id": 1447422540335484932,
    "id_str": "1447422540335484932",
    "in_reply_to_screen_name": null,
    "in_reply_to_status_id": null,
    "in_repl

In [4]:
def remove_punct(line):
    """
    Strip every ASCII punctuation character from a string, keeping only '#'
    so that hashtags stay recognisable.

    Argument:
    line -- string of text

    Returns:
    the same text with all characters of string.punctuation (except '#') deleted
    """
    # every punctuation symbol except the hashtag marker
    unwanted = "".join(symbol for symbol in string.punctuation if symbol != '#')
    # a translation table that maps each unwanted symbol to "delete"
    deletion_table = str.maketrans('', '', unwanted)
    return line.translate(deletion_table)

def build_terms(line):
    """
    Preprocess the Tweet text and return its tokens.

    Steps, in order: lowercase, drop non-ASCII characters (emojis, symbols),
    remove punctuation except '#', split on whitespace, drop url / 'amp' /
    'qampa' tokens, drop purely numeric tokens, drop English stop words, stem,
    and finally append an un-hashtagged copy of every hashtag token.

    Argument:
    line -- string (text) to be preprocessed

    Returns:
    line -- a list of tokens corresponding to the input text after the preprocessing
    """

    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))

    # transform to lowercase
    line = line.lower()

    # string.printable only holds ASCII characters, so this drops emojis and
    # any other non-ASCII symbol
    line = "".join(c for c in line if c in string.printable)

    # remove punctuation EXCEPT for hashtags (see remove_punct())
    line = remove_punct(line)

    # tokenize the text to get a list of terms; split() never yields empty strings
    line = line.split()

    # drop urls (they start with 'http' once punctuation is gone) and the
    # tokens 'amp' / 'qampa' -- presumably what is left of HTML entities such
    # as '&amp;' after punctuation removal
    line = [word for word in line
            if word not in ("qampa", "amp") and not word.startswith("http")]

    # remove standalone numbers e.g. '19' but not the 19 from 'covid19'
    line = [word for word in line if not word.isnumeric()]

    # remove stopwords
    # NOTE(review): punctuation is already stripped at this point, so any stop
    # word that contains an apostrophe can no longer match -- confirm intended
    line = [word for word in line if word not in stop_words]

    # perform stemming
    line = [stemmer.stem(word) for word in line]

    # add the unhashtagged word if its hashtag is present
    # e.g. if #covid is present, we also add covid as a token
    unhashtagged = [word.replace('#', '') for word in line if word.startswith('#')]
    # a token made only of '#' would otherwise contribute an empty-string token
    line = line + [word for word in unhashtagged if word]

    return line

In [5]:
# create_tweets() turns the raw tweet JSON into a list of dicts, one per tweet.
# note we need to keep the following information
# Tweet | Username | Date | Hashtags | Likes | Retweets | Url

def create_tweets(tweets_json):
    """
    Extract the fields of interest from every raw tweet object.

    Argument:
    tweets_json -- mapping whose values are raw tweet objects (as loaded from the dataset)

    Returns:
    tweets -- list with one dict per tweet, holding the keys 'id', 'full_text',
              'tokens' (output of build_terms on the full text), 'username',
              'date', 'hashtags' (list of hashtag texts), 'likes', 'retweets'
              and 'url' (url of the first media entity, or None if there is none)
    """
    tweets = []

    for key in tweets_json:
        raw_tweet = tweets_json[key]
        entities = raw_tweet['entities']

        tweet_data = {
            'id': raw_tweet['id'],
            'full_text': raw_tweet['full_text'],
            'tokens': build_terms(raw_tweet['full_text']),
            'username': raw_tweet['user']['name'],
            'date': raw_tweet['created_at'],
            'hashtags': [hashtag['text'] for hashtag in entities['hashtags']],
            'likes': raw_tweet['favorite_count'],
            'retweets': raw_tweet['retweet_count'],
        }

        # sometimes the tweet has no media entity, hence no url; only the
        # lookup failures are treated as "no url" so that unrelated errors
        # are not silently swallowed
        try:
            tweet_data['url'] = entities['media'][0]['url']
        except (KeyError, IndexError, TypeError):
            tweet_data['url'] = None

        tweets.append(tweet_data)
    return tweets


In [15]:
# create index
def create_index(tweets_json):
    """
    Build a positional inverted index over the tokens of all tweets.

    Argument:
    tweets_json -- raw tweet collection, passed unchanged to create_tweets()

    Returns:
    index -- defaultdict(list) mapping each term to a list of postings
             [tweet_id, array('I', positions)], one posting per tweet that
             contains the term, in the order the tweets were processed
    """
    tweets = create_tweets(tweets_json)
    index = defaultdict(list)

    for tweet in tweets:
        # postings of the current tweet only: term -> [tweet_id, positions]
        current_page_index = {}
        for position, word in enumerate(tweet['tokens']):
            if word in current_page_index:
                # term already seen in this tweet: record one more position
                current_page_index[word][1].append(position)
            else:
                # first occurrence in this tweet: start a new posting
                # 'I' indicates unsigned int (int in Python)
                current_page_index[word] = [tweet['id'], array('I', [position])]

        # merge the postings of the current tweet into the main index
        for term_page, posting_page in current_page_index.items():
            index[term_page].append(posting_page)

    return index


[[1448215930178310144, array('I', [0])], [1448195167048118274, array('I', [0])], [1447557052147146752, array('I', [1])], [1444284551728033793, array('I', [0])], [1443984064441114628, array('I', [0])], [1443924928684826632, array('I', [0])], [1443913395586142209, array('I', [1])], [1443887511395356672, array('I', [1])], [1443534762132582404, array('I', [0])], [1442930655902322688, array('I', [0])], [1442849339349651456, array('I', [18])], [1442848994103898113, array('I', [13])], [1442836276307275776, array('I', [13])], [1442493845028102145, array('I', [6, 13])], [1442474165735407616, array('I', [7])], [1441463208620011520, array('I', [6])], [1440973666855043074, array('I', [10])], [1440923415117017090, array('I', [0])], [1439194773408071682, array('I', [2])], [1437741809753538561, array('I', [8])], [1437436684023615495, array('I', [9])], [1437436381479964672, array('I', [4])], [1435613091987927042, array('I', [8])], [1435538492503863299, array('I', [0])], [1435309781078712322, array('I'

In [19]:
# apply tf-idf
def create_tfidf_index(tweets_json):
    """
    Build a positional inverted index together with tf, df and idf weights.

    Argument:
    tweets_json -- raw tweet collection, passed unchanged to create_tweets()

    Returns:
    index -- defaultdict(list): term -> list of postings [tweet_id, array('I', positions)]
    tf -- defaultdict(list): term -> normalized term frequencies, one per posting,
          in the same order as index[term]
    df -- defaultdict(int): term -> number of tweets containing the term
    idf -- defaultdict(float): term -> log(number of tweets / df[term]), rounded to 4 decimals
    title_index -- defaultdict(str): tweet id -> full text of the tweet
    """
    index = defaultdict(list)
    tf = defaultdict(list)  # term frequencies of terms in documents (documents in the same order as in the main index)
    df = defaultdict(int)  # document frequencies of terms in the corpus
    title_index = defaultdict(str)
    idf = defaultdict(float)

    tweets = create_tweets(tweets_json)

    for tweet in tweets:
        title_index[tweet['id']] = tweet['full_text']

        # postings of the current tweet only: term -> [tweet_id, positions]
        current_page_index = {}
        for position, term in enumerate(tweet['tokens']):
            if term in current_page_index:
                # term already seen in this tweet: record one more position
                current_page_index[term][1].append(position)
            else:
                # first occurrence in this tweet: start a new posting
                # 'I' indicates unsigned int (int in Python)
                current_page_index[term] = [tweet['id'], array('I', [position])]

        # Euclidean norm of the raw term counts of this tweet; it is the same
        # for all terms of the tweet. posting ==> [tweet_id, positions], so the
        # raw count of a term is len(posting[1]).
        norm = math.sqrt(sum(len(posting[1]) ** 2 for posting in current_page_index.values()))

        for term, posting in current_page_index.items():
            # normalized tf = occurrences of the term in this tweet / norm
            # (the count is the number of positions, not the length of the
            # two-element posting itself)
            tf[term].append(np.round(len(posting[1]) / norm, 4))
            # one more tweet contains this term
            df[term] += 1
            # merge the posting of the current tweet into the main index
            index[term].append(posting)

    # idf depends only on the final document frequencies, so it is computed
    # once after all tweets have been processed
    for term in df:
        idf[term] = np.round(np.log(float(len(tweets) / df[term])), 4)

    return index, tf, df, idf, title_index

# Build the tf-idf structures for the loaded tweets and print the returned
# tuple (index, tf, df, idf, title_index) in full.
print(create_tfidf_index(tweets_json))

(defaultdict(<class 'list'>, {'intern': [[1448215930178310144, array('I', [0])], [1448195167048118274, array('I', [0])], [1447557052147146752, array('I', [1])], [1444284551728033793, array('I', [0])], [1443984064441114628, array('I', [0])], [1443924928684826632, array('I', [0])], [1443913395586142209, array('I', [1])], [1443887511395356672, array('I', [1])], [1443534762132582404, array('I', [0])], [1442930655902322688, array('I', [0])], [1442849339349651456, array('I', [18])], [1442848994103898113, array('I', [13])], [1442836276307275776, array('I', [13])], [1442493845028102145, array('I', [6, 13])], [1442474165735407616, array('I', [7])], [1441463208620011520, array('I', [6])], [1440973666855043074, array('I', [10])], [1440923415117017090, array('I', [0])], [1439194773408071682, array('I', [2])], [1437741809753538561, array('I', [8])], [1437436684023615495, array('I', [9])], [1437436381479964672, array('I', [4])], [1435613091987927042, array('I', [8])], [1435538492503863299, array('I'