In [1]:
import nltk
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
import json
import re
import string

In [2]:
path = 'dataset_tweets_WHO.txt'

# Parse the JSON file into a Python object (tweets_json).
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, which can fail on or mis-decode the non-ASCII
# characters (emojis, symbols) present in the tweet text.
with open(path, encoding='utf-8') as f:
    tweets_json = json.load(f)

In [3]:
def remove_punct(line):
    """
    Strip punctuation characters from a string, keeping '#'.

    Every character of string.punctuation except '#' is deleted, so hashtags
    stay recognisable for later processing.

    Argument:
    line -- string of text

    Returns:
    string of text with the punctuation characters removed
    """
    # All punctuation characters except the hash sign
    chars_to_drop = "".join(ch for ch in string.punctuation if ch != '#')

    # Translation table that maps each of those characters to "deleted"
    deletion_table = str.maketrans('', '', chars_to_drop)

    cleaned = line.translate(deletion_table)
    return cleaned

def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading the corpus only once."""
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words("english"))
        _english_stop_words._cache = cached
    return cached


def build_terms(line):
    """
    Preprocess the Tweet text by removing stop words, emojis, and punctuation and
    stemming, transforming to lowercase and returning the tokens of the text.

    Steps, in order: lowercase -> drop characters outside string.printable ->
    strip punctuation (except '#') -> split on whitespace -> drop URL and
    HTML-entity leftovers -> drop purely numeric tokens -> drop English stop
    words -> Porter-stem -> append the un-hashtagged form of every hashtag.

    Argument:
    line -- string (text) to be preprocessed

    Returns:
    line -- a list of tokens corresponding to the input text after the preprocessing
    """

    stemmer = PorterStemmer()
    # The stop-word corpus is read once and reused across calls instead of
    # being reloaded for every tweet.
    stop_words = _english_stop_words()

    # transform to lowercase
    line = line.lower()

    # remove characters outside string.printable (e.g. emojis and symbols)
    line = "".join(c for c in line if c in string.printable)

    # remove punctuation ('#' is kept so hashtags survive)
    line = remove_punct(line)

    # tokenize the text to get a list of terms (split() never yields '' tokens)
    line = line.split()

    # remove HTML-entity leftovers ('amp', 'qampa' -- presumably what remains of
    # '&amp;' / 'Q&amp;A' once punctuation is stripped) and urls, which start
    # with 'http' after punctuation removal
    line = [word for word in line
            if word not in ("qampa", "amp") and not word.startswith("http")]

    # remove standalone numbers e.g. '19' but not the 19 from 'covid19'
    line = [word for word in line if not word.isnumeric()]

    # remove stopwords
    line = [word for word in line if word not in stop_words]

    # perform stemming
    line = [stemmer.stem(word) for word in line]

    # add the unhashtagged word if its hashtag is present
    # e.g. if #covid is present, we also add covid as a token
    unhashtagged = [word.replace('#', '') for word in line if word.startswith('#')]

    # a token made only of '#' characters would otherwise contribute an
    # empty-string token, so empty results are skipped
    line = line + [term for term in unhashtagged if term]

    return line

In [5]:
# tweet_dict is our output data structure: it maps each Tweet ID to the list of
# preprocessed tokens produced by build_terms for that tweet's text.
# NOTE: defaultdict() without a default factory behaves like a plain dict
# (a missing key still raises KeyError); the container type is left unchanged.
tweet_dict = defaultdict()
for key in tweets_json:
    tweet = tweets_json[key]
    # named tweet_id rather than `id` so the built-in id() is not shadowed
    # for the rest of the notebook
    tweet_id = tweet['id']
    tweet_dict[tweet_id] = build_terms(tweet['full_text'])


# NOTE: the triple-quoted string below is disabled code, not a docstring.
# It is an alternative that would overwrite each tweet's 'full_text' in
# tweets_json with its token list instead of building tweet_dict.
# As the last expression of the cell, the string's value is echoed as the
# cell output; nothing inside it is executed.
"""
for key in tweets_json:
    key = str(key)
    tweets_json[key]['full_text'] = build_terms(tweets_json[key]['full_text'])
"""

"\nfor key in tweets_json:\n    key = str(key)\n    tweets_json[key]['full_text'] = build_terms(tweets_json[key]['full_text'])\n"

In [7]:
# Print every tweet ID followed by its list of preprocessed tokens
for key, tokens in tweet_dict.items():
    print("id: ", key, "\nText:\n", tokens)

id:  1448215930178310144 
Text:
 ['intern', 'day', 'disast', 'risk', 'reduct', '#openwho', 'launch', 'multiti', 'core', 'curriculum', 'help', 'equip', 'compet', 'need', 'work', 'within', 'public', 'health', 'emerg', 'respons', 'start', 'learn', 'today', '#ready4respons', 'openwho', 'ready4respons']
id:  1448208458604584960 
Text:
 ['#covid19', 'shown', 'health', 'emerg', 'disast', 'affect', 'entir', 'commun', 'especi', 'weak', 'health', 'system', 'vulner', 'popul', 'like', 'migrant', 'indigen', 'peopl', 'live', 'fragil', 'humanitarian', 'condit', 'covid19']
id:  1448195167048118274 
Text:
 ['intern', 'day', 'disast', 'risk', 'reduct', 'better', 'respond', 'emerg', 'countri', 'must', 'invest', 'health', 'care', 'system', 'achiev', 'gender', 'equiti', 'protect', 'marginalis', 'group', 'ensur', 'readi', 'equit', 'access', 'suppli', 'strong', 'resili', 'health', 'system']
id:  1448163447678676992 
Text:
 ['rt', 'whoafro', 'congratul', 'algeria', '#algeria', '16th', 'countri', '#africa', 'r

id:  1440671322602819596 
Text:
 ['everi', 'year', 'exposur', '#airpollut', 'estim', 'caus', 'million', 'prematur', 'death', 'result', 'loss', 'million', 'healthi', 'year', 'life', 'guidelin', 'recommend', 'air', 'qualiti', 'level', 'protect', 'health', 'save', 'live', 'around', 'airpollut']
id:  1440669759188529154 
Text:
 ['urg', 'countri', 'put', 'air', 'qualiti', 'guidelin', 'use', 'save', 'live', 'support', 'healthi', 'commun', 'help', 'address', 'climat', 'crisi', 'guidelin', 'come', 'import', 'time', 'ahead', 'cop26', '#climatechang', 'confer', 'novemberdrtedro', '#airpollut', 'climatechang', 'airpollut']
id:  1440669757275922441 
Text:
 ['job', 'health', 'sector', 'alon', 'requir', 'allofgovern', 'allofsocieti', 'approach', 'improv', 'govern', 'air', 'qualiti', 'monitor', '#airpollut', 'risk', 'engag', 'econom', 'sector', 'reduc', 'emissionsdrtedro', 'airpollut']
id:  1440669754948128774 
Text:
 ['dedic', 'support', 'countri', 'fight', '#airpollut', 'build', 'capac', 'knowledg'

id:  1428303310386311168 
Text:
 ['live', 'twitterspac', 'join', 'us', 'ask', '#climateact', 'question', 'drmarianeira', '#askwho', 'climateact', 'askwho']
id:  1428302340105330688 
Text:
 ['rt', 'join', '#climateact', 'talk', 'drmarianeira', 'cest', '#worldhumanitarianday', '#thehumanrac', 'climateact', 'worldhumanitarianday', 'thehumanrac']
id:  1428298318254415874 
Text:
 ['#worldhumanitarianday', 'climat', 'emerg', 'humanitarian', 'crisi', 'must', 'maximis', 'health', 'benefit', 'tackl', '#climatecrisi', 'avoid', 'worst', 'health', 'impact', 'promot', 'climateresili', 'health', 'system', 'everywher', 'worldhumanitarianday', 'climatecrisi']
id:  1428297727105019906 
Text:
 ['rt', 'whoafro', 'live', 'join', 'whoafro', 'press', 'confer', '#covid19', 'pandem', 'africa', '#ebola', 'outbreak', 'cote', 'divoir', '#marburg', 'outbr', 'covid19', 'ebola', 'marburg']
id:  1428284364035604483 
Text:
 ['climaterel', 'disast', 'like', 'flood', 'drought', 'heat', 'wave', 'doubl', 'past', 'year', 

id:  1419609283751260161 
Text:
 ['learn', 'strategi', 'prevent', 'monitor', 'manag', '#covid19', 'infect', 'amongst', 'health', 'worker', 'free', 'onlin', 'cours', '#openwho', 'covid19', 'openwho']
id:  1419554743064612866 
Text:
 ['rt', 'drtedro', 'visit', 'sever', '#covid19', 'vaccin', 'site', '#bahrain', 'today', 'hospit', 'exhibit', 'centr', 'shop', 'mall', 'congratula', 'covid19', 'bahrain']
id:  1419554544070107143 
Text:
 ['rt', 'drtedro', 'honour', 'opportun', 'open', '152nd', 'countri', 'offic', 'kingdom', 'offic', 'deliv', 'strateg', 'te']
id:  1419547416068595712 
Text:
 ['countri', 'offic', 'kingdom', '#bahrain', 'offici', 'open', '#healthforal', 'bahrain', 'healthforal']
id:  1419546493917319168 
Text:
 ['assur', 'who', 'support', 'kingdom', '#bahrain', 'continu', 'commit', 'work', 'promot', 'health', 'keep', 'world', 'safe', 'serv', 'vulner', 'bahrain', 'around', 'worlddrtedro', 'bahrain']
id:  1419546491547525120 
Text:
 ['framework', 'convent', 'intern', 'instrument', 

id:  1408450061059690497 
Text:
 ['drtedro', 'greec', 'team', 'norway', 'germani', 'deploy', 'follow', 'fire', 'refuge', 'camp', 'island', 'lesvo', 'support', '#covid19', 'respons', 'togeth', 'essenti', 'health', 'servicesdrtedro', 'covid19']
id:  1408449950720180225 
Text:
 ['drtedro', 'recent', 'deploy', 'emerg', 'medic', 'team', 'djibouti', 'papua', 'new', 'guinea', 'fiji', 'costa', 'ricadrtedro']
id:  1408449858365710339 
Text:
 ['drtedro', 'sinc', 'begin', '#covid19', 'pandem', 'facilit', 'deploy', 'intern', 'emt', 'expert', 'worldwid', 'team', 'provid', 'special', 'care', 'patient', 'addit', 'bed', 'capac', 'advic', 'local', 'health', 'provid', 'case', 'managementdrtedro', 'covid19']
id:  1408449677490536448 
Text:
 ['drtedro', 'countri', 'either', 'team', 'process', 'certifi', 'develop', 'system', 'qualityassur', 'nation', 'teamsdrtedro']
id:  1408449598276964352 
Text:
 ['drtedro', 'global', 'certifi', 'team', 'countri', 'gone', 'rigor', 'process', 'qualiti', 'assur', 'ensur', 