In [5]:
import pandas as pd
import re
import os
import numpy as np
import time
import datetime

# Folium
import folium 

# geotext
from geotext import GeoText
import geocoder

# SciPy
#from scipy import stats

# NLTK
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# fuzzywuzzy
from fuzzywuzzy import fuzz # install python-Levenshtein afterwards for speeding up things

# gensim
from gensim.models import TfidfModel
from gensim import corpora, models
from gensim.similarities import Similarity
from gensim.test.utils import datapath, get_tmpfile

# sklearn
#from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package punkt to /home/lefko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/lefko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/lefko/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
# global properties
# The project root defaults to the original checkout location but can be overridden
# through the UT_HEALTH_WORKDIR environment variable, so the notebook also runs elsewhere.
workDir = os.environ.get('UT_HEALTH_WORKDIR', '/home/lefko/git/ut-health-project')
dbfile = workDir + '/db/tweets.csv'  # raw tweets (CSV without a header row)
states = workDir + '/data/us-states-abbr.csv'  # US state names and their abbreviations

# placeholder tokens inserted by the tweet preprocessing plus the English NLTK stopwords
_stopwords = ['AT_USER', 'URL'] + stopwords.words('english')
_lem = WordNetLemmatizer()
# number of tweets to keep after loading; 0 disables the truncation
sample = 0

In [7]:
# check if the tweets file can be accessed
# The context manager closes the handle only if it was actually opened; the former
# `finally: f.close()` raised a NameError whenever `open` itself failed.
try:
    with open(dbfile):
        pass
except IOError:
    print("File not accessible")


In [8]:
'''
    some utility function
'''
def readCitiesFromFile(state):
    '''
        Reads <workDir>/data/cities_<state>.txt and returns its lines,
        stripped and lower-cased, as a DataFrame with a single 'city' column.
    '''
    path = workDir + '/data/cities_' + state + '.txt'
    with open(path) as handle:
        names = [row.strip().lower() for row in handle]

    return pd.DataFrame(names, columns = ['city'])

def readKeywords():
    '''
        Reads <workDir>/data/keywords.txt, prints the stripped, lower-cased
        lines as a list and returns that list.
    '''
    with open(workDir + '/data/keywords.txt') as handle:
        keywords = [row.strip().lower() for row in handle]

    print(keywords)
    return keywords


In [9]:
cities_colorado = readCitiesFromFile('co') # Colorado cities
cities_california = readCitiesFromFile('ca') # California cities
cities_massachussetts = readCitiesFromFile('ma') # Massachussetts cities
# all 3 of them combined: `+` on DataFrames adds element-wise (row-aligned string
# concatenation, NaN where the frames differ in length), so the frames are stacked instead
cities_all = pd.concat(
    [cities_california, cities_colorado, cities_massachussetts],
    ignore_index=True,
)

states_abbr = pd.read_csv(states, sep=',') # official abbreviations for the states
states_abbr = states_abbr.applymap(lambda s:s.lower()) # lower-case every cell

In [10]:
# take the second column of the states table and strip surrounding whitespace
abbrs = [abbr.strip() for abbr in states_abbr.iloc[:, 1]]

# the abbreviations are treated as stopwords as well
_stopwords += abbrs
print(_stopwords)

['AT_USER', 'URL', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same

In [11]:
%time tweets = pd.read_csv(dbfile, names=['tweet_id', 'created', 'loc', 'text'], header=None) # create a new DataFrame, which holds the tweets

if(sample > 0):
    tweets = tweets[0:sample]

print('tweets dataframe created!')
tweets = tweets[tweets['loc'].isnull() == False] # sort out NaN places
tweets = tweets[tweets['loc'].str.contains('\d') == False] # sort out tweets where the location contains numbers
tweets = tweets.drop(['tweet_id', 'created'], axis=1) # they are not needed, actually
%time tweets = tweets.applymap(lambda s:s.lower() if type(s) == str else str(s)) # lower all strings
print('all strings are lowered')

CPU times: user 1.53 s, sys: 201 ms, total: 1.74 s
Wall time: 1.8 s
tweets dataframe created!
CPU times: user 919 ms, sys: 91.7 ms, total: 1.01 s
Wall time: 1.02 s
all strings are lowered


In [12]:
# How many tweets are there in total now?
print(len(tweets))
# remember the post-filtering row count in `sample`, replacing the configured value
sample = len(tweets)

421829


In [13]:
# number of distinct location strings
unique_locations = tweets['loc'].unique()
print(unique_locations.size, 'unique locations found')
# mean number of characters per tweet
tweet_lengths = tweets['text'].str.len()
print(np.mean(tweet_lengths), 'average length of a tweet')

40217 unique locations found
76.60094493266229 average length of a tweet


## NLP Tweet processing
### Tokenization, Stopwords and Lemmatization

In [14]:
%%time
# tweet tokenization and stopword removal
def processTweet(tweet_text):
    '''
        Turns the raw text of one tweet into a list of lemmatized tokens.

        URLs and @mentions are replaced by the placeholders 'URL' and 'AT_USER',
        the leading '#' of hashtags is dropped, the text is tokenized, tokens that
        are not purely alphabetic or that appear in `_stopwords` are discarded and
        the remaining tokens are lemmatized as verbs.
    '''
    # raw strings: '\.' and '\s' are invalid escape sequences in ordinary string literals
    tweet_text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet_text) # URLs -> placeholder
    tweet_text = re.sub(r'@[^\s]+', 'AT_USER', tweet_text) # usernames -> placeholder
    tweet_text = re.sub(r'#([^\s]+)', r'\1', tweet_text) # remove the # in #hashtag
    tokens = [word for word in word_tokenize(tweet_text) if word.isalpha() and word not in _stopwords]
    return [_lem.lemmatize(word, 'v') for word in tokens] # lemmatize

# Run the preprocessing over every tweet and store the token lists in a new column.
# The column is assigned in one step: the former chained assignment
# `tweets['processed_text'][0:sample] = ...` writes through a temporary Series
# (SettingWithCopyWarning) and only covered all rows if `sample` equalled len(tweets).
processed_tweets = [processTweet(text) for text in tweets['text']]
tweets['processed_text'] = processed_tweets

CPU times: user 3min 44s, sys: 626 ms, total: 3min 44s
Wall time: 3min 45s


In [15]:
# preview the first ten tweets together with their processed tokens
%time tweets.head(10)

CPU times: user 519 µs, sys: 0 ns, total: 519 µs
Wall time: 499 µs


Unnamed: 0,loc,text,processed_text
0,all over the place,@jordanmastagni3 \r\n\r\nthank you for the fol...,"[thank, follow, jordan]"
1,los angeles ca,can you recommend anyone for this #skilledtrad...,"[recommend, anyone, skilledtrade, job, palmdal..."
2,redding ca,rehearsals have been going well. we’re set to ...,"[rehearsals, go, well, set, film, rest, new, m..."
3,pine and buckeye az,2013 please read it. let's media spread their ...,"[please, read, let, media, spread, lie, well, ..."
4,montclair ca,@gennefer @davidgrosstv very cool!!!!,[cool]
5,hanford ca,@mholder95 i thought it was hilarious. if you ...,"[think, hilarious, laugh, lose, mind, humor, s..."
6,hollister ca,these days i balance all the hate out with the...,"[days, balance, hate, love]"
7,marina del rey ca,works out good for the canes since we just got...,"[work, good, can, since, get, beat, bobby, bou..."
8,palm springs calif,impeach nancy?,"[impeach, nancy]"
9,laguna beach ca,getting a little love jeanninesrestaurants whi...,"[get, little, love, jeanninesrestaurants, spur..."


### Vectorization
Let's represent our tweets as numerical values, i.e. vectors in a vector space

Note that a ‘token’ typically means a ‘word’. A ‘document’ typically refers to a ‘sentence’ or ‘paragraph’, and a ‘corpus’ is typically a collection of documents, each represented as a bag of words.

https://dev.to/coderasha/compare-documents-similarity-using-python-nlp-4odp

In [16]:
%%time

# Gensim works on integer word ids rather than on the words (tokens) themselves.
# A Dictionary object maps every word to a unique id, so the token lists of all
# tweets are first handed to corpora.Dictionary() ...
corpDict = corpora.Dictionary(tweets['processed_text'])
# ... and every tweet is then converted to its bag-of-words representation
corpus = list(map(corpDict.doc2bow, tweets['processed_text']))

CPU times: user 12.7 s, sys: 104 ms, total: 12.8 s
Wall time: 12.9 s


The corpus is a bag-of-words representation: for each document it stores the id of every word together with its frequency (i.e. the number of times each word occurs in the sentence).

In [17]:
# Term Frequency – Inverse Document Frequency (TF-IDF) is also a bag-of-words model,
# but unlike the plain corpus it down-weights tokens (words) that appear frequently
# across documents.
#
# The weight is the product of a local component (TF: how often the word shows up in
# the document) and a global component (IDF: how rare the word is in the corpus),
# optionally normalized to unit length. In simple terms: words that occur in many
# documents get smaller weights.
tf_idf = TfidfModel(corpus=corpus)

## Setup Similarity measure

In [18]:
# illness-related keywords, read from the keywords file
kws = readKeywords()
print('%d keywords loaded!' % len(kws))
# open question: why not use WordNet synonyms here?

['ill', 'sick', 'cold', 'flu', 'influenza', 'disease', 'weak', 'cough', 'headache', 'stomach']
10 keywords loaded!


In [19]:
%%time
# Path prefix for the on-disk similarity index, obtained from gensim's get_tmpfile helper.
# NOTE(review): the earlier comment claimed the index is stored in the 'data' directory;
# get_tmpfile presumably points into the system temp directory instead - confirm.
index_temp = get_tmpfile('index')
'''
    Similarity builds an index for a given set of documents. 
    Once the index is built, you can perform efficient queries like “Tell me how similar is this query document to each document in the index?”. The result is a    vector of numbers as large as the size of the initial set of documents, that is, one float for each index document. Alternatively, you can also request only the top-N most similar index documents to the query.
'''
# index the TF-IDF weighted corpus; the feature count is the size of the dictionary
sims = Similarity(index_temp,tf_idf[corpus], num_features=len(corpDict))

# convert the keywords to a bag-of-words using the existing dictionary
# NOTE(review): doc2bow is called without allow_update, so it presumably does not add
# the keywords to the dictionary; keywords unknown to the corpus would be ignored - confirm.
kws_bow = corpDict.doc2bow(kws)

CPU times: user 2min 27s, sys: 936 ms, total: 2min 28s
Wall time: 2min 28s


In [21]:
%%time
# perform a similarity query against the corpus:
# first weight the keyword bag-of-words with the fitted TF-IDF model ...
kws_bow_tf_idf = tf_idf[kws_bow]
# print(document_number, document_similarity)
#print('Comparing Result:', sims[kws_bow_tf_idf]) 
# ... then query the index with it; the result is kept in `similarities`
similarities = sims[kws_bow_tf_idf]

CPU times: user 56.4 ms, sys: 4 ms, total: 60.4 ms
Wall time: 61.8 ms


In [22]:
# positions of all tweets whose similarity to the keyword query exceeds 0.15
idx = np.where(similarities > .15)
similar_tweets = tweets.iloc[idx]
print(similar_tweets[['processed_text', 'text', 'loc']])
print(similar_tweets['loc'].unique())

processed_text  \
50                                       [sick, everyone]   
269                                            [headache]   
1034    [vet, rescue, cough, think, kennel, cough, go,...   
1384                                   [die, stomach, do]   
1406                    [headache, morning, whole, level]   
...                                                   ...   
510817                       [wonder, catch, flu, flight]   
511019                       [never, felt, sick, stomach]   
511481  [update, nearly, month, deathly, sick, finally...   
511579                                      [cold, worth]   
511604  [someone, buy, ill, able, buy, posca, pen, ill...   

                                                     text               loc  
50                                      sick of everyone.       sur califas  
269                                   i have a headache 🤕        california  
1034    at the vet with my 18 y/o rescue. has a cough....    california usa 

838 tweets are somewhat similar to, or contain some of, our keywords in question

In [23]:
# Final step (?): identify actual geolocation entities within the location
# keep only the tweets selected by the similarity threshold
disease_tweets = tweets.iloc[idx]
# Disabled draft: look up city names in each location with GeoText and store them
# in a `real_loc` column.
#for index, t in disease_tweets.iterrows():
#    places = GeoText(t['loc'].title()) # first letter needs to be uppercase
#    disease_tweets = disease_tweets.append({'real_loc':places.cities if len(places.cities) > 0 else ''}, ignore_index = True)
    #tweets.iloc[index]['real_loc'] = places.cities if len(places.cities) > 0 else None
    #print(places.country_mentions)
#print(GeoText('new York').cities)

In [26]:
# preview the first 50 keyword-related tweets
disease_tweets.head(50)

Unnamed: 0,loc,text,processed_text
50,sur califas,sick of everyone.,"[sick, everyone]"
269,california,i have a headache 🤕,[headache]
1034,california usa,at the vet with my 18 y/o rescue. has a cough....,"[vet, rescue, cough, think, kennel, cough, go,..."
1384,omashu,i’m dying my stomach done,"[die, stomach, do]"
1406,deeetroit,the headache this morning is on a whole other ...,"[headache, morning, whole, level]"
1557,southwest atlanta,this how that flu medicine had me last night,"[flu, medicine, last, night]"
1746,houston,(cough. iowans. cough.),"[cough, iowans, cough]"
1856,mars,i’m weak 😂,[weak]
2193,paradise,i’m just so sick of waiting,"[sick, wait]"
2197,chicago il,inexplicably! i woke up with a headache,"[inexplicably, wake, headache]"


In [25]:
# Now let's plot the tweets together with their locations on a map.
# A tweet is drawn at the first city GeoText recognises in its location field;
# tweets without a recognisable or resolvable city are skipped.
from html import escape  # only needed here: tweet text is untrusted and ends up in HTML popups

latitude = 37.0902 # USA
longitude = -95.7129 # USA

# GeoNames user name for the web service; can be overridden through the GEONAMES_USERNAME
# environment variable, the previously hard-coded value remains the default
geonames_user = os.environ.get('GEONAMES_USERNAME', 'lefkokills')

tweets_map = folium.Map(location=[latitude, longitude], zoom_start=5)
city_coords = {}  # city name -> (lat, lng), or None if the lookup failed; one request per city
for _, t in disease_tweets.iterrows():
    # check if a real location name can be inferred (GeoText needs capitalised words)
    places = GeoText(t['loc'].title())
    if len(places.cities) == 0:
        continue

    city = places.cities[0]
    if city not in city_coords:
        gn = geocoder.geonames(city, key=geonames_user) # access geonames webservice
        found = gn.ok and gn.lat is not None and gn.lng is not None
        city_coords[city] = (gn.lat, gn.lng) if found else None

    coords = city_coords[city]
    if coords is None:
        continue  # the web service returned no coordinates; a marker cannot be placed

    folium.CircleMarker(
        list(coords),
        radius=1.5,
        # escape both parts so markup inside a tweet cannot inject HTML into the saved map
        popup=('City: ' + escape(city) + '<br>'
               'Tweet: ' + escape(t['text'])),
        color="#007849",
        fill=True,
        fill_opacity=0.7
    ).add_to(tweets_map)

tweets_map.save(workDir + '/tweets_and_locations_' + datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S') + '.html')
tweets_map

## STOP HERE, BACKUP

In [0]:
%%cache
'''
With the need to do text clustering at sentence level there will be one extra step for moving from word level to sentence level. For each sentence from the set of sentences, word embedding of each word is summed and in the end divided by number of words in the sentence. So we are getting average of all word embeddings for each sentence and use them as we would use embeddings at word level
'''
def sent_vectorizer(sent, model):
    '''
        Averages the embeddings of the words of one sentence.

        sent:  iterable of words (tokens)
        model: mapping-like object; model[word] yields the embedding of a word and
               raises KeyError for words it does not know

        Returns the element-wise mean of the embeddings of all known words as a numpy
        array, or an empty array if no word of the sentence is known to the model.
    '''
    sent_vec = []
    numw = 0
    for w in sent:
        try:
            word_vec = model[w]
        except KeyError:
            # unknown word: leave it out of the average (other errors are no longer swallowed)
            continue
        sent_vec = word_vec if numw == 0 else np.add(sent_vec, word_vec)
        numw += 1

    if numw == 0:
        # nothing to average; avoid dividing by zero
        return np.asarray(sent_vec)
    return np.asarray(sent_vec) / numw
  
  
# one averaged embedding per preprocessed tweet
X = [sent_vectorizer(tokens, w2vmodel) for tokens in tweets['processed_text']]

print('%d sentences are vectorised' % len(X))

### Tweet Clustering (not yet done)

In [0]:
%%cache
def printLabels(labels, model):
    '''
        Prints one "<word>:<label>" line per vocabulary word of the model;
        the word at position i is paired with labels[i].
    '''
    vocabulary = list(model.wv.vocab)
    for position in range(len(vocabulary)):
        term = vocabulary[position]
        print(term + ":" + str(labels[position]))

# Cluster the tweet vectors into 50 groups with k-means.
# NOTE(review): the KMeans import is commented out in the imports cell, and `tweet_vecs`
# is not assigned anywhere in the visible notebook - this cell presumably cannot run as is.
km = KMeans(n_clusters=50, init='k-means++', max_iter=100, n_init=1, verbose=True)

print("Clustering sparse data with %s" % km)
%time km.fit(tweet_vecs)

labels = km.labels_ # assigned labels
centroids = km.cluster_centers_ # cluster centroids
#printLabels(labels, w2vmodel)


## A simpler approach (backup plan)

In [0]:
%%cache
%%time
# use fuzzy-matching to look for tweets containing these words (or similar ones)
# https://www.datacamp.com/community/tutorials/fuzzy-string-python
def simRatio(tweet_token, keyword):
    '''
        similarity between a tweet token and a keyword, as computed by fuzz.ratio
    '''
    score = fuzz.ratio(tweet_token, keyword)
    return score

from operator import itemgetter
def max_val(l, i):
    '''
        Looks at element i of every entry in l and returns a tuple
        (position, value) for the largest one; the first one wins on ties.
    '''
    column = [entry[i] for entry in l]
    return max(enumerate(column), key=lambda pair: pair[1])

# Score every tweet by the fuzzy similarity of each of its tokens to each keyword and
# keep the tweets whose mean ratio exceeds 25.
# The rows are collected in a list and turned into a DataFrame once: growing the frame
# with DataFrame.append inside the loop is quadratic and no longer available in pandas 2.
ratio_columns = ['id', 'ratio_mean', 'ratio_clean', 'ratio_median', 'tokens']
ratio_records = []

# `tweet_pos` is the position of the tweet within tweets['processed_text']
# (the builtin `id` is no longer shadowed by a counter variable)
for tweet_pos, tweet_tokens in enumerate(tweets['processed_text']):
    ratio = [simRatio(token, keyword) for token in tweet_tokens for keyword in kws]
    if not ratio:
        continue  # nothing to score; np.mean([]) would only yield nan plus a warning

    ratio_mean = np.mean(ratio)
    if ratio_mean > 25:
        ratio_records.append({
            'id': tweet_pos,
            'ratio_mean': ratio_mean,
            'ratio_clean': np.asarray(ratio) / len(tweet_tokens),
            'ratio_median': np.median(ratio),
            'tokens': tweet_tokens,
        })

ratios_per_tweet = pd.DataFrame(ratio_records, columns=ratio_columns)

print(len(ratios_per_tweet), 'tweets found')

In [0]:
%%cache
# show the fuzzy-matching results
ratios_per_tweet

In [0]:
%%cache
# only the tweets whose mean ratio is above 30
ratios_per_tweet[ratios_per_tweet['ratio_mean'] > 30]