## Import important libraries 


In [None]:
import pandas as pd
import numpy as np
import os 
import glob
import time 
import re
import nltk
from collections import Counter
from string import punctuation
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tag import StanfordPOSTagger
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download("averaged_perceptron_tagger") 


[nltk_data] Downloading package punkt to /home/kidist/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/kidist/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## load the dataset 

In [None]:
# Get all file path 

path = 'Health-Tweets'
all_files = glob.glob(path + "/*.txt")
# read dataset 
df_all = []
for filename in all_files:
    df= pd.read_csv(filename,sep='|',header=None,error_bad_lines=False,encoding="iso-8859-15",warn_bad_lines=False)
    df_all.append(df)


In [None]:
# merge all dataframes into a single dataframe
merged_df = pd.concat(df_all, axis=0, ignore_index=True)
merged_df.columns

Int64Index([0, 1, 2], dtype='int64')

In [None]:
# rename the columns name
merged_df.columns=["user_id","date","tweet"]
merged_df.columns

Index(['user_id', 'date', 'tweet'], dtype='object')

In [None]:
merged_df.head()

Unnamed: 0,user_id,date,tweet
0,586282503981375488,Thu Apr 09 21:40:16 +0000 2015,Los Angeles closes 500 medical marijuana shops...
1,586278524748750848,Thu Apr 09 21:24:27 +0000 2015,U.S. cuts poultry export forecast as deadly bi...
2,586273441801699328,Thu Apr 09 21:04:15 +0000 2015,Fears over Roundup herbicide residues prompt p...
3,586254712523096068,Thu Apr 09 19:49:50 +0000 2015,Liberia watchdog says some Ebola funds unaccou...
4,586243976333787137,Thu Apr 09 19:07:10 +0000 2015,Diabetes devices may interfere with avalanche ...


In [None]:
merged_df.shape

(62817, 3)

The total number of tweets in the dataset is 62817

## Split the texts into monthly  intervals

In [None]:
#  Convert string date to Datetime objects
merged_df['date'] = pd.to_datetime(merged_df['date'])


In [None]:
# Create new column month-year. This will be used to split the data into monthly interval
def get_month_and_year(x):
    return "{}-{}".format(x.month,x.year)
merged_df["month-year"] = merged_df["date"].apply(get_month_and_year)
merged_df["tweet"][0]

'Los Angeles closes 500 medical marijuana shops, but hundreds remain http://reut.rs/1CvkXmm'

In [None]:
# remove the tweet URL , number ,and username, because this items will not provide useful information. Specially
# numbers increase the number of unique words without any  siginificant gain in the information
def remove_url_num_uname(text):
    output = re.sub('http[s]?://\S+', '', text) 
    output = re.sub("\.{2,}","",output)
    output = re.sub('@[^\s]+','',output)
    output= re.sub("[^a-zA-Z]+"," ", output)
    return output
         

merged_df["tweet"]= merged_df["tweet"].apply(remove_url_num_uname)
merged_df["tweet"][0]


'Los Angeles closes medical marijuana shops but hundreds remain '

In [None]:
# group the dataframe into monthly interval
months_df = list(merged_df.groupby("month-year"))
months_df[21]

('3-2013',
                   user_id                      date  \
 7619   318416481232769024 2013-03-31 17:36:19+00:00   
 7620   318311103203520512 2013-03-31 10:37:35+00:00   
 7621   318199451233693697 2013-03-31 03:13:55+00:00   
 7622   318092722751823872 2013-03-30 20:09:49+00:00   
 7623   318062748787417088 2013-03-30 18:10:43+00:00   
 ...                   ...                       ...   
 50847  307555197092249600 2013-03-01 18:17:27+00:00   
 50848  307554254913163264 2013-03-01 18:13:43+00:00   
 50849  307553650211946497 2013-03-01 18:11:18+00:00   
 50850  307537038373179393 2013-03-01 17:05:18+00:00   
 50851  307536712463159296 2013-03-01 17:04:00+00:00   
 
                                                    tweet month-year  
 7619   Today s getfit tip We firmly believe that choc...     3-2013  
 7620               Girl tackles marathons on continents      3-2013  
 7621               It s FILTHY and lives in your wallet      3-2013  
 7622   Today s getfit tip Reac

In [None]:
len(months_df)

47

There are 47 months in the five years of data. This shows that some of the months are missing. 


## Text preprocessing 

In [None]:
# sentence tokenization function 
def get_tokens(text):
    return word_tokenize(text)


In [None]:
# apply word toknization 
for month,month_df in months_df:
    month_df["tokens"] =  month_df["tweet"].apply(get_tokens)

In [None]:
months_df[0][1].iloc[0]["tokens"]

['For', 'Kids', 'Laughter', 'Really', 'May', 'Be', 'the', 'Best', 'Medicine']

In [None]:
# function to remove stop words, panctuation , and empty string and lowercasing
stoplist = set(stopwords.words('english') + list(punctuation))
def remove_stopwords(tokens):
    output = []
    for word in tokens:
        if word not in stoplist and word.strip() != "":
            output.append(word.strip().lower())
    return output


In [None]:
# remove stop words, panctuation , and empty string and lowercasing
for month,month_df in months_df:
    month_df["tokens"] =  month_df["tokens"].apply(remove_stopwords)

In [None]:
months_df[0][1].iloc[0]["tokens"]

['for', 'kids', 'laughter', 'really', 'may', 'be', 'best', 'medicine']

In [None]:
#  Building the POS mapper for token tags
from nltk.corpus.reader.wordnet import VERB, NOUN, ADJ, ADV
dict_pos_map = {
    # Look for NN in the POS tag because all nouns begin with NN
    'NN': NOUN,
    'VB':VERB,
    'JJ' : ADJ,
    'RB':ADV,
    'VBG':VERB
}

In [None]:
# getting pos information
nltk.pos_tag(months_df[0][1].iloc[0]["tokens"]) 

[('for', 'IN'),
 ('kids', 'NNS'),
 ('laughter', 'RBR'),
 ('really', 'RB'),
 ('may', 'MD'),
 ('be', 'VB'),
 ('best', 'JJS'),
 ('medicine', 'NN')]

In [None]:
 # word lemitization function
def lemmitize_tokens(tokens):
    t = WordNetLemmatizer()
    outputs = []
    for word,pos in nltk.pos_tag(tokens):
        if pos in dict_pos_map:
            lemmatized_word = t.lemmatize(word, pos= dict_pos_map[pos])
            outputs.append(lemmatized_word)
        else:
            outputs.append(t.lemmatize(word))
    return outputs


In [None]:
# applay word lematization 
for month,month_df in months_df:
    month_df["lemm_tokens"] =  month_df["tokens"].apply(lemmitize_tokens)

In [None]:
months_df[0][1].head()

Unnamed: 0,user_id,date,tweet,month-year,tokens,lemm_tokens
32996,164481705472831488,2012-01-31 22:54:29+00:00,For Kids Laughter Really May Be the Best Medic...,1-2012,"[for, kids, laughter, really, may, be, best, m...","[for, kid, laughter, really, may, be, best, me..."
32997,164463284714610690,2012-01-31 21:41:17+00:00,FDA OKs Drug That Targets Rare Form of Cystic ...,1-2012,"[fda, oks, drug, that, targets, rare, form, cy...","[fda, ok, drug, that, target, rare, form, cyst..."
32998,164463283489878018,2012-01-31 21:41:17+00:00,Second Breast Cancer Surgery Sometimes Needed,1-2012,"[second, breast, cancer, surgery, sometimes, n...","[second, breast, cancer, surgery, sometimes, n..."
32999,164463282210611200,2012-01-31 21:41:16+00:00,Alternative to Colonoscopy Spots Cancers Too,1-2012,"[alternative, colonoscopy, spots, cancers, too]","[alternative, colonoscopy, spot, cancer, too]"
33000,164463280943939587,2012-01-31 21:41:16+00:00,Fatty Diet Before Pregnancy Linked to Gestatio...,1-2012,"[fatty, diet, before, pregnancy, linked, gesta...","[fatty, diet, before, pregnancy, linked, gesta..."


In [None]:
months_df[1][1].iloc[5]["tokens"]

['tv', 'ads', 'may', 'drinking', 'children', 'drink']

In [None]:
months_df[1][1].iloc[5]["lemm_tokens"]

['tv', 'ad', 'may', 'drink', 'child', 'drink']

## Unigram language model for every month

In [None]:
# function to builed word count for each month  and find total number of words occurrence 

def build_word_count(tokens):
    word_count = Counter()
    total_word_count = 0
    for li in tokens:
        total_word_count +=len(li)
        word_count.update(li)
    return dict(word_counts = word_count,total = total_word_count)


In [None]:
months_words_count = dict() # Frequency of words in a month.
for month,month_df in months_df:
    counts = build_word_count(month_df["lemm_tokens"])
    months_words_count[month]= counts
     

In [None]:
# for each month compute the probavlities of each word 
def compute_prob(word_counts,total):
    return { word:count/total for word,count in word_counts.items()}

In [None]:
# get the probablity of each words of the months 
months_words_prob = dict()
for month,month_words_count in months_words_count.items():
    probs = compute_prob(**month_words_count)
    months_words_prob[month]= probs


In [None]:
# unigram model
class UnigramLangugeModel:
    def __init__(self,month_words_prob):
        self.words =list( month_words_prob.keys())
        self.probs = list(month_words_prob.values())
        self.words_prob = month_words_prob
    def generate_word(self):
        return np.random.choice(self.words, p = self.probs)
    def generate_sentence(self,n):
        return " ".join([self.generate_word() for i in range(n)])
    def get_word_prob(self,word):
        if not word in self.words_prob:
            return 0
        return self.words_prob[word]

In [None]:
months_words_prob.keys()

dict_keys(['1-2012', '1-2013', '1-2014', '1-2015', '10-2011', '10-2012', '10-2013', '10-2014', '11-2011', '11-2012', '11-2013', '11-2014', '12-2011', '12-2012', '12-2013', '12-2014', '2-2012', '2-2013', '2-2014', '2-2015', '3-2012', '3-2013', '3-2014', '3-2015', '4-2012', '4-2013', '4-2014', '4-2015', '5-2012', '5-2013', '5-2014', '6-2011', '6-2012', '6-2013', '6-2014', '7-2011', '7-2012', '7-2013', '7-2014', '8-2011', '8-2012', '8-2013', '8-2014', '9-2011', '9-2012', '9-2013', '9-2014'])

In [None]:
model = UnigramLangugeModel(months_words_prob["1-2015"]) # Unigram model for January 2015 data.


In [None]:
model.generate_word() # Generate single word using the Unigram model

'meningitis'

In [None]:
model.generate_sentence(100) # Generate sentence with 100 words using the unigram model

'absolutely better opportunity disease exhaustion drink totally host take never house health notice apnea lab amp lead improve many danger say curly survivor complicated usntechchat dehydration pot medical amp adorable save success she bone mcdonald spill flu see safety please rt but health kind whole reason researcher rt outlive icm should take rt what inspired public truly aleppo say come i plea case glimpse time well nut call test drug appear see is donate how outbreak california privatisation child astrazeneca treatment chili check hard amgen hold care nh breast the clock a virus docs case e call make make pm'

In [None]:
# Building unigram model for each month 
lang_models = dict()
for month,month_words_count in months_words_count.items():
    model = UnigramLangugeModel(months_words_prob[month])
    lang_models[month]= model
    


In [None]:
lang_models.keys()


dict_keys(['1-2012', '1-2013', '1-2014', '1-2015', '10-2011', '10-2012', '10-2013', '10-2014', '11-2011', '11-2012', '11-2013', '11-2014', '12-2011', '12-2012', '12-2013', '12-2014', '2-2012', '2-2013', '2-2014', '2-2015', '3-2012', '3-2013', '3-2014', '3-2015', '4-2012', '4-2013', '4-2014', '4-2015', '5-2012', '5-2013', '5-2014', '6-2011', '6-2012', '6-2013', '6-2014', '7-2011', '7-2012', '7-2013', '7-2014', '8-2011', '8-2012', '8-2013', '8-2014', '9-2011', '9-2012', '9-2013', '9-2014'])

In [None]:
lang_models["1-2012"].generate_word()

'be'

## Calculate KL-divergence for these models and build pairwise representation 

![image.png](attachment:image.png)


### Comparing two models' probablities 

In [None]:
model_1= lang_models["1-2012"]
model_2 = lang_models["1-2013"]

In [None]:
model_1.get_word_prob("made")

0.000231000231000231

In [None]:
model_2.get_word_prob("made")

0.0003774297037176826

In [None]:
### Computing KL divergance between two models

In [None]:
def kl_divergence(p,q):
    dkl = p*np.log(p/q)
    return dkl
    

In [None]:
def dkl(model_1,model_2):
    output  = 0
    for word in model_1.words:
        p = model_1.get_word_prob(word)
        q = model_2.get_word_prob(word)
        if q != 0:
            output += kl_divergence(p,q)     
        
    return output

In [None]:
dkl(model_1,model_2)

0.6614172253206011

## Building pairwise matrix from the KL divergance result

In [None]:
months = list(lang_models.keys())


In [None]:
pairwise_matrix = np.zeros((len(months),len(months)))
pairwise_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
for i in range(len(months)):
    for j in range(len(months)):
        pairwise_matrix[i,j] = dkl(lang_models[months[i]],lang_models[months[j]])
        

In [None]:
print(pairwise_matrix)

[[0.         0.66141723 0.54392037 ... 0.28811025 0.46907418 0.73865714]
 [0.17912045 0.         0.30464566 ... 0.08876156 0.29059466 0.54169743]
 [0.12466507 0.37233368 0.         ... 0.10370461 0.11522305 0.32437324]
 ...
 [0.21528997 0.47138389 0.42181024 ... 0.         0.34383827 0.64964065]
 [0.16977653 0.47127292 0.22854255 ... 0.12713369 0.         0.43425258]
 [0.04152094 0.31527176 0.07992819 ... 0.15347459 0.07469613 0.        ]]


## Finding most added or removed words

On average if a words was most added or removed from  month to month it will have higher variance. Here I will use standard deviation to find most variying words.

In [None]:
# find unique words 
uniq_words = set()
for month in months :
    uniq_words.update(lang_models[month].words)
    


In [None]:
uniq_words = list(uniq_words)
uniq_words.sort()

In [None]:
len(uniq_words)

22084

In [None]:
months_all_words_count = np.zeros((len(uniq_words),len(months)))
months_all_words_count

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
for i in range(len(uniq_words)):
    for j in range(len(months)):
        if uniq_words[i] in months_words_count[months[j]]["word_counts"]:
            months_all_words_count[i,j] = months_words_count[months[j]]["word_counts"][uniq_words[i]]
    

In [None]:
words_stds = months_all_words_count.std(axis = 1)

In [None]:
topk = words_stds[words_stds.argsort()[-3:]]

In [None]:
topk

array([ 90.92331075, 173.79803351, 288.76498862])

In [None]:
words_stds.max()

288.7649886169951

In [None]:
months_all_words_count

array([[ 10., 242., 102., ...,  21.,  73., 174.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       ...,
       [  1.,   2.,   0., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.]])

In [None]:
words_stds

array([71.75456129,  0.20184751,  0.14430489, ...,  0.54494674,
        0.20184751,  0.14430489])

In [None]:
uniq_words = np.array(uniq_words)

In [None]:
uniq_words[words_stds.argsort()[-30:]]

array(['year', 'cancer', 'well', 'make', 'to', 'doctor', 'care', 'via',
       'today', 'may', 'nhs', 'how', 'hospital', 'food', 'drug', 'get',
       'nh', 's', 'q', 'amp', 'patient', 'say', 'new', 'u', 'the', 'a',
       'health', 'healthtalk', 'rt', 'ebola'], dtype='<U38')

In [None]:
words_stds[words_stds.argsort()[-30:]]

array([ 32.3405935 ,  32.34301501,  32.83351698,  32.91186084,
        33.10900826,  33.37950025,  33.79123991,  33.84435656,
        34.81225595,  35.46934228,  37.1933638 ,  37.78924664,
        39.85211458,  45.09747001,  46.51583549,  47.30085365,
        49.4313705 ,  49.65235412,  51.04128048,  51.11979203,
        53.29773219,  58.37271171,  65.34973987,  65.75797566,
        66.40850573,  71.75456129,  87.36493553,  90.92331075,
       173.79803351, 288.76498862])

As it can be seen here the top varying words are `ebola`, `rt`, `healthtalk`, etc.