In [1]:
import pandas as pd
import langid
from nltk.tokenize import RegexpTokenizer
import nltk

excel = pd.ExcelFile('sample.xlsx')

In [2]:
excel.sheet_names

['2020-03-22',
 '2020-03-23',
 '2020-03-24',
 '2020-03-25',
 '2020-03-26',
 '2020-03-27',
 '2020-03-28',
 '2020-03-29']

## Put into dict

In [3]:
d = {}

In [4]:
# For every sheet (one per day) collect the English texts into d[sheet_name].
for n in excel.sheet_names:
    # Parse from the workbook that is already open instead of re-reading
    # 'sample.xlsx' from disk once per sheet.
    sheet = excel.parse(sheet_name = n)
    # Drop columns that are entirely empty and renumber the rows from 0.
    sheet = sheet.dropna(axis = 1, how = 'all').reset_index(drop = True)

    if sheet.columns[0] != 'text':
        # The first column is not named 'text': drop every row containing a
        # missing value, then promote the first remaining row to the header.
        # (Presumably the real header sits below blank/padding rows - verify
        # against the workbook.)
        sheet = sheet.dropna(axis = 0).reset_index(drop = True)
        sheet.columns = sheet.loc[0]
        sheet = sheet.drop(0).reset_index(drop = True)

    # Make sure every entry of the text column is a string.
    sheet['text'] = sheet['text'].astype(str)

    # Remove fully duplicated rows; drop=True avoids leaving a stray 'index'
    # column behind.
    sheet = sheet.drop_duplicates().reset_index(drop = True)

    # Keep only the texts that langid classifies as English.
    d[n] = [text for text in sheet['text'] if langid.classify(text)[0] == 'en']

## 1. Tokenisation & lowercasing

In [5]:
# A token is a run of ASCII letters, optionally followed by one hyphen or
# apostrophe and another run of letters (e.g. "self-isolation", "don't").
tokenizer = RegexpTokenizer(r"[a-zA-Z]+(?:[-'][a-zA-Z]+)?")

# For each day: one list of lowercased tokens per text.
token_dict = {
    day: [tokenizer.tokenize(text.lower()) for text in texts]
    for day, texts in d.items()
}

In [6]:
from itertools import chain
import itertools
#from nltk.tokenize import MWETokenizer
# Merge each day's per-text token lists into a single flat token list.
# Replacing values while iterating is safe because no key is added or removed.
for day, per_text_tokens in token_dict.items():
    token_dict[day] = [tok for tokens in per_text_tokens for tok in tokens]

## 2. Remove stop words

### 2.1 Remove context independent stopwords

In [8]:
# Load the context-independent stop words (whitespace-separated in the file).
# The context manager closes the file even if reading raises.
with open('stopwords_en.txt', 'r') as f:
    stop_words = set(f.read().split())

# Remove the stop words from every day's token list.
# Slice assignment updates each list in place in a single pass, instead of
# calling list.pop(i) (itself O(n)) once per removed token.
for k in token_dict.keys():
    token_dict[k][:] = [tok for tok in token_dict[k] if tok not in stop_words]

### 2.2 Remove context dependent stopwords

In [9]:
# Document frequency: map each token to the set of days in which it occurs.
doc_freq_dict = {}

for day, tokens in token_dict.items():
    for tok in tokens:
        # Create the set on first sight of the token, then record the day.
        doc_freq_dict.setdefault(tok, set()).add(day)
    

In [10]:
# Tokens that occur in more than 60 distinct days.
# NOTE(review): the sheet list printed earlier has only 8 days, so with this
# workbook no token can exceed 60 and the set stays empty - confirm the cutoff.
removed_set_most = {
    tok for tok, days in doc_freq_dict.items() if len(days) > 60
}
        

In [11]:
# Drop the over-frequent tokens from every day's token list (in place).
for day_tokens in token_dict.values():
    day_tokens[:] = [tok for tok in day_tokens if tok not in removed_set_most]

# 3. Bigram 

## Flatten dict to extract bigram

In [7]:
# Concatenate every day's tokens, in dict order, into one flat word list.
all_words = [tok for day_tokens in token_dict.values() for tok in day_tokens]

### Extract the top 200 bigram tokens using PMI

In [8]:
import nltk
## Find the 200 best bigrams, ranked by pointwise mutual information (PMI).
## The finder scans adjacent pairs in `all_words`.
## NOTE(review): no minimum-frequency filter is applied before ranking; PMI
## tends to favour very rare pairs, and the output below looks like that.
## Consider `finder.apply_freq_filter(...)` - confirm the intended behaviour.

bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_words(all_words)
bi_token_200 = finder.nbest(bigram_measures.pmi, 200)

In [9]:
len(bi_token_200)

200

In [10]:
bi_token_200

[('aabwgr', 'kuu'),
 ('aacizy', 'cjn'),
 ('aadhar', 'passport'),
 ('aaek', 'etwt'),
 ('aaeq', 'mdqrq'),
 ('aap', 'kahe'),
 ('aapl', 'afgw'),
 ('abarrmla', 'jennymikakos'),
 ('abbsbgg', 'cosmic'),
 ('abby', 'ogah'),
 ('abcpoppins', 'edwardjdavey'),
 ('abd', 'krakstv'),
 ('abh', 'toh'),
 ('abhishekaitc', 'derekobrienmp'),
 ('ableg', 'abpoli'),
 ('aboutlastnight', 'groceryshopping'),
 ('abouts', 'bsc'),
 ('abphe', 'cmc'),
 ('abpoli', 'canlab'),
 ('abramjee', 'maggsnaidu'),
 ('abu', 'ubaidah'),
 ('abvp', 'ncc'),
 ('abzbaa', 'xnv'),
 ('acclimatise', 'env'),
 ('acgrgk', 'wagsntales'),
 ('acl', 'kimkardashian'),
 ('aclfcf', 'bto'),
 ('acoustic', 'antimainstream'),
 ('acthealth', 'nthealth'),
 ('actress', 'angelinajolie'),
 ('acz', 'fpzl'),
 ('adamliptak', 'mikiebarb'),
 ('adblvvve', 'repangiecraig'),
 ('addxfg', 'tee'),
 ('ademjswva', 'hlatseentle'),
 ('adesh', 'nkmk'),
 ('adesso', 'hover'),
 ('adhanom', 'ghebreyesus'),
 ('adhvik', 'mahajan'),
 ('adhvikmahajan', 'shashankvyas'),
 ('adia', 'be

## change to bigram token

In [11]:
from nltk.util import ngrams

# For each day, the list of adjacent token pairs (bigrams) of its token list.
bi_token_dict = {
    day: list(ngrams(tokens, 2)) for day, tokens in token_dict.items()
}

### Check document frequency of bigram token

In [12]:
bi_token_dict_cut = {}

In [13]:
# Set form of the top-200 list for constant-time membership tests.
bi_token_200_set = set(bi_token_200)

# Per day, keep only the bigrams that are among the top-200 PMI bigrams.
for day, pairs in bi_token_dict.items():
    bi_token_dict_cut[day] = [
        pair for pair in pairs if pair in bi_token_200_set
    ]

In [14]:
bi_token_dict_cut

{'2020-03-22': [('adesso', 'hover'),
  ('ableg', 'abpoli'),
  ('abpoli', 'canlab'),
  ('anothet', 'chutiya'),
  ('aigq', 'ucq'),
  ('addxfg', 'tee'),
  ('andover', 'townsman'),
  ('amzp', 'fjlef'),
  ('anadoluagency', 'amberinzaman'),
  ('amberinzaman', 'helal'),
  ('abu', 'ubaidah'),
  ('andymadaki', 'knetokor'),
  ('abzbaa', 'xnv'),
  ('alec', 'stapp'),
  ('ahjmgxlu', 'rosehavenhill'),
  ('alise', 'coen'),
  ('alcu', 'wyl'),
  ('adesh', 'nkmk'),
  ('akygz', 'baou'),
  ('akuffo', 'addo'),
  ('albomp', 'woolworths'),
  ('aap', 'kahe'),
  ('akhand', 'bharat'),
  ('alq', 'puvx')],
 '2020-03-23': [('antigenic', 'sin'),
  ('abramjee', 'maggsnaidu'),
  ('alsisiofficial', 'nayibbukele'),
  ('ameyaw', 'gyaigyimii'),
  ('amigo', 'cuarentena'),
  ('adhanom', 'ghebreyesus'),
  ('aehtcc', 'hrr'),
  ('aif', 'mnhrj'),
  ('alhlwan', 'spencerjames'),
  ('amanda', 'aphane'),
  ('annasantoz', 'trazlersgal'),
  ('andyage', 'mrcesarcrespo'),
  ('admiral', 'sayyari'),
  ('anti-hiv', 'protease'),
  ('aeerx

In [15]:
# Invert bi_token_dict_cut: map each retained bigram to the set of days in
# which it occurs.
bi_token_dict_cut_reversed = {}

for day, pairs in bi_token_dict_cut.items():
    for pair in pairs:
        bi_token_dict_cut_reversed.setdefault(pair, set()).add(day)

In [16]:
bi_token_dict_cut_reversed

{('adesso', 'hover'): {'2020-03-22'},
 ('ableg', 'abpoli'): {'2020-03-22'},
 ('abpoli', 'canlab'): {'2020-03-22'},
 ('anothet', 'chutiya'): {'2020-03-22'},
 ('aigq', 'ucq'): {'2020-03-22'},
 ('addxfg', 'tee'): {'2020-03-22'},
 ('andover', 'townsman'): {'2020-03-22'},
 ('amzp', 'fjlef'): {'2020-03-22'},
 ('anadoluagency', 'amberinzaman'): {'2020-03-22'},
 ('amberinzaman', 'helal'): {'2020-03-22'},
 ('abu', 'ubaidah'): {'2020-03-22'},
 ('andymadaki', 'knetokor'): {'2020-03-22'},
 ('abzbaa', 'xnv'): {'2020-03-22'},
 ('alec', 'stapp'): {'2020-03-22'},
 ('ahjmgxlu', 'rosehavenhill'): {'2020-03-22'},
 ('alise', 'coen'): {'2020-03-22'},
 ('alcu', 'wyl'): {'2020-03-22'},
 ('adesh', 'nkmk'): {'2020-03-22'},
 ('akygz', 'baou'): {'2020-03-22'},
 ('akuffo', 'addo'): {'2020-03-22'},
 ('albomp', 'woolworths'): {'2020-03-22'},
 ('aap', 'kahe'): {'2020-03-22'},
 ('akhand', 'bharat'): {'2020-03-22'},
 ('alq', 'puvx'): {'2020-03-22'},
 ('antigenic', 'sin'): {'2020-03-23'},
 ('abramjee', 'maggsnaidu'): {

In [17]:
# Top-PMI bigrams that occur in more than 5 distinct days.
# The original loop called `bi_token_list.set(k)`: `bi_token_list` is never
# defined and sets have no `.set` method, so the first bigram passing the
# threshold would have raised. The intended operation is adding to
# `bi_token_set`.
bi_token_set = {
    pair
    for pair, days in bi_token_dict_cut_reversed.items()
    if len(days) > 5
}

# Display the result as the cell output.
bi_token_set

set()

### Count document frequency

In [84]:
#bi_token_dict

{'2020-03-22': [('packed', 'uk'),
  ('uk', 'amid'),
  ('amid', 'rising'),
  ('rising', 'covid'),
  ('covid', 'cases'),
  ('cases', 'social'),
  ('social', 'media'),
  ('media', 'free'),
  ('free', 'malaysia'),
  ('malaysia', 'today'),
  ('today', 'https'),
  ('https', 'gu'),
  ('gu', 'world'),
  ('world', 'https'),
  ('https', 'https'),
  ('https', 'realdonaldtrump'),
  ('realdonaldtrump', 'asshole'),
  ('asshole', 'called'),
  ('called', 'covid'),
  ('covid', 'https'),
  ('https', 'watch'),
  ('watch', 'tv'),
  ('tv', 'wash'),
  ('wash', 'hands'),
  ('hands', 'exercise'),
  ('exercise', 'home'),
  ('home', 'read'),
  ('read', 'damn'),
  ('damn', 'book'),
  ('book', 'learn'),
  ('learn', 'ability'),
  ('ability', 'leave'),
  ('leave', 'house'),
  ('house', 'risk'),
  ('risk', 'amp'),
  ('amp', 'covid'),
  ('covid', 'stayathome'),
  ('stayathome', 'scary'),
  ('scary', 'stay'),
  ('stay', 'safe'),
  ('safe', 'isolation'),
  ('isolation', 'https'),
  ('https', 'covid'),
  ('covid', 'nige

In [18]:
# Document frequency for bigrams: map each bigram to the set of days in which
# it occurs.
doc_freq_dict_bi = {}

for day, pairs in bi_token_dict.items():
    for pair in pairs:
        doc_freq_dict_bi.setdefault(pair, set()).add(day)

In [19]:
doc_freq_dict_bi

{('packed', 'uk'): {'2020-03-22'},
 ('uk', 'concerts'): {'2020-03-22'},
 ('concerts', 'amid'): {'2020-03-22'},
 ('amid', 'rising'): {'2020-03-22'},
 ('rising', 'covid'): {'2020-03-22'},
 ('covid', 'cases'): {'2020-03-22',
  '2020-03-23',
  '2020-03-24',
  '2020-03-25',
  '2020-03-26',
  '2020-03-27',
  '2020-03-28',
  '2020-03-29'},
 ('cases', 'shock'): {'2020-03-22'},
 ('shock', 'social'): {'2020-03-22'},
 ('social', 'media'): {'2020-03-22',
  '2020-03-23',
  '2020-03-24',
  '2020-03-25',
  '2020-03-27',
  '2020-03-28',
  '2020-03-29'},
 ('media', 'free'): {'2020-03-22'},
 ('free', 'malaysia'): {'2020-03-22'},
 ('malaysia', 'today'): {'2020-03-22'},
 ('today', 'https'): {'2020-03-22',
  '2020-03-23',
  '2020-03-24',
  '2020-03-25',
  '2020-03-26',
  '2020-03-27',
  '2020-03-28'},
 ('https', 't'): {'2020-03-22',
  '2020-03-23',
  '2020-03-24',
  '2020-03-25',
  '2020-03-26',
  '2020-03-27',
  '2020-03-28',
  '2020-03-29'},
 ('t', 'co'): {'2020-03-22',
  '2020-03-23',
  '2020-03-24',
  

#### Remove rarest bigrams

In [20]:
# Drop, in place, every bigram that occurs in fewer than 5 distinct days.
for day_pairs in bi_token_dict.values():
    day_pairs[:] = [
        pair for pair in day_pairs if not len(doc_freq_dict_bi[pair]) < 5
    ]

In [21]:
bi_token_dict

{'2020-03-22': [('covid', 'cases'),
  ('social', 'media'),
  ('today', 'https'),
  ('https', 't'),
  ('t', 'co'),
  ('co', 'r'),
  ('in', 'the'),
  ('the', 'world'),
  ('to', 'see'),
  ('here', 'https'),
  ('https', 't'),
  ('t', 'co'),
  ('co', 'e'),
  ('https', 't'),
  ('t', 'co'),
  ('https', 't'),
  ('t', 'co'),
  ('co', 'd'),
  ('you', 're'),
  ('wash', 'your'),
  ('your', 'hands'),
  ('at', 'home'),
  ('ability', 'to'),
  ('the', 'house'),
  ('and', 'be'),
  ('be', 'a'),
  ('covid', 'stayathome'),
  ('this', 'is'),
  ('is', 'so'),
  ('stay', 'safe'),
  ('https', 't'),
  ('t', 'co'),
  ('covid', 'in'),
  ('in', 'nigeria'),
  ('we', 'must'),
  ('hand', 'washing'),
  ('social', 'distance'),
  ('of', 'a'),
  ('https', 't'),
  ('t', 'co'),
  ('join', 'the'),
  ('the', 'fight'),
  ('fight', 'against'),
  ('against', 'coronavirus'),
  ('coronavirus', 'i'),
  ('i', 'have'),
  ('the', 'country'),
  ('you', 'can'),
  ('can', 'also'),
  ('take', 'the'),
  ('at', 'https'),
  ('https', 't'),


### Count term frequency in document

In [None]:
## Count frequency

In [22]:
import collections
# Tally the bigrams of 2020-03-29 and keep the 100 most frequent
# (bigram, count) pairs.
bigram_counts = collections.Counter(bi_token_dict['2020-03-29'])
a = bigram_counts.most_common(100)

# Display the result as the cell output.
list(a)

[(('https', 't'), 985),
 (('t', 'co'), 985),
 (('the', 'coronavirus'), 63),
 (('in', 'the'), 61),
 (('covid', 'https'), 57),
 (('of', 'the'), 52),
 (('of', 'covid'), 38),
 (('to', 'the'), 37),
 (('to', 'be'), 33),
 (('for', 'the'), 32),
 (('coronavirus', 'https'), 30),
 (('this', 'is'), 30),
 (('is', 'a'), 30),
 (('coronavirus', 'covid'), 25),
 (('of', 'coronavirus'), 25),
 (('on', 'the'), 23),
 (('will', 'be'), 22),
 (('the', 'world'), 22),
 (('from', 'the'), 21),
 (('covid', 'coronavirus'), 21),
 (('due', 'to'), 21),
 (('it', 's'), 20),
 (('the', 'covid'), 20),
 (('in', 'a'), 18),
 (('it', 'is'), 18),
 (('and', 'the'), 18),
 (('for', 'covid'), 17),
 (('at', 'home'), 17),
 (('we', 'are'), 17),
 (('coronavirus', 'cases'), 17),
 (('don', 't'), 17),
 (('all', 'the'), 17),
 (('covid', 'covid'), 16),
 (('as', 'a'), 15),
 (('number', 'of'), 15),
 (('that', 'the'), 15),
 (('to', 'covid'), 15),
 (('has', 'been'), 15),
 (('have', 'been'), 15),
 (('should', 'be'), 14),
 (('if', 'you'), 14),
 ((

## Write to bigram 100

In [23]:
# Write one line per day to bi100.txt:
#   <day>:<list of ((word1, word2), count) for the 100 most frequent bigrams>
# The context manager flushes and closes the file even if a write fails.
with open('bi100.txt','w') as f:
    for k in bi_token_dict.keys():
        most_common = collections.Counter(bi_token_dict[k]).most_common(100)

        print(k+':'+str(most_common), file = f)

# 4. Unigram tokens

### Count document frequency

In [29]:
doc_freq_dict

{'packed': {'2020-03-22',
  '2020-03-23',
  '2020-03-24',
  '2020-03-26',
  '2020-03-28',
  '2020-03-29'},
 'uk': {'2020-03-22',
  '2020-03-23',
  '2020-03-24',
  '2020-03-25',
  '2020-03-26',
  '2020-03-27',
  '2020-03-28',
  '2020-03-29'},
 'concerts': {'2020-03-22', '2020-03-27'},
 'amid': {'2020-03-22',
  '2020-03-23',
  '2020-03-24',
  '2020-03-25',
  '2020-03-26',
  '2020-03-27',
  '2020-03-28',
  '2020-03-29'},
 'rising': {'2020-03-22',
  '2020-03-23',
  '2020-03-25',
  '2020-03-27',
  '2020-03-29'},
 'covid': {'2020-03-22',
  '2020-03-23',
  '2020-03-24',
  '2020-03-25',
  '2020-03-26',
  '2020-03-27',
  '2020-03-28',
  '2020-03-29'},
 'cases': {'2020-03-22',
  '2020-03-23',
  '2020-03-24',
  '2020-03-25',
  '2020-03-26',
  '2020-03-27',
  '2020-03-28',
  '2020-03-29'},
 'shock': {'2020-03-22', '2020-03-24', '2020-03-25', '2020-03-27'},
 'social': {'2020-03-22',
  '2020-03-23',
  '2020-03-24',
  '2020-03-25',
  '2020-03-26',
  '2020-03-27',
  '2020-03-28',
  '2020-03-29'},
 'me

# 5. Find rare tokens (appearing in fewer than 5 days)

In [30]:
# Tokens that occur in fewer than 5 distinct days.
removed_set_least = {
    tok for tok, days in doc_freq_dict.items() if len(days) < 5
}

# Strip those rare tokens from every day's token list (in place).
for day_tokens in token_dict.values():
    day_tokens[:] = [tok for tok in day_tokens if tok not in removed_set_least]

In [30]:
removed_set_least

{'toner',
 'nus',
 'disembarked',
 'tamim',
 'lelenapeacock',
 'suggested',
 'stealing',
 'yox',
 'porters',
 'congrats',
 'forcing',
 'cotton',
 'alise',
 'mokhow',
 'coronaitaly',
 'akngotzykw',
 'curfewinkenya',
 'viaw',
 'exi',
 'spiytrqldu',
 'dream',
 'sub-saharan',
 'davidfrawleyved',
 'hse',
 'piccadillycirus',
 'substation',
 'owhk',
 'cervia',
 'eyvvadgsa',
 'copd',
 'brady',
 'tq',
 'dispatch',
 'accommodate',
 'toadies',
 'amy',
 'azmbmizwiw',
 'facto',
 'amounts',
 'ruthjohn',
 'theoretically',
 'goat',
 'diplomat',
 'eqnkwqjeui',
 'dula',
 'viruscorona',
 'tea',
 'lockthemallup',
 'slightly',
 'hooters',
 'wtrkrgpbzn',
 'mlq',
 'yjqiqe',
 'ywsyh',
 'jczioexb',
 'yossigestetner',
 'assigned',
 'jimsciutto',
 'gdixon',
 'sentiments',
 'ygoh',
 'wupomkohrw',
 'ugqnd',
 'sweeper',
 'authgfwj',
 'nifty',
 'malign',
 'secpompeo',
 'punuan',
 'kl',
 'cuyvxqevr',
 'oeyscubdp',
 'trumpscuecards',
 'jiimpn',
 'healthtech',
 'playstation',
 'ldecmdapvd',
 'wnv',
 'exports',
 'firstl

In [31]:
# Remove the rare tokens from every day's token list (in place).
# NOTE(review): the cell that builds `removed_set_least` already performs this
# same removal, so this pass looks redundant - confirm before deleting.
for day_tokens in token_dict.values():
    day_tokens[:] = [tok for tok in day_tokens if tok not in removed_set_least]

# 6. Remove tokens with length less than 3

In [32]:
# Remove tokens shorter than 3 characters from every day's token list
# (in place).
for day_tokens in token_dict.values():
    day_tokens[:] = [tok for tok in day_tokens if not len(tok) < 3]

# 7. Stemming

In [33]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [34]:
# Replace every token by its Porter stem, keeping each day's list object.
for day_tokens in token_dict.values():
    day_tokens[:] = [stemmer.stem(tok) for tok in day_tokens]

# 8. Output

### Flatten token list to get vocab dict

In [36]:
set(['a']).union(set(['b']))

{'a', 'b'}

In [None]:
# add bigram to vocab

In [157]:
# bi_token_set.add(('adesso', 'hover'))
# bi_token_set

{('adesso', 'hover')}

In [35]:
# Alphabetically sorted list of the distinct tokens across all days.
vocab = sorted({tok for day_tokens in token_dict.values() for tok in day_tokens})
# Disabled alternative that would also add the selected bigrams:
#vocab = sorted(list(set(chain.from_iterable(token_dict.values())).union(bi_token_set)))

In [36]:
# Map each vocabulary entry to its position in the sorted vocabulary.
vocab_dict = {term: position for position, term in enumerate(vocab)}

In [37]:
vocab_dict

{'abc': 0,
 'abil': 1,
 'abroad': 2,
 'absolut': 3,
 'abt': 4,
 'abus': 5,
 'accept': 6,
 'access': 7,
 'account': 8,
 'accur': 9,
 'act': 10,
 'action': 11,
 'activ': 12,
 'activist': 13,
 'actual': 14,
 'ad': 15,
 'adapt': 16,
 'add': 17,
 'addit': 18,
 'address': 19,
 'administr': 20,
 'admit': 21,
 'adopt': 22,
 'advantag': 23,
 'advertis': 24,
 'advic': 25,
 'affect': 26,
 'afraid': 27,
 'africa': 28,
 'african': 29,
 'afternoon': 30,
 'age': 31,
 'agenc': 32,
 'agenda': 33,
 'ago': 34,
 'agre': 35,
 'ahead': 36,
 'aid': 37,
 'air': 38,
 'airlin': 39,
 'airport': 40,
 'alcohol': 41,
 'alert': 42,
 'aliv': 43,
 'allah': 44,
 'allegedli': 45,
 'allow': 46,
 'almighti': 47,
 'amaz': 48,
 'amazon': 49,
 'ambul': 50,
 'america': 51,
 'american': 52,
 'amici': 53,
 'amid': 54,
 'amidst': 55,
 'amitshah': 56,
 'amount': 57,
 'amp': 58,
 'analysi': 59,
 'analyt': 60,
 'android': 61,
 'ang': 62,
 'angel': 63,
 'ani': 64,
 'anim': 65,
 'announc': 66,
 'answer': 67,
 'anxieti': 68,
 'anymor'

### 8.1 write to vocab.txt

In [38]:
# Write the vocabulary to vocab.txt, one "<term>:<index>" entry per line.
# A bigram entry (a tuple) is written as "<word1>_<word2>".
# The context manager closes the file even if a write fails, and enumerate
# replaces the hand-maintained counter that always equalled the loop index.
with open('vocab.txt','w') as f:
    for idx, term in enumerate(vocab):
        if isinstance(term, tuple):
            term = term[0] + '_' + term[1]
        print(term+':'+str(idx), file = f)
    

### 8.2 write to uni100

In [39]:
token_dict

{'2020-03-22': ['pack',
  'amid',
  'rise',
  'covid',
  'case',
  'social',
  'media',
  'free',
  'malaysia',
  'today',
  'http',
  'world',
  'http',
  'http',
  'realdonaldtrump',
  'asshol',
  'call',
  'covid',
  'http',
  'watch',
  'wash',
  'hand',
  'exercis',
  'home',
  'read',
  'damn',
  'book',
  'learn',
  'abil',
  'leav',
  'hous',
  'risk',
  'amp',
  'covid',
  'stayathom',
  'scari',
  'stay',
  'safe',
  'isol',
  'http',
  'covid',
  'nigeria',
  'observ',
  'regular',
  'hand',
  'wash',
  'avoid',
  'crowd',
  'gather',
  'social',
  'distanc',
  'self-isol',
  'abroad',
  'ncdcgov',
  'guidelin',
  'oil',
  'nation',
  'lowest',
  'ga',
  'price',
  'histori',
  'economi',
  'coronaviru',
  'http',
  'join',
  'fight',
  'coronaviru',
  'curfew',
  'keep',
  'countri',
  'safe',
  'http',
  'http',
  'coronaviru',
  'http',
  'happen',
  'nsw',
  'health',
  'announc',
  'today',
  'test',
  'posit',
  'covid',
  'pack',
  'beach',
  'coupl',
  'day',
  'youn

In [39]:
collections.Counter(token_dict['2020-03-22']).most_common(100)

[('http', 935),
 ('coronaviru', 375),
 ('covid', 373),
 ('peopl', 108),
 ('amp', 77),
 ('time', 68),
 ('coronaviruspandem', 65),
 ('test', 60),
 ('friend', 58),
 ('case', 55),
 ('itali', 50),
 ('support', 46),
 ('trump', 43),
 ('home', 42),
 ('spread', 41),
 ('news', 41),
 ('pandem', 40),
 ('india', 40),
 ('viru', 39),
 ('fight', 38),
 ('countri', 38),
 ('make', 38),
 ('famili', 37),
 ('share', 36),
 ('china', 35),
 ('day', 34),
 ('stay', 32),
 ('stop', 32),
 ('govern', 30),
 ('italian', 30),
 ('call', 29),
 ('work', 29),
 ('world', 28),
 ('lockdown', 28),
 ('social', 27),
 ('stand', 27),
 ('realdonaldtrump', 26),
 ('good', 26),
 ('read', 24),
 ('infect', 24),
 ('corona', 24),
 ('respons', 24),
 ('death', 24),
 ('thing', 24),
 ('crisi', 23),
 ('state', 23),
 ('outbreak', 23),
 ('don', 23),
 ('colleagu', 23),
 ('cari', 23),
 ('amici', 23),
 ('siamo', 23),
 ('con', 23),
 ('voi', 23),
 ('westandwithitali', 23),
 ('health', 22),
 ('care', 22),
 ('live', 22),
 ('die', 22),
 ('medic', 22),
 

In [40]:
# Write one line per day to uni100.txt:
#   <day>:<list of (token, count) for the 100 most frequent tokens>
# The context manager flushes and closes the file even if a write fails.
with open('uni100.txt','w') as f:
    for k in token_dict.keys():
        most_common = collections.Counter(token_dict[k]).most_common(100)

        print(k+':'+str(most_common), file = f)

### 8.3 Count vectorizer

In [42]:
token_dict['2020-03-22']

['pack',
 'amid',
 'rise',
 'covid',
 'case',
 'social',
 'media',
 'free',
 'malaysia',
 'today',
 'http',
 'world',
 'http',
 'http',
 'realdonaldtrump',
 'asshol',
 'call',
 'covid',
 'http',
 'watch',
 'wash',
 'hand',
 'exercis',
 'home',
 'read',
 'damn',
 'book',
 'learn',
 'abil',
 'leav',
 'hous',
 'risk',
 'amp',
 'covid',
 'stayathom',
 'scari',
 'stay',
 'safe',
 'isol',
 'http',
 'covid',
 'nigeria',
 'observ',
 'regular',
 'hand',
 'wash',
 'avoid',
 'crowd',
 'gather',
 'social',
 'distanc',
 'self-isol',
 'abroad',
 'ncdcgov',
 'guidelin',
 'oil',
 'nation',
 'lowest',
 'ga',
 'price',
 'histori',
 'economi',
 'coronaviru',
 'http',
 'join',
 'fight',
 'coronaviru',
 'curfew',
 'keep',
 'countri',
 'safe',
 'http',
 'http',
 'coronaviru',
 'http',
 'happen',
 'nsw',
 'health',
 'announc',
 'today',
 'test',
 'posit',
 'covid',
 'pack',
 'beach',
 'coupl',
 'day',
 'young',
 'peopl',
 'messag',
 'day',
 'make',
 'crisi',
 'wors',
 'http',
 'world',
 'favor',
 'drop',
 'd

### Transform word to number

In [41]:
# Encode every day's tokens as their vocabulary indices, preserving order.
token_num_dict = {
    day: [vocab_dict[tok] for tok in tokens]
    for day, tokens in token_dict.items()
}

In [44]:
token_num_dict

{'2020-03-22': [1031,
  54,
  1237,
  326,
  203,
  1345,
  909,
  586,
  888,
  1481,
  706,
  1613,
  706,
  706,
  1174,
  89,
  187,
  326,
  706,
  1581,
  1575,
  659,
  494,
  695,
  1171,
  357,
  158,
  834,
  1,
  835,
  704,
  1238,
  58,
  326,
  1385,
  1268,
  1383,
  1252,
  772,
  706,
  326,
  987,
  1007,
  1192,
  659,
  1575,
  105,
  341,
  610,
  1345,
  416,
  1290,
  2,
  974,
  653,
  1012,
  969,
  872,
  605,
  1119,
  687,
  453,
  304,
  706,
  791,
  548,
  304,
  348,
  800,
  320,
  1252,
  706,
  706,
  304,
  706,
  662,
  999,
  669,
  66,
  1481,
  1460,
  1094,
  326,
  1031,
  124,
  322,
  363,
  1638,
  1057,
  919,
  363,
  886,
  337,
  1617,
  706,
  1613,
  532,
  437,
  365,
  706,
  747,
  326,
  761,
  1513,
  1468,
  1213,
  363,
  51,
  493,
  142,
  599,
  83,
  58,
  1229,
  706,
  595,
  1311,
  706,
  859,
  322,
  1590,
  109,
  706,
  349,
  1169,
  326,
  737,
  717,
  127,
  1376,
  1247,
  823,
  77,
  1512,
  907,
  562,
  212,

### <font color = 'red'>Bigrams, if any</font>

In [None]:
{'abc_def' : 8, 'a':0, 'b':3}


{'abc_def', 'ghi_jkl', 'iopr_dsfs'}
    
    
    
    
    ghi_jkl

In [1]:
# Encode every day's bigrams as vocabulary indices.
# Bigrams with no vocabulary entry are skipped through an explicit membership
# test. The original bare `except: pass` skipped them too, but it also
# silenced every other error raised inside the loop.
# NOTE(review): `vocab` is currently built from unigrams only (the bigram
# union is commented out), so these lists stay empty unless bigrams are added
# to the vocabulary - confirm whether that is intended.
token_num_dict_bi = {}

for k in bi_token_dict.keys():
    token_num_dict_bi[k] = [
        vocab_dict[pair] for pair in bi_token_dict[k] if pair in vocab_dict
    ]

NameError: name 'bi_token_dict' is not defined

In [None]:
762:900

In [47]:
token_num_dict_bi

{'2020-03-22': [],
 '2020-03-23': [],
 '2020-03-24': [],
 '2020-03-25': [],
 '2020-03-26': [],
 '2020-03-27': [],
 '2020-03-28': [],
 '2020-03-29': []}

### <font color = 'red'>Combine the two number dicts</font>

In [None]:
# Append each day's bigram indices to that day's unigram indices (in place).
# Re-running this cell appends the bigram indices again.
for day, unigram_ids in token_num_dict.items():
    unigram_ids.extend(token_num_dict_bi[day])

### count number frequency

In [42]:
from nltk.probability import *
FreqDist(token_num_dict['2020-03-22'])

FreqDist({706: 935, 304: 375, 326: 373, 1057: 108, 58: 77, 1478: 68, 309: 65, 1460: 60, 592: 58, 203: 55, ...})

In [43]:
# import library
from nltk.probability import *

# Write the count vectors to countVec.txt, one line per day:
#   <day>,<token index>:<count>,<token index>:<count>,...
# The context manager flushes and closes the file even if a write fails.
with open('countVec.txt', 'w') as out:
    for k in token_num_dict.keys():
        out.write(k)

        # Number of occurrences of each token index within this day.
        q = FreqDist(token_num_dict[k])

        for key, value in q.items():
            out.write(','+str(key)+':'+str(value))
        out.write('\n')