# Building a frequency dictionary

This code comes from Erick Velazquez.

It builds `Counter` dictionaries holding the raw frequencies of the tokens, POS tags, and lemmas found in the annotated text documents. Before executing this code, first run the script `TreetaggerAnnotation_Wiki_MedCon.py`.



In [2]:
import mysql.connector
import requests
import json
import time
import argparse
import logging
import sys
import pandas as pd
from collections import Counter
import pickle
from tqdm import tqdm

In [7]:
# Tokens that are dropped entirely when parsing an annotated string.
punctuationMars = [
    ',', '.', ';', ':', '(', '-', ')', '{', '}',
    '[', ']', '\\', '/', '!', '?', '<', '>', '"',
]


def getTokensStrings(stringAnnotated):
    """Split an annotated string into parallel token / POS / lemma lists.

    The input is converted with ``str()`` and read one line at a time; each
    line is lower-cased and split on tab characters.  A line contributes to
    the result only when it has exactly three tab-separated fields and its
    first field is not listed in ``punctuationMars``.  Every other line
    (blank, malformed, punctuation) is silently ignored.

    Because lower-casing happens before splitting, all three returned lists
    contain lower-cased strings, POS tags included.

    Returns:
        A ``(tokens, POS, lemmas)`` tuple of three lists of equal length.
    """
    tokens, POS, lemmas = [], [], []

    for raw_line in str(stringAnnotated).split('\n'):
        fields = raw_line.lower().split('\t')

        # Guard clause: keep only well-formed, non-punctuation entries.
        if len(fields) != 3 or fields[0] in punctuationMars:
            continue

        token, tag, lemma = fields
        tokens.append(token)
        POS.append(tag)
        lemmas.append(lemma)

    return tokens, POS, lemmas

In [8]:
# Load the TreeTagger-annotated corpus; the file uses ';' as its field separator.
annotated_data = pd.read_csv('Wiki_MedCon_TreetaggerAnnotation.csv', sep=";")
# Bare trailing expression so the notebook renders the DataFrame.
annotated_data

Unnamed: 0,title_annotated,text_annotated
0,Autism\tNN1\tautism,Autism\tNN1\tautism\nis\tVBZ\tbe\na\tAT0\ta\nd...
1,Motor\tNN1\tmotor\nneuron\tNN1\tneuron\ndiseas...,Motor\tNN1\tmotor\nneuron\tNN1\tneuron\ndiseas...
2,Acute\tAJ0\tacute\ndisseminated\tAJ0\tdissemin...,Acute\tAJ0\tacute\ndisseminated\tAJ0\tdissemin...
3,Ataxia\tNN1\tataxia,Ataxia\tNN1\tataxia\nis\tVBZ\tbe\na\tAT0\ta\nn...
4,Abscess\tNN1\tabscess,An\tAT0\tan\nabscess\tNN1\tabscess\nis\tVBZ\tb...
...,...,...
8672,Galactose\tVVB\tgalactose\nepimerase\tNN1\tepi...,Galactose\tVVB\tgalactose\nepimerase\tNN1\tepi...
8673,Little\tAJ0\tlittle\nLeague\tNN1\tleague\nelbo...,Little\tAJ0\tlittle\nLeague\tNN1\tleague\nelbo...
8674,Penetrating\tAJ0\tpenetrating|penetrate\ntraum...,Penetrating\tAJ0\tpenetrating|penetrate\ntraum...
8675,Camptodactyly\tNP0\tCamptodactyly,Camptodactyly\tNP0\tCamptodactyly\nis\tVBZ\tbe...


In [9]:
# Show the raw annotated text of one article (row 6482) to inspect the
# tab-separated, one-entry-per-line layout of the annotation.
annotated_data.text_annotated[6482] #6461 #6482

'Umbilical-urachal\tAJ0\tUmbilical-urachal\nsinus\tNN1\tsinus\nis\tVBZ\tbe\na\tAT0\ta\ncongenital\tAJ0\tcongenital\ndisorder\tNN1\tdisorder\nof\tPRF\tof\nthe\tAT0\tthe\nurinary\tAJ0\turinary\nbladder\tNN1\tbladder\ncaused\tVVN\tcause\nby\tPRP\tby\nfailure\tNN1\tfailure\nof\tPRF\tof\nobliteration\tNN1\tobliteration\nof\tPRF\tof\nproximal\tAJ0\tproximal\nor\tCJC\tor\ndistal\tAJ0\tdistal\npart\tNN1\tpart\nof\tPRF\tof\nthe\tAT0\tthe\nallantois\tNN1\tallantois\n,\tPUN\t,\nand\tCJC\tand\nthe\tAT0\tthe\npresentation\tNN1\tpresentation\nof\tPRF\tof\nthis\tDT0\tthis\nanomaly\tNN1\tanomaly\nis\tVBZ\tbe\nmore\tAV0\tmore\ncommon\tAJ0\tcommon\nin\tPRP\tin\nchildren\tNN2\tchild\nand\tCJC\tand\nrarer\tAJC\trare\nin\tPRP\tin\nadults\tNN2\tadult\n.\tSENT\t.\nIt\tPNP\tit\nis\tVBZ\tbe\nthought\tVVN\tthink\nhave\tVHB\thave\nbeen\tVBN\tbe\nfirst\tORD\tfirst\ndescribed\tVVN\tdescribe|described\nby\tPRP\tby\nCabriolus\tNP0\tCabriolus\nin\tPRP\tin\n1550.\tCRD\t@ord@\nInfection\tNN1\tinfection\n,\tPUN\t,\nwith

In [10]:
# Running frequency tables, filled from both the titles and the article texts.
# NOTE: the misspelled name `couter_tokens` is kept on purpose because later
# cells refer to it.
couter_tokens = Counter()
counter_lemmas = Counter()
counter_pos = Counter()

title = annotated_data['title_annotated'].to_list()
text = annotated_data['text_annotated'].to_list()

# Titles and texts are counted into the same three tables; each column gets
# its own tqdm progress bar.
for documents in (title, text):
    for annotated in tqdm(documents):
        to, po, le = getTokensStrings(annotated)
        couter_tokens.update(to)
        counter_pos.update(po)
        counter_lemmas.update(le)

# Persist the counters. Opening each file in a `with` block guarantees the
# handle is flushed and closed even if pickling fails part-way through.
for pickle_path, counter in (
    ("counter_Tokens.p", couter_tokens),
    ("counter_POS.p", counter_pos),
    ("counter_Lemmas.p", counter_lemmas),
):
    with open(pickle_path, "wb") as pickle_file:
        pickle.dump(counter, pickle_file)

100%|██████████| 8677/8677 [00:00<00:00, 94808.82it/s]
100%|██████████| 8677/8677 [00:11<00:00, 761.50it/s] 


In [15]:
# Number of distinct tokens collected in the counter.
print(len(couter_tokens))
# Every (token, count) pair, ordered from most to least frequent.
couter_tokens.most_common()

166557


[('the', 471282),
 ('of', 339325),
 ('and', 246131),
 ('in', 213046),
 ('to', 193973),
 ('a', 177881),
 ('is', 165293),
 ('with', 96718),
 ('or', 87206),
 ('as', 77064),
 ('are', 74482),
 ('be', 70263),
 ('that', 66381),
 ('for', 66162),
 ('by', 54350),
 ('may', 51461),
 ('it', 47210),
 ('can', 44736),
 ('an', 40650),
 ('this', 39477),
 ('have', 38132),
 ('on', 37736),
 ('from', 36484),
 ('not', 35478),
 ('which', 30190),
 ('disease', 29096),
 ('also', 28927),
 ('%', 27524),
 ('has', 25094),
 ('at', 24235),
 ('other', 24070),
 ('symptoms', 23123),
 ("'s", 23007),
 ('been', 22910),
 ('syndrome', 22786),
 ('such', 22323),
 ('more', 21836),
 ('was', 21145),
 ('but', 20914),
 ('treatment', 20401),
 ('most', 19392),
 ('there', 19351),
 ('patients', 18656),
 ('blood', 18005),
 ('these', 17900),
 ('–', 17690),
 ('than', 17261),
 ('people', 17195),
 ('cases', 17138),
 ('some', 16997),
 ('cause', 16878),
 ('if', 16429),
 ('one', 16216),
 ('when', 15409),
 ('include', 15396),
 ('they', 14987),
 

In [16]:
# Number of distinct lemmas collected in the counter.
print(len(counter_lemmas))
# Every (lemma, count) pair, ordered from most to least frequent.
counter_lemmas.most_common()

151476


[('the', 471282),
 ('be', 374139),
 ('of', 339325),
 ('and', 246131),
 ('in', 213056),
 ('to', 194037),
 ('a', 177881),
 ('with', 96718),
 ('or', 87221),
 ('as', 77064),
 ('have', 72802),
 ('that', 66384),
 ('for', 66162),
 ('it', 54784),
 ('by', 54352),
 ('may', 51461),
 ('can', 45101),
 ('an', 40661),
 ('this', 39477),
 ('not', 38659),
 ('cause', 38126),
 ('on', 37738),
 ('from', 36484),
 ('disease', 34446),
 ('they', 31924),
 ('which', 30190),
 ('also', 28927),
 ('%', 27524),
 ('other', 26883),
 ('patient', 25550),
 ('symptom', 25056),
 ('at', 24235),
 ('syndrome', 23999),
 ('treatment', 23573),
 ('such', 22323),
 ('disorder', 22076),
 ('case', 21898),
 ('more', 21840),
 ('but', 20914),
 ("'s", 20439),
 ('cell', 20171),
 ('most', 19392),
 ('there', 19351),
 ('blood', 18011),
 ('these', 17900),
 ('include', 17891),
 ('–', 17690),
 ('occur', 17636),
 ('do', 17566),
 ('than', 17261),
 ('people', 17258),
 ('some', 16997),
 ('condition', 16704),
 ('one', 16665),
 ('if', 16430),
 ('when',

In [7]:
# Reload the token counter from the pickle file written earlier in this notebook.
# NOTE: unpickling can execute arbitrary code; only load pickle files you created yourself.
with open('counter_Tokens.p', 'rb') as f:
        count = pickle.load(f)

In [8]:
# Sanity check on the reloaded counter: the number of distinct tokens is
# expected to match the value printed for `couter_tokens` before pickling.
print(len(count))
# Every (token, count) pair, ordered from most to least frequent.
count.most_common()

166557


[('the', 471282),
 ('of', 339325),
 ('and', 246131),
 ('in', 213046),
 ('to', 193973),
 ('a', 177881),
 ('is', 165293),
 ('with', 96718),
 ('or', 87206),
 ('as', 77064),
 ('are', 74482),
 ('be', 70263),
 ('that', 66381),
 ('for', 66162),
 ('by', 54350),
 ('may', 51461),
 ('it', 47210),
 ('can', 44736),
 ('an', 40650),
 ('this', 39477),
 ('have', 38132),
 ('on', 37736),
 ('from', 36484),
 ('not', 35478),
 ('which', 30190),
 ('disease', 29096),
 ('also', 28927),
 ('%', 27524),
 ('has', 25094),
 ('at', 24235),
 ('other', 24070),
 ('symptoms', 23123),
 ("'s", 23007),
 ('been', 22910),
 ('syndrome', 22786),
 ('such', 22323),
 ('more', 21836),
 ('was', 21145),
 ('but', 20914),
 ('treatment', 20401),
 ('most', 19392),
 ('there', 19351),
 ('patients', 18656),
 ('blood', 18005),
 ('these', 17900),
 ('–', 17690),
 ('than', 17261),
 ('people', 17195),
 ('cases', 17138),
 ('some', 16997),
 ('cause', 16878),
 ('if', 16429),
 ('one', 16216),
 ('when', 15409),
 ('include', 15396),
 ('they', 14987),
 

In [26]:
# View of every distinct token stored in the reloaded counter.
count.keys()

