# Statistics
This notebook computes some basic statistics of the dataset.

In [None]:
!pip install pyforest==0.1.1
#https://pypi.org/project/pyforest/0.1.1/
import pyforest  #With this library, you won't need to import more packages!!!


Collecting pyforest==0.1.1
[?25l  Downloading https://files.pythonhosted.org/packages/ca/7a/2280448ba4202604eb3f9e23d9a4fd0ca1473d31aca0a90fdb5f31dd902c/pyforest-0.1.1.tar.gz (3.4MB)
[K     |████████████████████████████████| 3.4MB 6.5MB/s 
[?25hBuilding wheels for collected packages: pyforest
  Building wheel for pyforest (setup.py) ... [?25l[?25hdone
  Created wheel for pyforest: filename=pyforest-0.1.1-py2.py3-none-any.whl size=9213 sha256=e93e1b16570bceb4ee5e560d59e406fd016e03cef9e32bcd258082541327cd2d
  Stored in directory: /root/.cache/pip/wheels/77/f9/78/51500678d6ce472b574216a40cba6c81d1766ee7cc838cce3c
Successfully built pyforest
Installing collected packages: pyforest
Successfully installed pyforest-0.1.1


In [2]:
# Google Colab alternative (disabled): mount Drive and read the corpus from it.
#from google.colab import drive
#drive.mount("/content/drive/")
#root='drive/My Drive/Colab Notebooks/nlp4rareNER/'
#path=root+'data/goldstandard/'

# Root folder of the corpus, relative to the working directory.
path = '../corpus/brat/'

# One sub-folder per split. The trailing slash is kept on purpose: the rest of
# the notebook appends glob patterns directly to these strings.
train_path = f'{path}train/'
dev_path = f'{path}dev/'
test_path = f'{path}test/'

# Show the folders that will be used.
for label, split_dir in (("training:", train_path), ("dev:", dev_path), ("test:", test_path)):
    print(label, split_dir)

Mounted at /content/drive/
training: drive/My Drive/Colab Notebooks/nlp4rareNER/data/goldstandard/train/
dev: drive/My Drive/Colab Notebooks/nlp4rareNER/data/goldstandard/dev/
test: drive/My Drive/Colab Notebooks/nlp4rareNER/data/goldstandard/test/


## Number of documents, sentences and tokens

In [3]:
import glob
# Number of .txt documents in each split folder. The folder strings already
# end with a slash, so the glob pattern is appended directly.
size_train, size_dev, size_test = [
    len(glob.glob(split_dir + '*.txt'))
    for split_dir in (train_path, dev_path, test_path)
]
total = sum((size_train, size_dev, size_test))

print('number of documents in training:', size_train)
print('number of documents in development:', size_dev)
print('number of documents in test:', size_test)
print('total of documents:', total)


number of documents in training: 729
number of documents in development: 104
number of documents in test: 209
total of documents: 1042


We will use spaCy, an NLP library, to split the texts into sentences and tokens. The next cell downloads its English model `en_core_web_sm` and loads it.

In [4]:
!python -m spacy download en_core_web_sm #install spacy and download the model en_core_web_sm
import spacy #NLP library for sentence segmentation and tokenization
nlp = spacy.load("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [None]:
def getText(path_file):
    """Return the full content of the text file ``path_file`` as a string.

    Args:
        path_file: path of the file to read.

    Returns:
        The text of the file.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    # The context manager closes the file even if read() fails; the encoding
    # is given explicitly so the result does not depend on the platform default.
    with open(path_file, 'r', encoding='utf-8') as f:
        return f.read()

def countTokens(path):
    """Count the sentences and tokens of all .txt files in a directory.

    Every ``*.txt`` file directly inside ``path`` is read with ``getText`` and
    parsed with the spaCy pipeline ``nlp``; the sentences and tokens found by
    spaCy are summed over all files.

    Args:
        path: directory that stores the dataset split, with or without a
            trailing path separator.

    Returns:
        A tuple ``(total_sentences, total_tokens)``; ``(0, 0)`` when the
        directory contains no .txt file.
    """
    import os  # local import: only needed to build the glob pattern

    total_sentences = 0
    total_tokens = 0

    # os.path.join yields the same pattern as path+'*.txt' when path ends with
    # a separator, and a correct one when it does not.
    for file_text in glob.glob(os.path.join(path, '*.txt')):
        doc = nlp(getText(file_text))
        total_sentences += sum(1 for _ in doc.sents)
        total_tokens += len(doc)

    return total_sentences, total_tokens

# Sentence and token totals of each split (spaCy parses every document, so
# this cell can take a while).
sentences_train, tokens_train = countTokens(train_path)
sentences_dev, tokens_dev = countTokens(dev_path)
sentences_test, tokens_test = countTokens(test_path)

print('Sentences and tokens in training: ', sentences_train, tokens_train)
print('Sentences and tokens in dev: ', sentences_dev, tokens_dev)
# The label used to read "text"; these are the figures of the test split.
print('Sentences and tokens in test: ', sentences_test, tokens_test)


## Number of entities

We now obtain the number of entities:

In [None]:


def getAnnotations(path_file, allTypes=False):
    """Count the entity annotations stored in the annotation file ``path_file``.

    Only lines that start with ``'T'`` are counted. The entity type of such a
    line is the text between its first tab and its first space.

    Args:
        path_file: path of the .ann file.
        allTypes: when False (default), DISEASE entities are ignored: they are
            counted neither in the overall total nor in the disease total.

    Returns:
        A tuple ``(num_entities, total_dis, total_raredis, total_skin,
        total_sym, total_sign)``. ``num_entities`` also includes entities whose
        type is not one of the five listed ones. All values are 0 when the
        file cannot be read.

    Raises:
        ValueError: if a line starting with 'T' contains no tab or no space.
    """
    import warnings  # local import: only needed to report unreadable files

    try:
        # The context manager closes the file even when reading fails.
        with open(path_file, 'r', encoding='utf-8') as f:
            annotations = f.readlines()
    except (OSError, UnicodeError) as err:
        # Best effort, as before: an unreadable file contributes no entities,
        # but the problem is now reported instead of being silently ignored.
        warnings.warn(f'could not read annotations from {path_file!r}: {err}')
        annotations = []

    num_entities = 0
    per_type = dict.fromkeys(
        ('DISEASE', 'RAREDISEASE', 'SKINRAREDISEASE', 'SYMPTOM', 'SIGN'), 0)

    for ann in annotations:
        if not ann.startswith('T'):
            continue
        typeEnt = ann[ann.index('\t') + 1:ann.index(' ')]

        if typeEnt == 'DISEASE' and not allTypes:
            # Diseases are excluded from every count unless explicitly requested.
            continue

        num_entities += 1
        if typeEnt in per_type:
            per_type[typeEnt] += 1

    return (num_entities, per_type['DISEASE'], per_type['RAREDISEASE'],
            per_type['SKINRAREDISEASE'], per_type['SYMPTOM'], per_type['SIGN'])


def countEntities(path, allTypes=False):
    """Sum the entity counts of all .ann files in a directory.

    Every ``*.ann`` file directly inside ``path`` is passed to
    ``getAnnotations`` and the per-file counts are added up.

    Args:
        path: directory that stores the dataset split, with or without a
            trailing path separator.
        allTypes: forwarded to ``getAnnotations``. With the default False,
            DISEASE entities are ignored and the disease total is always 0.

    Returns:
        A tuple ``(total_entities, total_dis, total_rare, total_skin,
        total_sym, total_sign)``; all zeros when the directory contains no
        .ann file.
    """
    import os  # local import: only needed to build the glob pattern

    totals = [0] * 6

    # os.path.join yields the same pattern as path+'*.ann' when path ends with
    # a separator, and a correct one when it does not.
    for file_path in glob.glob(os.path.join(path, '*.ann')):
        per_file = getAnnotations(file_path, allTypes)
        totals = [running + found for running, found in zip(totals, per_file)]

    return tuple(totals)



# Header row, followed by one row of counts per split (training, dev, test).
print('Entities,Diseases,Rare Diseases, Skin Rare Diseases, Symptoms, Signs')
for split_dir in (train_path, dev_path, test_path):
    (total_entities, total_dis, total_rare,
     total_skin, total_sym, total_sign) = countEntities(split_dir)
    print(total_entities, total_dis, total_rare, total_skin, total_sym, total_sign)


Entities,Diseases,Rare Diseases, Skin Rare Diseases, Symptoms, Signs
8774 0 3157 451 318 3742
1228 0 480 45 24 528
2478 0 942 146 53 1061
