In [1]:
def read_file (filename):
    '''
    read the file and organize it's data into a list
    '''
    words = []
    with open(filename, "r") as fp:
        for line in fp:
            if(len(line) > 1): #not an empty line
                word,tag = line.split() #stores the word separately from tag
                words.append(word)

    return words

In [2]:
wordlist = read_file("gene.train")

In [3]:
len(wordlist)

386200

In [4]:
def get_count (words):
    '''
    get a count of each word in the list of words
    '''
    counts = {}

    for word in words:
        if word in counts:
                counts[word] += 1

    return counts

In [7]:
freqlist = get_count(wordlist)

In [9]:
def replace_words (filename, outfile, freq):
    '''
    replace the word in the doc with _RARE_ if count is < 5
    '''
    output_ = []
    with open(filename, "r") as fp:
        for line in fp:
            if(len(line)>1):
                word,tag = line.split() #read the words separately
                print(word)

In [107]:
def simple_gene_tagger(counts_file, dev_file):

    emissions = {}
    counts = {}

    with open(counts_file, "r") as fp:
        for line in fp:
            segment = line.split()

            if(segment[1]=="WORDTAG"): #doing WORDTAG section
                if(segment[3] == "GENE"):
                    emissions[segment[2]] = (segment[0],"GENE") #store word with emission count
                elif(segment[3] == "NOGENE"): #_RARE_ is NOGENE in our case so we override
                    emissions[segment[2]] = (segment[0],"NOGENE") #store word with emission count

            elif(segment[1]=="1-GRAM"): #doing 1-GRAM section
                if(segment[3] == "GENE"):
                    counts[segment[2]] = (segment[0],"GENE")
                    if segment[2] == "_RARE_": #exception
                        counts[segment[2]+" GENE"] = (segment[0], "GENE")
                elif(segment[3] == "NOGENE"):
                    counts[segment[2]] = (segment[0], "NOGENE")
                    if segment[2] == "_RARE_": #exception
                        counts[segment[2]+" NOGENE"] = (segment[0], "NOGENE")

            else: #we're not doing anything beyond 1-GRAM
                #patch up _RARE_ counts
                counts["_RARE_"] = int(counts["_RARE_ NOGENE"][0]) + int(counts["_RARE_ GENE"][0])
                break

    #read dev file and tag
    tags = []
    reversi = {"NOGENE":"GENE", "GENE":"NOGENE"}
    with open(dev_file, "r") as devfp:
        
        for line in devfp:
            line = line.split("\n")
            line = line[0]
            if(line in emissions):
                prob = int(emissions[line][0])/int(counts[line][0])
                if(prob > 1-prob):#this means prob outweighs
                    tags.append(line + " " + emissions[line][1]+"\n") #proper tag
                else: #means other outweighs
                    tags.append(line + " " + reversi[emissions[line][1]]) #reverses the tag
            else: #_RARE_ word (tm)
                prob = int(emissions["_RARE_"][0])/int(counts["_RARE_"])
                tags.append(line + " " + "NOGENE"+"\n") #_RARE_ defaults NOGENE due to our numbers
                
    with open("gene.dev.p1.out", "w") as devp1out:
        devp1out.writelines(tags)

In [108]:
simple_gene_tagger("gene_rare.counts", "gene.dev")

In [85]:
line = "test\n"
line.split("\n")
print(line)

test



In [63]:
#Ning
#14 October 2019
#Linguistics TP3

def simple_gene_tagger(counts_file, dev_file):

    emissions_gene = {}
    emissions_nogene = {}
    counts = {}

    with open(counts_file, "r") as fp:
        for line in fp:
            segment = line.split()

            if(segment[1]=="WORDTAG"): #doing WORDTAG section
                if(segment[3] == "GENE"):
                    emissions_gene[segment[2]] = float(segment[0])
                elif(segment[3] == "NOGENE"):
                    emissions_nogene[segment[2]] = float(segment[0])

            elif(segment[1]=="1-GRAM"): #doing 1-GRAM section
                if segment[2] in counts:
                    counts[segment[2]] += float(segment[0])
                else:
                    counts[segment[2]] = float(segment[0])

            else: #we're not doing anything beyond 1-GRAM
                break

    #read dev file and tag
    tags = []
    with open(dev_file, "r") as devfp:

        for line in devfp:
            line = line.split("\n")
            line = line[0]
            if(len(line)>0):

                if(line in emissions_gene and line in emissions_nogene):
                    prob_gene = emissions_gene[line]/counts[line]
                    prob_nogene = emissions_nogene[line]/counts[line]
                    if(prob_gene > prob_nogene):
                        tags.append(line + " " + "GENE"+"\n") #proper tag
                    else:
                        tags.append(line + " " + "NOGENE"+"\n") #reverses the tag

                elif(line in emissions_gene):
                    tags.append(line + " " + "GENE"+"\n") #proper tag

                elif(line in emissions_nogene):
                    tags.append(line + " " + "NOGENE"+"\n") #proper tag

                else: #_RARE_ word (tm)
                    #prob = float(emissions["_RARE_"][0])/float(counts["_RARE_"])
                    #always output max and NOGENE > GENE
                    tags.append(line + " " + "NOGENE"+"\n")
            else:
                tags.append("\n")

    with open("gene.dev.p1.out", "w") as devp1out:
        devp1out.writelines(tags)

if __name__ == "__main__":
    simple_gene_tagger("gene_rare.counts", "gene.dev")


In [64]:
simple_gene_tagger("gene_rare.counts", "gene.dev")

In [2]:
#Ning
#14 October 2019
#Linguistics TP3

def simple_gene_tagger(counts_file, dev_file):

    emissions_gene = {}
    emissions_nogene = {}
    counts = {}

    with open(counts_file, "r") as fp:
        for line in fp:
            segment = line.split()

            if(segment[1]=="WORDTAG"): #doing WORDTAG section
                if(segment[3] == "GENE"):
                    emissions_gene[segment[2]] = float(segment[0])
                elif(segment[3] == "NOGENE"):
                    emissions_nogene[segment[2]] = float(segment[0])

            elif(segment[1]=="1-GRAM"): #doing 1-GRAM section
                if segment[2] in counts:
                    counts[segment[2]] += float(segment[0])
                else:
                    counts[segment[2]] = float(segment[0])

            else: #we're not doing anything beyond 1-GRAM
                break

    print(emissions_gene.items())
    #read dev file and tag
    tags = []
    with open(dev_file, "r") as devfp:

        for line in devfp:
            line = line.split("\n")
            line = line[0]
            if(len(line)>0):

                if(line in emissions_gene and line in emissions_nogene): #in both, i.e: "a"
                    prob_gene = emissions_gene[line]/counts[line]
                    prob_nogene = emissions_nogene[line]/counts[line]
                    if(max(prob_gene, prob_nogene) == prob_gene):
                        tags.append(line + " " + "GENE"+"\n") #proper tag
                    else:
                        tags.append(line + " " + "NOGENE"+"\n") #reverses the tag

                elif(line in emissions_gene):
                    tags.append(line + " " + "GENE"+"\n") #proper tag

                elif(line in emissions_nogene):
                    tags.append(line + " " + "NOGENE"+"\n") #proper tag

                else: #_RARE_ word (tm)
                    #always output max and NOGENE > GENE for _RARE_
                    tags.append(line + " " + "NOGENE"+"\n")
            else:
                tags.append("\n")

    with open("gene.dev.p1.out", "w") as devp1out:
        devp1out.writelines(tags)

if __name__ == "__main__":
    simple_gene_tagger("gene_rare.counts", "gene.dev")


dict_items([('alkaline', 28.0), ('phosphatases', 9.0), ('5', 110.0), ('-', 4395.0), ('_RARE_', 9515.0), ('HMG', 23.0), ('Serum', 13.0), ('gamma', 92.0), ('secretory', 3.0), ('antibodies', 49.0), ('lipase', 16.0), ('HLA', 23.0), ('phosphatase', 65.0), ('glutamyl', 6.0), ('alpha', 370.0), ('IgG', 46.0), ('immunoglobulin', 33.0), ('E', 65.0), ('esterase', 11.0), ('fibrinogen', 23.0), ('fibrin', 10.0), ('FR', 4.0), ('antigen', 59.0), ('human', 292.0), ('albumin', 27.0), ('amniotic', 1.0), ('fluid', 2.0), ('ferritin', 12.0), ('factor', 370.0), ('IX', 7.0), ('2', 373.0), ('1', 879.0), ('plasminogen', 29.0), ('C3', 11.0), ('complement', 7.0), ('degradation', 1.0), ('products', 10.0), ('platelet', 18.0), ('serum', 85.0), ('thrombin', 10.0), ('HindIII', 8.0), ('EcoRI', 8.0), ('ACTH', 19.0), ('MSH', 6.0), ('LH', 14.0), ('acid', 54.0), ('dehydrogenase', 23.0), ('hepatitis', 10.0), ('B', 167.0), ('cytochrome', 24.0), ('oxidase', 27.0), ('peroxidase', 17.0), ('beta', 321.0), ('receptors', 72.0), ('

In [30]:
#Ning
#22 october 2019
#Linguistics TP3

import numpy as np
import math

def setup_dictionaries(countsfile):

    gene_emission = {}
    nogene_emission = {}
    counts_1gram = {}
    counts_2gram = {}
    counts_3gram = {}

    with open(countsfile, "r") as count_file:
        for line in count_file:
            line = line.split() #separate each line of the file to a list

            if(line[1] == "WORDTAG"): #WORDTAG or 1-GRAM or 2-GRAM or 3-GRAM
                if(line[2] == "GENE"): #emission for WORDTAG was GENE
                    gene_emission[line[3]] = float(line[0]) #line[3] is the word, and line[0] is the count
                else:
                    nogene_emission[line[3]] = float(line[0]) #otherwise tag is NOGENE

            elif(line[1] == "1-GRAM"):
                counts_1gram[line[2]] = float(line[0])

            elif(line[1] == "2-GRAM"): #line[3] and line[4] are emissions
                counts_2gram[line[2]+" "+line[3]] = float(line[0])

            elif(line[1] == "3-GRAM"): #the rest are trigrams
                counts_3gram[line[2]+ " "+line[3]+" "+line[4]] = float(line[0])

    return gene_emission, nogene_emission, counts_1gram, counts_2gram, counts_3gram

def setup_wordlist(devfile):

    #open the devfile to get the list of words
    #wordlist = ["*", "*"] #make sure to start with * * tokens
    with open(devfile, "r") as dev_file:
        for line in dev_file:
            line = line.split("\n")
            word = line[0]

            if(len(word) > 0): #not an empty line

                wordlist.append(word)

    #wordlist.append("STOP") #append STOP tag at the end

    return wordlist

def setup_transition_p(bigram, trigram):

    transitions = np.zeros((1,18))

    #possible transitions, these are the order we use 0->17
    transitions[0][0] = math.log2(trigram["* * GENE"]/bigram["* *"])
    transitions[0][1] = math.log2(trigram["* * NOGENE"]/bigram["* *"])
    transitions[0][2] = math.log2(trigram["* GENE NOGENE"]/bigram["* GENE"])
    transitions[0][3] = math.log2(trigram["* GENE GENE"]/bigram["* GENE"])
    transitions[0][4] = math.log2(trigram["* NOGENE GENE"]/bigram["* NOGENE"])
    transitions[0][5] = math.log2(trigram["* NOGENE NOGENE"]/bigram["* NOGENE"])
    transitions[0][6] = math.log2(trigram["GENE NOGENE GENE"]/bigram["GENE NOGENE"])
    transitions[0][7] = math.log2(trigram["GENE NOGENE NOGENE"]/bigram["GENE NOGENE"])
    transitions[0][8] = math.log2(trigram["GENE NOGENE STOP"]/bigram["GENE NOGENE"])
    transitions[0][9] = math.log2(trigram["GENE GENE NOGENE"]/bigram["GENE GENE"])
    transitions[0][10] = math.log2(trigram["GENE GENE GENE"]/bigram["GENE GENE"])
    transitions[0][11] = math.log2(trigram["GENE GENE STOP"]/bigram["GENE GENE"])
    transitions[0][12] = math.log2(trigram["NOGENE GENE NOGENE"]/bigram["NOGENE GENE"])
    transitions[0][13] = math.log2(trigram["NOGENE GENE GENE"]/bigram["NOGENE GENE"])
    transitions[0][14] = math.log2(trigram["NOGENE GENE STOP"]/bigram["NOGENE GENE"])
    transitions[0][15] = math.log2(trigram["NOGENE NOGENE GENE"]/bigram["NOGENE NOGENE"])
    transitions[0][16] = math.log2(trigram["NOGENE NOGENE NOGENE"]/bigram["NOGENE NOGENE"])
    transitions[0][17] = math.log2(trigram["NOGENE NOGENE STOP"]/bigram["NOGENE NOGENE"])

    return transitions

def emission(word, gene, nogene, gram):
    '''
    needs to get the emission probability
    '''

    if word in gene and word in nogene:
        #take the bigger probability
        prob_GENE = math.log2(gene[word]/gram["GENE"])
        prob_NOGENE = math.log2(nogene[word]/gram["NOGENE"])
        choice = max(prob_GENE, prob_NOGENE)
        return choice
    elif word in gene:
        prob_GENE = math.log2(gene[word]/gram["GENE"])
        return prob_GENE
    elif word in nogene:
        prob_NOGENE = math.log2(nogene[word]/gram["NOGENE"])
        return prob_NOGENE
    else:
        prob_GENE = math.log2(gene["_RARE_"]/gram["GENE"])
        prob_NOGENE = math.log2(nogene["_RARE_"]/gram["NOGENE"])
        choice = max(prob_GENE, prob_NOGENE)
        return choice

def setup_trellis(wordlist, tMAT):
    '''
    trellis will be a matrix of tMAT x wordlist that will keep max values
    '''
    I_ = tMAT.shape[1]
    J_ = len(wordlist)

    trellis = np.zeros((I_, J_)) #trellis is a IxJ matrix that will keep our max values

    return trellis


if __name__ == "__main__":

    #set up
    gene, nogene, gram, bigram, trigram = setup_dictionaries("gene_rare.counts")
    wordlist = setup_wordlist("gene.dev")
    tMAT = setup_transition_p(bigram, trigram)
    trellis = setup_trellis(wordlist, tMAT)

    #run tagging
    trellis[:,0] = 0
    for i in range(1, len(wordlist)):
        trellis[:,i] = np.max(trellis[:,i-1]+tMAT.T+emission(wordlist[i], gene, nogene, gram), 1)

    for column in reversed(trellis.T):
        backtracking.append(np.argmax(column))
    backtracking = backtracking[::-1] #reverse the list for proper order
    print(backtracking)

    #output to file


[0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 