In [1]:
def read_file (filename):
    '''
    Read a "word tag" file and return its words as a list.

    Each non-blank line is expected to hold exactly two whitespace-separated
    fields, the word followed by its tag. Blank and whitespace-only lines
    (sentence separators) are skipped.

    filename: path of the file to read
    returns:  list of the words, in file order (tags are discarded)
    raises:   ValueError if a non-blank line does not have exactly two fields
    '''
    words = []
    with open(filename, "r") as fp:
        for line in fp:
            fields = line.split()
            if not fields: #blank or whitespace-only line, nothing to unpack
                continue
            word, _tag = fields #stores the word separately from tag
            words.append(word)

    return words

In [2]:
# Words of the training file, in order; read_file drops the tag column.
wordlist = read_file("gene.train")

In [3]:
# Number of words read from the training file.
len(wordlist)

386200

In [4]:
def get_count (words):
    '''
    Count how many times each word occurs in a list of words.

    words:   iterable of hashable items (here: word strings)
    returns: dict mapping each distinct word to its number of occurrences;
             an empty dict for empty input
    '''
    counts = {}

    for word in words:
        #start unseen words at 0 so that first occurrences are counted too
        counts[word] = counts.get(word, 0) + 1

    return counts

In [7]:
# Per-word counts for the training words, as computed by get_count.
freqlist = get_count(wordlist)

In [9]:
def replace_words (filename, outfile, freq, threshold=5, rare_token="_RARE_"):
    '''
    Copy a "word tag" file, replacing infrequent words with a rare-word token.

    Every non-blank line of `filename` must hold a word and a tag. A word
    whose count in `freq` is below `threshold` (words missing from `freq`
    count as 0) is written as `rare_token`; the tag is kept as-is. Blank
    lines are written back as blank lines.

    filename:   path of the input file
    outfile:    path of the file to write (overwritten if it exists)
    freq:       dict mapping word -> count
    threshold:  words with a count strictly below this are replaced
    rare_token: replacement written for rare words
    returns:    None
    raises:     ValueError if a non-blank line does not have exactly two fields
    '''
    output_ = []
    with open(filename, "r") as fp:
        for line in fp:
            fields = line.split()
            if not fields: #blank line: keep it as a separator
                output_.append("\n")
                continue

            word, tag = fields #read the words separately
            if freq.get(word, 0) < threshold:
                word = rare_token
            output_.append(word + " " + tag + "\n")

    #write only after the input is closed, so outfile may equal filename
    with open(outfile, "w") as out:
        out.writelines(output_)

In [107]:
def simple_gene_tagger(counts_file, dev_file, out_file="gene.dev.p1.out"):
    '''
    Tag every word of dev_file as GENE or NOGENE using a counts file.

    Lines of counts_file are read as "count KIND word tag". WORDTAG lines
    give the emission count of a word (when a word appears with both tags,
    the later line wins), 1-GRAM lines give the count the emission is
    divided by. Reading stops at the first line of any other kind.

    For each word of dev_file (one word per line):
      * known word: keep the stored tag if emission/count > 0.5,
        otherwise flip it;
      * unknown word: NOGENE;
      * blank line: written back as a blank line.

    counts_file: path of the counts file
    dev_file:    path of the file holding the words to tag
    out_file:    path of the tagged output (defaults to the original
                 hard-coded "gene.dev.p1.out")
    returns:     None
    raises:      KeyError if a word has a WORDTAG line but no 1-GRAM line
    '''
    emissions = {}
    counts = {}

    with open(counts_file, "r") as fp:
        for line in fp:
            segment = line.split()
            if not segment: #blank line: nothing to index into
                continue

            if(segment[1]=="WORDTAG"): #doing WORDTAG section
                if segment[3] in ("GENE", "NOGENE"):
                    emissions[segment[2]] = (segment[0], segment[3]) #store word with emission count

            elif(segment[1]=="1-GRAM"): #doing 1-GRAM section
                if segment[3] in ("GENE", "NOGENE"):
                    counts[segment[2]] = (segment[0], segment[3])

            else: #we're not doing anything beyond 1-GRAM
                break

    #read dev file and tag
    tags = []
    reversi = {"NOGENE":"GENE", "GENE":"NOGENE"}
    with open(dev_file, "r") as devfp:

        for line in devfp:
            word = line.rstrip("\n")
            if not word: #sentence separator, not a word to tag
                tags.append("\n")
            elif word in emissions:
                emitted, tag = emissions[word]
                prob = int(emitted)/int(counts[word][0])
                if prob <= 1-prob: #the other tag outweighs the stored one
                    tag = reversi[tag]
                tags.append(word + " " + tag + "\n")
            else: #_RARE_ word: defaults to NOGENE
                tags.append(word + " " + "NOGENE" + "\n")

    with open(out_file, "w") as devp1out:
        devp1out.writelines(tags)

In [108]:
# Tag gene.dev from the counts in gene_rare.counts; the result is written to gene.dev.p1.out.
simple_gene_tagger("gene_rare.counts", "gene.dev")

In [85]:
# Scratch check: str.split returns a new list and leaves `line` itself
# unchanged, so the print below still shows "test" with its trailing newline.
line = "test\n"
line.split("\n")
print(line)

test



In [63]:
#Ning
#14 October 2019
#Linguistics TP3

def simple_gene_tagger(counts_file, dev_file, out_file="gene.dev.p1.out"):
    '''
    Tag every word of dev_file as GENE or NOGENE using a counts file.

    Lines of counts_file are read as "count KIND word tag". Only WORDTAG
    lines are used: they give how often a word was seen with each tag.
    1-GRAM lines are skipped, and reading stops at the first line of any
    other kind.

    For each word of dev_file (one word per line):
      * seen with both tags: GENE if its GENE count is strictly larger,
        otherwise NOGENE;
      * seen with one tag only: that tag;
      * never seen: NOGENE;
      * blank line: written back as a blank line.

    counts_file: path of the counts file
    dev_file:    path of the file holding the words to tag
    out_file:    path of the tagged output (defaults to the original
                 hard-coded "gene.dev.p1.out")
    returns:     None
    '''
    emissions_gene = {}
    emissions_nogene = {}

    with open(counts_file, "r") as fp:
        for line in fp:
            segment = line.split()
            if not segment: #blank line: nothing to index into
                continue

            if(segment[1]=="WORDTAG"): #doing WORDTAG section
                if(segment[3] == "GENE"):
                    emissions_gene[segment[2]] = float(segment[0])
                elif(segment[3] == "NOGENE"):
                    emissions_nogene[segment[2]] = float(segment[0])

            elif(segment[1]=="1-GRAM"): #doing 1-GRAM section
                #Both tag scores of a word would be divided by the same
                #total, which cannot change which one is larger, so the
                #totals are not needed.
                continue

            else: #we're not doing anything beyond 1-GRAM
                break

    #read dev file and tag
    tags = []
    with open(dev_file, "r") as devfp:

        for line in devfp:
            word = line.rstrip("\n")
            if not word: #sentence separator, not a word to tag
                tags.append("\n")
                continue

            seen_gene = word in emissions_gene
            seen_nogene = word in emissions_nogene

            if seen_gene and seen_nogene:
                #compare raw counts; ties go to NOGENE
                if emissions_gene[word] > emissions_nogene[word]:
                    tag = "GENE"
                else:
                    tag = "NOGENE"
            elif seen_gene:
                tag = "GENE"
            else: #seen only as NOGENE, or _RARE_ word: NOGENE either way
                tag = "NOGENE"

            tags.append(word + " " + tag + "\n")

    with open(out_file, "w") as devp1out:
        devp1out.writelines(tags)

# Script entry point: tag gene.dev using gene_rare.counts.
# In a notebook cell __name__ is also "__main__", so this runs whenever the cell is executed.
if __name__ == "__main__":
    simple_gene_tagger("gene_rare.counts", "gene.dev")


In [64]:
# Same call as in the __main__ guard of the defining cell: re-tags gene.dev and rewrites the output file.
simple_gene_tagger("gene_rare.counts", "gene.dev")