In [66]:
import math
import re
import heapq

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

# Read line by line, remove any special characters but white space

In [90]:
test = "\\+++|||\n\t104 Nash,,, Hall, Departm$$$ent of ###Fisheri##es and Wildlife; Oregon State University, Corvallis, OR 97331-7501."
re.sub('[^A-Za-z0-9 ]+', '', test)
# re.split(", |; | ", t)
# ''.join(e for e in t if e.isalnum())

'104 Nash Hall Department of Fisheries and Wildlife Oregon State University Corvallis OR 973317501'

In [91]:
filepath = "./final_orgs.txt"
samples = 1000
data = []
with open(filepath) as f:
    while samples > 0:
        data.append(re.sub('[^A-Za-z0-9 ]+', '', (next(f))))
        samples -= 1
# print(data)

# Compute the Tf-Idf table

In [92]:
a = Tf_idf(data)
a.get_tfidf_table()

# Query

In [104]:
# heapq.nlargest(10, a.query("University"))
heapq.nlargest(10, a.query("Georgia Institute of Technology"))
# heapq.nlargest(10, a.query("100 Radiation Center Oregon State University Corvallis"))

[(0.39864781862538884, '10524 California Institute of Technology'),
 (0.3444626099831442,
  '0ffice of Information Teclhnology Georgia Institute of TechnologyTAB'),
 (0.23503090133769758,
  '10524 Robinson Lab California Institute of Technology Pasadena California 91125'),
 (0.09859764475914322,
  '0601 class of Computer Science  Technology Hubei University of Technology'),
 (0.08551410564996555,
  '100 Haiquan Road Dept of Food Science and Technology Shanghai Inst of Technology Shanghai China'),
 (0.08468069944196469,
  '100 Bureau DR National Institute of Standards and Technology Gaithersburg MD 208998662 USA'),
 (0.07763169629565018,
  '100 Bureau Drive Stop 8940 National Institute of Standards  Technology Gaithersburg Maryland 20899TAB'),
 (0.0773660503472803,
  '100 Bureau Drive Stop 8423 National Institute of Standards and Technology Gaithersburg Maryland 208998423 USA'),
 (0.07322102476361969, '101st University Hospital Clinic Tbilisi Georgia'),
 (0.07061979351151958,
  '100 Bur

In [73]:
class Tf_idf(object):
    def __init__(self, train_list):
        self.train = train_list
        self.train_num = len(train_list)
        self.words_bag = set([word for sentence in train_list
                              for word in sentence.lower().split(' ')])
        for sentence in train_list:
            two_gram = self._cal_two_gram(sentence)
            self.words_bag = self.words_bag.union(two_gram)
            three_gram = self._cal_three_gram(sentence)
            self.words_bag = self.words_bag.union(three_gram)

        self.words_count = {word: 0 for word in self.words_bag}
        self.idfs = {}
        self.tfs = []

    def _cal_word_frequency(self):
        for sentence in self.train:
            for word in sentence.lower().split(' '):
                self.words_count[word] += 1
            for word_gram in self._cal_two_gram(sentence):
                self.words_count[word_gram] += 1
            for word_gram in self._cal_three_gram(sentence):
                self.words_count[word_gram] += 1

    def _cal_two_gram(self, sentence):
        word_list = sentence.lower().split(' ')
        two_gram_bag = set()
        for i, _ in enumerate(word_list[:-1]):
            two_gram = ' '.join([word_list[i], word_list[i+1]])
            two_gram_bag.add(two_gram)
        return two_gram_bag
    
    def _cal_three_gram(self, sentence):
        word_list = sentence.lower().split(' ')
        three_gram_bag = set()
        for i, _ in enumerate(word_list[:-2]):
            three_gram = ' '.join([word_list[i], word_list[i+1], word_list[i+2]])
            three_gram_bag.add(three_gram)
        return three_gram_bag


    def _cal_tf(self):
        for sentence in self.train:
            word_list = sentence.lower().split(' ')
            word_gram_list = list(self._cal_two_gram(sentence))
            word_list += word_gram_list
            word_gram_list = list(self._cal_three_gram(sentence))
            word_list += word_gram_list
            length = len(word_list)
            word_tf = [1 / length**0.5] * length
            self.tfs.append(dict(zip(word_list, word_tf)))

    def _cal_idf(self):
        self._cal_word_frequency()
        for word in self.words_bag:
            if self.words_count[word] == 0:
                print(word)
                print(self.words_count)
            ratio = self.train_num / self.words_count[word]
            self.idfs[word] = math.log10(ratio)

    def get_tfidf_table(self):
        self._cal_tf()
        self._cal_idf()
        self.tfidfs = self.tfs.copy()
        for i, tf_dict in enumerate(self.tfs):
            for key in tf_dict.keys():
                self.tfidfs[i][key] = tf_dict[key] * self.idfs[key]

    def query(self, sentence):
        word_list = sentence.lower().split(' ')
        word_gram_list = list(self._cal_two_gram(sentence))
        word_list += word_gram_list
        word_gram_list = list(self._cal_three_gram(sentence))
        word_list += word_gram_list
        length = len(word_list)
        word_tf = [1 / length**0.5] * length
        tf_query = dict(zip(word_list, word_tf))
        tfidf_query = {w: tf_query[w] * self.idfs.get(w, 0) for w in word_list}

        similarity_scores = []
        for i, sentence in enumerate(self.tfidfs):
            score = self.similarity(sentence, tfidf_query)
            similarity_scores.append((score, self.train[i]))
        return similarity_scores

    def query_top(self, sentence):
        return np.argsort(self.query(sentence))

    def similarity(self, tfidf1, tfidf2):
        dot_value = 0
        for word in tfidf2:
            if word not in tfidf1:
                continue
            dot_value += tfidf1[word] * tfidf2[word]
        l1 = sum([i**2 for i in tfidf1.values()])**0.5
        l2 = sum([i**2 for i in tfidf2.values()])**0.5
        return dot_value / (l1 * l2)