In [0]:
# Speech and Language Processing
# Information Retrieval

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

import pandas as pd
from scipy.spatial.distance import cosine

from sklearn.metrics.pairwise import cosine_similarity


def word2vec(word):
    #taken from https://stackoverflow.com/questions/29484529/cosine-similarity-between-two-words-in-a-list
    from collections import Counter
    from math import sqrt

    # count the characters in word
    cw = Counter(word)
    # precomputes a set of the different characters
    sw = set(cw)
    # precomputes the "length" of the word vector
    lw = sqrt(sum(c*c for c in cw.values()))

    # return a tuple
    return cw, sw, lw

def cosdis(v1, v2):
    # which characters are common to the two words?
    common = v1[1].intersection(v2[1])
    # by definition of cosine distance we have
    return sum(v1[0][ch]*v2[0][ch] for ch in common)/v1[2]/v2[2]



def parseAlternatingLinesFile(file):     #-----------------------------
   # read a sequence of pairs of lines, e.g. text of webpage(s), name/URL
   sequenceA = []
   sequenceB = [] 
   fp = open(file, 'r')
   expectingA = True 
   for line in fp.readlines():
       if expectingA:
           sequenceA.append(line.rstrip())
           expectingA = False
       else:
           sequenceB.append(line.rstrip())
           expectingA = True
   fp.close()
   return sequenceA, sequenceB


def characterTrigrams(text):         #----------------------------
  return [text[i:i+3] for i in range(len(text)-3+1)]


def computeFeatures(text, trigramInventory):        #-----------------------------
    # catches the similarities between  "social" and "societal" etc. 
    # but really should be replaced with something better
    finalTokens = {}
    
    
    #REMOVING PUNCTUATION
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    textNo_punct = ""
    for char in text:
        if char not in punctuations:
            textNo_punct = textNo_punct + char        
        
    #UNIGRAMS and REMOVING STOP WORDS
    tokens = word_tokenize(textNo_punct)
    tokensNoStopWords = [w for w in tokens if not w in stop_words]    
    
    ps = PorterStemmer()    
    i=0
    for w in tokensNoStopWords:
        finalTokens[i] = ps.stem(w)
        i=i+1
    
    trigrams = tokensNoStopWords;
    counts = {}
    for trigram in trigrams:
        if trigram in trigramInventory:
            if trigram in counts:
                counts[trigram] += 1
            else:
                counts[trigram] = 1              
    return counts
   

def computeSimilarity(dict1, dict2):   #-----------------------------
    matchCount = 0
    
    similarity=0
    for tri in dict1:
        for tri2 in dict2:
            vw1 = word2vec(tri)
            vw2 = word2vec(tri2)            
            similarity = similarity + cosdis(vw1,vw2)
    return similarity


def retrieve(queries, trigramInventory, archive):      #-----------------------------
    # returns an array: for each query, the top 3 results found
    top3sets = [] 
    for query in queries:
        q = computeFeatures(query, trigramInventory)
        similarities = [] 
        for d in archive:
            similarities.append(computeSimilarity(q, d))
        top3indices = np.argsort(similarities)[0:3]
        #print "top three indices are "
        #print top3indices
        top3sets.append(top3indices)  
    return top3sets

def valueOfSuggestion(result, position, targets):   #-----------------------------
    weight = [1.0, .5, .25]
    if result in targets:
        return weight[max(position, targets.index(result))]
    else:
        return 0


def scoreResults(results, targets):   #-----------------------------
    merits = [valueOfSuggestion(results[i], i, targets) for i in [0,1,2]]
    return sum(merits)


def scoreAllResults(queries, results, targets, descriptor):   #-----------------------------
    print ('\nScores for ' + descriptor)
    scores = [] 
    for q, r, t in zip(queries, results, targets):
       print ('for query: ' + q)
       print (' results = ')
       print (r)
       print (' targets = ')
       print (t)
       s = scoreResults(r, t)
       print ('  score = %.3f' % s)
       scores.append(s)
    overallScore = np.mean(scores)
    print ('all scores')
    print (scores)
    print ('overall score is %.3f' % overallScore)
    return overallScore


def pruneUniqueNgrams(ngrams):        # ----------------------
    twoOrMore = {} 
    print ('before pruning: %d ngrams across all documents' % len(ngrams))
    for key in ngrams:
        if ngrams[key] > 1:
            twoOrMore[key] = ngrams[key]
    print ('after pruning: %d ngrams across all documents' % len(twoOrMore))
    return twoOrMore

def findAllNgrams(contents):          # ----------------------
    allTrigrams = {}
    finalTokens = {}
    merged = ''
    for text in contents:
        #print('++++++++++', text , '===============')
        
        #REMOVING PUNCTUATION
        punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
        textNo_punct = ""
        for char in text:
           if char not in punctuations:
               textNo_punct = textNo_punct + char                        
                
        #UNIGRAMS and REMOVING STOP WORDS
        tokens = word_tokenize(textNo_punct)
        tokensNoStopWords = [w for w in tokens if not w in stop_words]    
        
        ps = PorterStemmer()    
        i=0
        for w in tokensNoStopWords:
            finalTokens[i] = ps.stem(w)
            i=i+1
            #print(ps.stem(w))            
                
        for token in finalTokens:
            if token in allTrigrams:
                allTrigrams[token] += 1
            else:
                allTrigrams[token] = 1
    return allTrigrams


def targetNumbers(targets, nameInventory):        # ----------------------
    # targets is a list of strings, each a sequence of names
    targetIDs = []
    for target in targets:
      threeNumbers = [] 
      for name in target.split():
          threeNumbers.append(nameInventory.index(name))
      targetIDs.append(threeNumbers)
    return targetIDs
          

# main ----------------------------------------------------
import sys, numpy as np

print('......... irStub .........')
contents, names =  parseAlternatingLinesFile('csFaculty.txt') 
print ('read in pages for ')
print (names)
trigramInventory = pruneUniqueNgrams(findAllNgrams(contents))
archive = [computeFeatures(line, trigramInventory) for line in contents]

#queryFile = 'testQueries.txt'
queryFile = 'trainingQueries.txt'

queries, targets = parseAlternatingLinesFile(queryFile)
targetIDs = targetNumbers(targets, names)
results = retrieve(queries, trigramInventory, archive)
modelName = 'silly character trigram model'
scoreAllResults(queries, results, targetIDs, modelName + ' on ' + queryFile)

......... irStub .........
read in pages for 
['akbar', 'badreddin', 'ceberio', 'cheon', 'freudenthal', 'fuentes', 'gates', 'hossain', 'kiekintveld', 'kreinovich', 'longpre', 'novick', 'salamah', 'villanueva', 'ward', 'tosh', 'acosta']
before pruning: 3893 ngrams across all documents
after pruning: 3893 ngrams across all documents

Scores for silly character trigram model on trainingQueries.txt
for query: Artificial Intelligence
 results = 
[ 0 14 13]
 targets = 
[5, 8, 11]
  score = 0.000
for query: AI
 results = 
[ 0 14 13]
 targets = 
[5, 8, 11]
  score = 0.000
for query: ML
 results = 
[ 0 14 13]
 targets = 
[5, 7, 14]
  score = 0.250
for query: Deep Learning
 results = 
[ 0 14 13]
 targets = 
[5, 7, 14]
  score = 0.250
for query: Neural Networks
 results = 
[ 0 14 13]
 targets = 
[5, 7, 14]
  score = 0.250
for query: Social Computing
 results = 
[ 0 14 13]
 targets = 
[13, 6, 0]
  score = 0.500
for query: Internet of Things
 results = 
[ 0 14 13]
 targets = 
[12, 16, 4]
  score = 

0.15441176470588236