
Project 2 Part 4
----------------

In [1]:
using JLD
using LinearAlgebra

In [2]:
#Loading relevent data
tdm = load("shake.jld", "tdm")
dict = load("shake.jld", "dictionary")
plays = load("shake.jld", "plays")


33×1 Array{Any,2}:
 "Cymbeline"
 "Love's Labours Lost"
 "Measure for Measure"
 "The Merchant of Venice"
 "A Midsummer Night's Dream"
 "Much Ado About Nothing"
 "Pericles, Prince of Tyre"
 "Taming of the Shrew"
 "The Tempest"
 "Troilus and Cressida"
 "Twelfth Night"
 "Two Gentlemen of Verona"
 "Winter's Tale"
 ⋮
 "Richard II"
 "Richard III"
 "Antony and Cleopatra"
 "Coriolanus"
 "Hamlet"
 "Julius Caesar"
 "King Lear"
 "Macbeth"
 "Othello"
 "Romeo and Juliet"
 "Timon of Athens"
 "Titus Andronicus"

In [3]:
#Displaying the dictionary to more easily select words.
dict

26126×1 Array{Any,2}:
 "aaron"
 "aarons"
 "abaissiez"
 "abandon"
 "abandond"
 "abandoned"
 "abase"
 "abashd"
 "abate"
 "abated"
 "abatement"
 "abatements"
 "abates"
 ⋮
 "zealous"
 "zeals"
 "zed"
 "zenelophon"
 "zenith"
 "zephyrs"
 "zir"
 "zodiac"
 "zodiacs"
 "zone"
 "zounds"
 "zwaggered"

In [4]:
#Query function, used to find the relevence of the documents, based on parameter q
function query(A,q)
  n,m=size(A)
  B=zeros(n,m)
  for j=1:m
    B[:,j]=A[:,j]/norm(A[:,j])
  end
  q=q/norm(q)
  w=B'*q
  return w
end

query (generic function with 1 method)

In [5]:
#This function effectively converts a plain-text search into a query vector
function giveQ(search_terms)
  output = zeros(size(dict)[1])
  for term in search_terms
    index = findfirst(isequal(term),dict)
    if typeof(index) != Nothing
      output[index] = 1
    end
  end
  output
end

giveQ (generic function with 1 method)

In [6]:
#Given the result from the query function, this lets you pull the relevent documents, determined by the threshold parameter. It then uses the plays document to return answers as actual play titles.
function giveRelevent(q, threshold)
  answers = findall(x -> x >= threshold, q)
  plays[answers]
end

giveRelevent (generic function with 1 method)


Parts a and b
-------------

In [7]:
#Using three terms from the dictionary gives the following results
terms1 = ["zenith", "abandoned", "zodiac"]
query1 = query(tdm, giveQ(terms1))
giveRelevent(query1, 0.001)

2-element Array{Any,1}:
 "The Tempest"
 "Titus Andronicus"

In [8]:
#Doing likewise as above, but with five terms. It is worth mentioning that "house" stood out as such a disruptive phrase, the relevence threshold had to be changed to 0.02 in order to not just recieve every single document.
terms2 = ["zealous", "abashed", "abandon", "zone", "house"]
query2 = query(tdm, giveQ(terms2))
giveRelevent(query2, 0.02)

3-element Array{Any,1}:
 "The Merchant of Venice"
 "Taming of the Shrew"
 "Twelfth Night"

Part c
-

In [14]:
#This function returns the rank 10 estimation for our dictionary matrix.The matrix of A'A is only 33x33, so we can safely allow julia to find the eigen vectors and values used for the SVD. However, for AA', we need to manually calculate each of the ten columns being used.
function Rank10Estimate(A)
  ATA = A'A
  
  #SVD components are either evaluated or initialized
  Sigma = zeros(10,10)
  V = eigvecs(ATA)
  U = zeros(26126,10)
  
  #We now iterate to construct both the sigma and U vectors
  EV = (reverse(eigvals(ATA)))
  for i = 1:10
    Sigma[i,i] = sqrt(EV[i])
    U[:,i] = (A*V[:,i])/sqrt(EV[i])
  end
  
  #Matrix V is trimmed down to only have the 10 columns needed for the estimate
  V = V[:,1:10]
  
  #Rank 10 estimate is then calculated and returned
  estimate = U * Sigma * (V')
  return estimate
  
  
end

Rank10Estimate (generic function with 1 method)

In [15]:
#We now obtain our rank 10 estimate to use with our previously shown search terms.
Ak = Rank10Estimate(tdm)

26126×33 Array{Float64,2}:
 -0.569677    0.0365613    0.496025    …  -0.853506    1.05846
 -0.0227871   0.00146245   0.019841       -0.0341403   0.0423384
  0.119773    0.0279226    0.00906437     -0.0654196   0.0353117
 -0.0161837   0.111104     0.00414608      0.236931   -0.0580767
  0.0391666  -0.143915     0.0414479       0.243738   -0.0183725
 -0.0227871   0.00146245   0.019841    …  -0.0341403   0.0423384
  0.0459727   0.0234844   -0.111041       -0.0690157  -0.00184967
  0.0208102  -0.0247816   -0.0132676      -0.0025756  -0.00738667
  1.309       0.377431     0.00948303     -0.325717    0.144288
  0.0673405   0.0940875   -0.0671565      -0.233997   -0.0590507
  0.525702   -0.105889    -0.0470225   …  -0.172676   -0.00804257
  0.0339799  -0.034473     0.0171731       0.0228191   0.0239178
 -0.165178    0.00980017   0.00351649     -0.117529   -0.00205477
  ⋮                                    ⋱              
 -0.240411    0.250847    -0.110659        0.328052   -0.0819269
 -0.057

In [16]:
#Reusing our first set of terms, shown in comments below. Notice that the approximation allows for some more to be returned when it was not before.
#terms1 = ["zenith", "abandoned", "zodiac"]
query3 = query(Ak, giveQ(terms1))
giveRelevent(query3, 0.001)

6-element Array{Any,1}:
 "The Merchant of Venice"
 "The Tempest"
 "Henry VIII"
 "Coriolanus"
 "Hamlet"
 "Titus Andronicus"

In [17]:
#Just like above, terms2 is searched for in this rank10 estimate. In this case, the results have more pronounced differences. Taming of the Shrew is no longer present, and three other plays have now shown. This is likely due to how much the term "house" made many results similar in size, so small deviations are more noticable.
#terms2 = ["zealous", "abashed", "abandon", "zone", "house"]
query4 = query(Ak, giveQ(terms2))
giveRelevent(query4, 0.02)


5-element Array{Any,1}:
 "Measure for Measure"
 "The Merchant of Venice"
 "Twelfth Night"
 "Henry VI, part 3"
 "Richard III"