In [1]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD

In [2]:
documents = [
    "What problems and concerns are there in making up descriptive titles?",
    "What difficulties are involved in automatically retrieving articles from approximate titles?",
    "What is the usual relevance of the content of articles to their titles?",
    "How can actually pertinent data, as opposed to references or entire articles themselves, be retrieved automatically in response to information requests?"
]

In [3]:
# Query to retrieve relevant documents
query = "titles articles"


In [5]:
# Step 1: Convert documents to term-document matrix using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
#Converts the text into a term-document matrix, where each entry represents 
#the importance (weight) of a 
#word in a document.
X = vectorizer.fit_transform(documents)


In [6]:
# Step 2: Apply SVD to reduce dimensions
n_components = 2  # Reduce to 2 latent dimensions
svd = TruncatedSVD(n_components=n_components)
# Reduces the dimensions of the term-document matrix (to 2 components)
#by keeping only the most important "latent" concepts
X_reduced = svd.fit_transform(X)

In [7]:
# Step 3: Transform the query into the reduced space
query_vec = vectorizer.transform([query])
query_reduced = svd.transform(query_vec)
#The query is transformed into the same 
#reduced-dimensional space as the documents.

In [8]:
# Step 4: Calculate cosine similarity between query and documents in reduced space
cosine_similarities = cosine_similarity(query_reduced, X_reduced).flatten()
#cosine_similarity() returns a 2D array (even though it might have only
#one row). 
#By applying .flatten(), we convert this 2D array into a 1D array (vector), 
#where each element corresponds to the similarity score between 
#the query and each document.


In [14]:
# Step 5: Retrieve the top matching documents
top_indices = cosine_similarities.argsort()[::-1] 
# Sort in descending order

In [15]:
print("Query:", query)
print("\nTop matching documents:")
for idx in top_indices:
    print(f"Document {idx + 1}: {documents[idx]} (Similarity: {cosine_similarities[idx]:.4f})")

Query: titles articles

Top matching documents:
Document 3: What is the usual relevance of the content of articles to their titles? (Similarity: 0.9933)
Document 2: What difficulties are involved in automatically retrieving articles from approximate titles? (Similarity: 0.9419)
Document 1: What problems and concerns are there in making up descriptive titles? (Similarity: 0.6359)
Document 4: How can actually pertinent data, as opposed to references or entire articles themselves, be retrieved automatically in response to information requests? (Similarity: 0.4132)


In [16]:
 documents=["What is information science?  Give definitions where possible.",
"What special training will ordinary researchers and businessmen need for proper information management and unobstructed use of information retrieval systems?",                  
"What problems are they likely to encounter?",
"Describe information retrieval and indexing in other languages.",
"What bearing does it have on the science in general?",
 " The use of abstract mathematics in information retrieval, e.g. group theory.",
   " What is the need for information consolidation, evaluation, and retrieval in scientific research?"]

In [17]:
query="information science retrieval"

In [18]:
vectorizer= TfidfVectorizer(stop_words='english')
X=vectorizer.fit_transform(documents)

In [19]:
n_components=3
svd= TruncatedSVD(n_components=n_components)
X_reduced=svd.fit_transform(X)


In [20]:
query_vec=vectorizer.transform([query])
query_reduced=svd.transform(query_vec)

In [21]:
cosine_similarities = cosine_similarity(query_reduced, X_reduced).flatten()

In [22]:
top_indices=cosine_similarities.argsort()[::-1]

In [24]:
print("Query:",query)
print("\n top matching documents:")
for idx in top_indices:
    print(f"Document  {idx+1}: {documents[idx]} (Similarity: {cosine_similarities[idx]:.4f})")

Query: information science 

 top matching documents:
Document  1: What is information science?  Give definitions where possible. (Similarity: 0.9898)
Document  5: What bearing does it have on the science in general? (Similarity: 0.8703)
Document  4: Describe information retrieval and indexing in other languages. (Similarity: 0.4656)
Document  2: What special training will ordinary researchers and businessmen need for proper information management and unobstructed use of information retrieval systems? (Similarity: 0.4235)
Document  7:  What is the need for information consolidation, evaluation, and retrieval in scientific research? (Similarity: 0.3860)
Document  6:  The use of abstract mathematics in information retrieval, e.g. group theory. (Similarity: 0.3860)
Document  3: What problems are they likely to encounter? (Similarity: -0.0000)


In [25]:
documents=["What methods are there for encoding, automatically matching, and automatically drawing structures extended in two dimensions,like the structural formulas for chemical compounds?",
"Techniques of machine matching and machine searching systems.Coding and matching methods.",
" Testing automated information systems.The need to provide personnel for the information field.",
 "Automated information in the medical field.Amount of use of books in libraries.",
"Relation to need for automated information systems ",
"Educational and training requirements for personnel in the information field.",
"Possibilities for this training.  Needs for programs providing this training International systems for exchange and dissemination of information.",
 "Cost and determination of cost associated with systems of automated information.",
"Computerized information retrieval systems.  Computerized indexing systems.",
"Computerized information systems in fields related to chemistry."]

In [26]:
query="information automated systems"

In [27]:
vectorizer= TfidfVectorizer(stop_words='english')
X=vectorizer.fit_transform(documents)

In [28]:
n_components=3
svd= TruncatedSVD(n_components=n_components)
X_reduced=svd.fit_transform(X)

In [29]:
query_vec=vectorizer.transform([query])
query_reduced=svd.transform(query_vec)

In [30]:
cosine_similarities = cosine_similarity(query_reduced, X_reduced).flatten()

In [31]:
top_indices=cosine_similarities.argsort()[::-1]

In [33]:
print("Query:",query)
print("\n top matching documents:")
for idx in top_indices:
    print(f"Document  {idx+1}: {documents[idx]} (Similarity: {cosine_similarities[idx]:.4f})")

Query: information automated systems

 top matching documents:
Document  8: Cost and determination of cost associated with systems of automated information. (Similarity: 0.9995)
Document  5: Relation to need for automated information systems  (Similarity: 0.9926)
Document  3:  Testing automated information systems.The need to provide personnel for the information field. (Similarity: 0.9259)
Document  7: Possibilities for this training.  Needs for programs providing this training International systems for exchange and dissemination of information. (Similarity: 0.9002)
Document  4: Automated information in the medical field.Amount of use of books in libraries. (Similarity: 0.7649)
Document  6: Educational and training requirements for personnel in the information field. (Similarity: 0.7040)
Document  9: Computerized information retrieval systems.  Computerized indexing systems. (Similarity: 0.6080)
Document  10: Computerized information systems in fields related to chemistry. (Similarity

In [7]:
import urllib.request
import tarfile
import os
import numpy as np
from sklearn. feature_extraction. text import TfidfVectorizer
from sklearn. decomposition import TruncatedSVD
from sklearn.metrics. pairwise import cosine_similarity

In [19]:
url = 'http://ir.dcs.gla.ac.uk/resources/test_collections/cisi/cisi.tar.gz'
dataset_path = 'cisi.tar.gz'

# Download the dataset
urllib.request.urlretrieve(url, dataset_path)

# Extract the tar.gz file
with tarfile.open(dataset_path, 'r:gz') as tar_ref:
    tar_ref.extractall('cisi')

# Remove the downloaded tar.gz file
os.remove(dataset_path)


  tar_ref.extractall('cisi')


In [9]:
docs = []
doc_path = 'cisi/CISI.ALL'

with open(doc_path, 'r') as f:
    content = f.read()
    documents = content. split(".I ")
    for doc in documents:
        if ".W" in doc:
            docs.append(doc.split(".W")[1].strip())

In [10]:
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer. fit_transform(docs)

In [11]:
svd = TruncatedSVD(n_components=100)
svd_matrix = svd. fit_transform(tfidf_matrix)

In [12]:
query = "information retrieval"

query_tfidf = vectorizer. transform( [query])
query_svd = svd.transform(query_tfidf)

cosine_similarities = cosine_similarity(query_svd, svd_matrix).flatten()
ranked_docs = np.argsort(-cosine_similarities)

In [13]:
for idx in ranked_docs:
    print(f"Document  {idx+1}: {documents[idx]} (Similarity: {cosine_similarities[idx]:.4f})")

Document  1171: 1170
.T
Some Aspects of Subject Acquisition and Detailed Subject Retrieval of Patent
Information
.A
Shenderov, V. Z.
.W
   Aspects of subject acquisition and retrieval of patent information are
discussed.. A patent-information service system is conventionally separated
into two parts:  a subject acquisitions system designed for stock acquisition
and search file building, and a detailed subject retrieval system designed to
deal with specific user requests.. The performance of both systems is analyzed
using patent classifications as examples.. The tasks of classification research
in relation to subject acquisitions system requirements are formulated..
.X
361	2	1170
759	2	1170
765	1	1170
1117	1	1170
1170	5	1170
1381	1	1170
1381	1	1170
 (Similarity: 0.7437)
Document  539: 538
.T
Information Retrieval Systems
.A
Lancaster, F.W.
.B
1972
.W
  This book is concerned primarily with those "intellectual" factors that
significantly affect the performance of all information retrieva