In [1]:
# === Create Example Text Files for Single-Pass Clustering ===
# Writes a handful of short sample documents into the current working
# directory. The texts deliberately overlap in vocabulary (machine learning /
# artificial intelligence, football / world) so that a similarity-based
# clustering has something to group.

texts = {
    "file1.txt": "Machine learning is a branch of artificial intelligence.",
    "file2.txt": "Artificial intelligence and machine learning are related fields.",
    "file3.txt": "Football is the most popular sport in the world.",
    "file4.txt": "Cricket and football are played all over the world.",
    "file5.txt": "Python is used for data science and machine learning."
}

for filename, content in texts.items():
    with open(filename, "w", encoding="utf-8") as f:
        f.write(content)
    # Report success only after the `with` block has closed (and flushed) the file.
    print(f"{filename} created successfully.")

# Take the count from the dict so the summary stays correct if samples are added or removed.
print(f"\nAll {len(texts)} files created in current directory ✅")


file1.txt created successfully.
file2.txt created successfully.
file3.txt created successfully.
file4.txt created successfully.
file5.txt created successfully.

All 5 files created in current directory ✅


In [2]:
# === Single-Pass Clustering Algorithm for Text Documents ===
# CO4 - Distributed and Multimedia Information Retrieval

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# --- Step 1: Mention the files manually ---
filenames = ["file1.txt", "file2.txt", "file3.txt", "file4.txt", "file5.txt"]

# --- Step 2: Read file contents ---
# A context manager closes each handle right after reading; a bare
# open(...).read() inside a comprehension leaves closing to the garbage collector.
documents = []
for path in filenames:
    with open(path, "r", encoding="utf-8") as fh:
        documents.append(fh.read())

# --- Step 3: Convert to TF-IDF vectors ---
vectorizer = TfidfVectorizer(stop_words="english")
tfidf = vectorizer.fit_transform(documents)

# --- Step 4: Set similarity threshold ---
THRESHOLD = 0.3

# --- Step 5: Single-Pass Clustering ---
# clusters[k]  -> file names assigned to cluster k
# centroids[k] -> mean TF-IDF vector of the members of cluster k
clusters, centroids = [], []

for i, vec in enumerate(tfidf):
    assigned = False
    # A document joins the FIRST cluster whose centroid is similar enough
    # (not necessarily the most similar one); later clusters are not checked.
    for j, c in enumerate(centroids):
        sim = cosine_similarity(vec, c)[0][0]
        if sim >= THRESHOLD:
            # Size-weighted running mean keeps the centroid equal to the true
            # average of all members. The former `(centroid + vec) / 2` halved
            # the weight of older members on every update, which is only
            # correct while the cluster has a single member.
            size = len(clusters[j])
            centroids[j] = (c * size + vec) / (size + 1)
            clusters[j].append(filenames[i])
            assigned = True
            break
    if not assigned:
        # No existing cluster is close enough: this document seeds a new one.
        clusters.append([filenames[i]])
        centroids.append(vec)

# --- Step 6: Display results ---
for number, members in enumerate(clusters, 1):
    print(f"Cluster {number}: {members}")


Cluster 1: ['file1.txt', 'file2.txt']
Cluster 2: ['file3.txt', 'file4.txt']
Cluster 3: ['file5.txt']


In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer       # To convert text into TF-IDF vectors
from sklearn.metrics.pairwise import cosine_similarity             # To compute cosine similarity between documents

# --- Step 1: Mention the text files manually ---
filenames = ["file1.txt", "file2.txt", "file3.txt", "file4.txt", "file5.txt"]  # List of text files to cluster

# --- Step 2: Read file contents into a list ---
# A context manager closes each handle right after reading; a bare
# open(...).read() inside a comprehension leaves closing to the garbage collector.
documents = []
for path in filenames:
    with open(path, "r", encoding="utf-8") as fh:
        documents.append(fh.read())                  # Read each file's content

# --- Step 3: Convert the documents into TF-IDF vectors ---
vectorizer = TfidfVectorizer(stop_words="english")   # Create TF-IDF vectorizer (removes common English words)
tfidf = vectorizer.fit_transform(documents)          # Generate TF-IDF matrix (numeric representation of text)

# --- Step 4: Set similarity threshold ---
THRESHOLD = 0.4                                      # If similarity ≥ threshold, same cluster; else new cluster

# --- Step 5: Initialize empty lists for clusters and centroids ---
clusters, centroids = [], []                         # clusters -> list of lists; centroids -> mean vectors

# --- Step 6: Single-Pass Clustering process ---
# Go through each document one by one, getting both its index (i)
# and its TF-IDF vector (vec).
for i, vec in enumerate(tfidf):
    assigned = False                                 # Flag: has this document joined an existing cluster?

    # A document joins the FIRST cluster whose centroid is similar enough
    # (not necessarily the most similar one); later clusters are not checked.
    for j, c in enumerate(centroids):
        sim = cosine_similarity(vec, c)[0][0]        # Cosine similarity between document and centroid
        if sim >= THRESHOLD:
            # Size-weighted running mean keeps the centroid equal to the true
            # average of all members. The former `(centroid + vec) / 2` halved
            # the weight of older members on every update, which is only
            # correct while the cluster has a single member.
            size = len(clusters[j])                  # Member count before adding this document
            centroids[j] = (c * size + vec) / (size + 1)
            clusters[j].append(filenames[i])         # Add this file to that cluster
            assigned = True
            break                                    # Stop at the first matching cluster

    if not assigned:                                 # Not similar enough to any existing cluster
        clusters.append([filenames[i]])              # Create a new cluster with this file
        centroids.append(vec)                        # Its vector is the new cluster's centroid

# --- Step 7: Display final clusters ---
for number, members in enumerate(clusters, 1):       # Cluster numbering starts at 1
    print(f"Cluster {number}: {members}")            # Print cluster number and the file names inside it

Cluster 1: ['file1.txt', 'file2.txt']
Cluster 2: ['file3.txt']
Cluster 3: ['file4.txt']
Cluster 4: ['file5.txt']


' “Go through each document one by one,\nand get both its index (i) and its TF-IDF vector (vec).”'