In [1]:
# === Create Example Text Files for Single-Pass Clustering ===

# Sample documents, keyed by the filename each one is written to.
texts = {
    "file1.txt": "Machine learning is a branch of artificial intelligence.",
    "file2.txt": "Artificial intelligence and machine learning are related fields.",
    "file3.txt": "Football is the most popular sport in the world.",
    "file4.txt": "Cricket and football are played all over the world.",
    "file5.txt": "Python is used for data science and machine learning."
}

for filename, content in texts.items():
    with open(filename, "w", encoding="utf-8") as f:
        f.write(content)
    # Report success only after the `with` block has closed (and therefore
    # flushed) the file, so the message is never printed for a partial write.
    print(f"{filename} created successfully.")

# Take the count from the dict so the message cannot drift from the data.
print(f"\nAll {len(texts)} files created in current directory ✅")


file1.txt created successfully.
file2.txt created successfully.
file3.txt created successfully.
file4.txt created successfully.
file5.txt created successfully.

All 5 files created in current directory ✅


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# Names of the five sample documents, file1.txt through file5.txt.
filenames = [f"file{n}.txt" for n in range(1, 6)]


In [7]:
# Load every document's text, in the same order as `filenames`. Each file is
# opened under `with` so its handle is closed right after reading rather than
# left for the garbage collector.
documents = []
for path in filenames:
    with open(path, "r", encoding="utf-8") as fh:
        documents.append(fh.read())


In [9]:
# Learn the TF-IDF vocabulary and weights from the documents (English stop
# words removed), then project the same documents into that space.
# `tfidf` holds one row per entry of `documents`.
vectorizer = TfidfVectorizer(stop_words="english").fit(documents)
tfidf = vectorizer.transform(documents)

In [11]:
# Minimum cosine similarity between a document and a cluster centroid for the
# document to join that cluster; a document that scores below this against
# every existing centroid starts a new cluster of its own.
THRESHOLD = 0.3


In [13]:
# Single-pass clustering: visit the documents once, in order. Each document
# joins the FIRST existing cluster whose centroid has cosine similarity
# >= THRESHOLD with it; if no centroid qualifies, it seeds a new cluster.
#
# clusters[k]  -> list of filenames assigned to cluster k
# centroids[k] -> mean TF-IDF row vector of the members of cluster k
clusters, centroids = [], []

for name, vec in zip(filenames, tfidf):
    for j, centroid in enumerate(centroids):
        sim = cosine_similarity(vec, centroid)[0][0]
        if sim >= THRESHOLD:
            # Keep the centroid equal to the mean of all member vectors.
            # Averaging only the old centroid with the new vector,
            # (centroid + vec) / 2, is correct for the second member alone;
            # from the third member on it gives the newest document half of
            # the total weight. Weighting by the current size fixes that.
            size = len(clusters[j])
            centroids[j] = (centroid * size + vec) / (size + 1)
            clusters[j].append(name)
            break
    else:
        # The inner loop finished without `break`: nothing was similar
        # enough, so this document becomes the seed of a new cluster.
        clusters.append([name])
        centroids.append(vec)

In [15]:
# Report each cluster's members, numbering the clusters from 1.
for number, members in enumerate(clusters, start=1):
    print(f"Cluster {number}: {members}")

Cluster 1: ['file1.txt', 'file2.txt']
Cluster 2: ['file3.txt', 'file4.txt']
Cluster 3: ['file5.txt']
