# Clustering of Documents

### Data Collection

In [4]:
!pip install python-docx


Collecting python-docx
  Downloading python_docx-1.1.0-py3-none-any.whl (239 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/239.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.6/239.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-docx
Successfully installed python-docx-1.1.0


In [5]:
from docx import Document

def read_docx(file_path):
    """Load a .docx file and return its paragraph text as a single string.

    The text of each entry in the document's `paragraphs` is collected in
    order and the pieces are joined with a single space.
    """
    paragraphs = Document(file_path).paragraphs
    texts = []
    for paragraph in paragraphs:
        texts.append(paragraph.text)
    return " ".join(texts)

# Input files to cluster
doc1_path = '/content/Doc 1.docx'
doc2_path = '/content/Doc 2.docx'

# Load both documents in one pass over their paths
doc1_content, doc2_content = [read_docx(path) for path in (doc1_path, doc2_path)]

# Sanity check: display the character count of each document
(len(doc1_content), len(doc2_content))



(13, 13)

### Preprocessing

In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from docx import Document

# Fetch the NLTK resources the preprocessing step relies on:
#   punkt / punkt_tab -> tokenizer data for word_tokenize (recent NLTK
#                        releases look up 'punkt_tab', older ones 'punkt')
#   stopwords         -> stopwords.words('english')
#   wordnet           -> WordNetLemmatizer
# quiet=True keeps the download log out of the notebook output.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource, quiet=True)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [8]:
def read_docx(file_path):
    """Return the paragraph text of the .docx file at `file_path`, joined by single spaces."""
    # NOTE(review): this repeats the read_docx definition from the Data Collection
    # cell and shadows it when both cells run; consider keeping only one.
    document = Document(file_path)
    return " ".join(paragraph.text for paragraph in document.paragraphs)

# NOTE(review): same paths and reads as in the Data Collection cell; the
# documents are loaded a second time here.
doc1_path, doc2_path = '/content/Doc 1.docx', '/content/Doc 2.docx'

# Raw text of each document, before any preprocessing
doc1_content = read_docx(doc1_path)
doc2_content = read_docx(doc2_path)

def preprocess_text(text):
    """Normalize raw document text into a space-separated string of lemmas.

    Steps, in order: lowercase the text, replace every character that is not
    an ASCII letter, digit or whitespace with a space, tokenize with NLTK's
    `word_tokenize`, drop English stopwords, and lemmatize each remaining
    token with the WordNet lemmatizer.

    Args:
        text: Raw document text.

    Returns:
        The surviving lemmatized tokens joined by single spaces; an empty
        string when no token survives.
    """
    text = text.lower()
    # Substitute a space (rather than nothing) for punctuation so that words
    # separated only by punctuation are not glued together: "well-known"
    # becomes "well known" instead of "wellknown", and a contraction is split
    # into pieces that the stopword filter gets a chance to match.
    # The text is already lowercase, so only a-z needs to be kept.
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    # Filter stopwords first, then lemmatize only the tokens that survive.
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )


In [9]:
# Raw texts in a fixed order, so index i refers to the same document downstream
documents = [doc1_content, doc2_content]

# Cleaned counterpart of each raw text, in the same order
processed_documents = list(map(preprocess_text, documents))

### Embedding Generation

In [14]:
from transformers import AutoTokenizer, AutoModel

# Single source of truth for the checkpoint, so the tokenizer and the model
# cannot be loaded from different checkpoints by mistake.
MODEL_NAME = "bert-base-uncased"

# Example: using a BERT model for embeddings
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

def get_embedding(text):
    """Embed `text` as one vector by mean-pooling the model's last hidden states.

    Args:
        text: The document text to embed. The tokenizer truncates input to
            512 tokens, so only the beginning of a longer document
            contributes to the vector.

    Returns:
        A NumPy array holding the mean of `last_hidden_state` over the
        sequence dimension, squeezed (1-D for a single input string).
    """
    import torch  # local import: the notebook's import cells do not bring torch into scope

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Inference only: disable gradient tracking so no autograd graph is built
    # for a forward pass whose gradients are never used.
    with torch.no_grad():
        outputs = model(**inputs)
    # Take the mean across the sequence length dimension to get a single vector per input document
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# One embedding vector per preprocessed document, in document order
embeddings = list(map(get_embedding, processed_documents))

# Sanity check: the first embedding should be a 1-D vector
print(embeddings[0].shape)



(768,)


### Clustering

In [15]:
from sklearn.cluster import KMeans

# Number of clusters to form. KMeans needs at least this many samples; with
# exactly two distinct documents and two clusters, each document simply ends
# up alone in its own cluster.
n_clusters = 2  # Adjust based on your data and needs

# Fixed seed so the centroid initialisation, and therefore the cluster
# labels, are the same on every run.
RANDOM_STATE = 42

# n_init is passed explicitly because its default differs between
# scikit-learn versions; pinning it keeps results comparable across them.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=RANDOM_STATE)
clusters = kmeans.fit_predict(embeddings)

# Output the cluster assignment for each document
print(clusters)


[0 1]




### Evaluation

- LLM embeddings can be large and resource-intensive to compute; make sure your environment has sufficient memory and compute.
- The choice of model (e.g., BERT, GPT) can be adjusted to your specific needs and the nature of your documents.
- This code provides only a basic framework: no quantitative evaluation of the clusters is performed here, and further customization may be needed depending on your data and requirements.