In [1]:
import nltk
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from gensim.models import Word2Vec

In [2]:
# Load the preprocessed corpus from its Excel workbook into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# cell only runs where that exact folder exists - consider making it configurable.
PREPROCESSED_XLSX = r'C:\College\sem6\NLP\project\preprocessed_text.xlsx'
df = pd.read_excel(PREPROCESSED_XLSX)

In [3]:
def compute_tfidf_matrix(docs):
    """
    Build the TF-IDF representation of a collection of documents.

    A fresh TfidfVectorizer with default settings is fitted on `docs` and the
    same documents are transformed in a single step. Only the resulting matrix
    is returned; the fitted vectorizer itself is discarded.
    """
    return TfidfVectorizer().fit_transform(docs)

In [4]:
def compute_word_embeddings(docs, model):
    """
    Compute one averaged word embedding per document.

    Parameters
    ----------
    docs : iterable
        Documents to embed. Entries that are not strings (for example missing
        values) are treated as documents without any known word.
    model : word-embedding model
        Either a mapping-style vector store supporting ``word in model`` and
        ``model[word]``, or an object that exposes such a store as
        ``model.wv`` (e.g. a trained ``Word2Vec`` instance). It must provide
        a ``vector_size`` attribute.

    Returns
    -------
    list of numpy.ndarray
        One 1-D vector of length ``model.vector_size`` per document, in input
        order. Documents with no in-vocabulary word get an all-zero vector.
    """
    # A full Word2Vec model keeps its vectors under `.wv`; a bare vector store
    # is used as-is, so both kinds of "pre-trained model" are accepted.
    keyed_vectors = getattr(model, 'wv', model)

    embeddings = []

    for doc in docs:

        # Tokenize the document with NLTK; non-string entries yield no tokens
        # instead of crashing the tokenizer.
        words = nltk.word_tokenize(doc) if isinstance(doc, str) else []

        # Keep only the vectors of words present in the model's vocabulary.
        word_vectors = [keyed_vectors[word] for word in words if word in keyed_vectors]

        if word_vectors:
            # Element-wise mean over the word vectors of this document.
            embeddings.append(np.mean(word_vectors, axis=0))
        else:
            # Use a zero *array* (not a Python list) so every entry of the
            # result has the same type and shape.
            embeddings.append(np.zeros(model.vector_size))

    return embeddings

In [5]:
def compute_cosine_similarity(matrix):
    """
    Return the cosine similarities of `matrix` compared against itself.

    The same matrix is passed as both operands, so the result relates every
    row of the input to every other row (and to itself).
    """
    return cosine_similarity(matrix, matrix)

In [6]:
def apply_clustering(matrix, num_clusters, random_state=42):
    """
    Apply KMeans clustering to the input matrix.

    Parameters
    ----------
    matrix : array-like or sparse matrix
        One row per sample to cluster.
    num_clusters : int
        Number of clusters KMeans should form.
    random_state : int or None, default 42
        Seed forwarded to KMeans. The default reproduces the previously
        hard-coded seed, so existing calls behave exactly as before.

    Returns
    -------
    The cluster index KMeans assigned to each row of `matrix`.
    """
    # Initialize KMeans with an explicit seed so repeated runs are comparable
    kmeans = KMeans(n_clusters=num_clusters, random_state=random_state)

    # Fit on the data and return the cluster assigned to each data point
    return kmeans.fit_predict(matrix)

In [7]:
def assign_categories(docs, clusters):
    """
    Pair each document with its cluster id and a human-readable category.

    Returns a DataFrame with the columns 'Document', 'Cluster' and 'Category'.
    Cluster ids 0-8 are translated through a fixed lookup; any other id has no
    entry in the lookup and therefore ends up with a missing 'Category'.
    """
    # Position in this list == cluster id the label belongs to
    category_names = ['ham', 'promotional', 'educational', 'financial', 'job',
                      'account verification', 'shopping', 'rate experience',
                      'miscellaneous']
    cluster_to_category = dict(enumerate(category_names))

    # Table holding the document text next to its assigned cluster
    labelled = pd.DataFrame({'Document': docs, 'Cluster': clusters})

    # Append the 'Category' column derived from the cluster ids
    return labelled.assign(Category=labelled['Cluster'].map(cluster_to_category))

In [10]:
# define number of clusters (assign_categories has labels for cluster ids 0-8)
num_clusters = 9

# Build the document-similarity matrix consumed by the clustering step, so the
# cell also works after Restart Kernel -> Run All.
# NOTE(review): no visible cell defines `similarity_matrix`. TF-IDF is used
# here because it is the only feature path in this notebook that needs no
# externally loaded embedding model - confirm this matches the original run.
tfidf_matrix = compute_tfidf_matrix(df['Text'])
similarity_matrix = compute_cosine_similarity(tfidf_matrix)

# apply clustering and assign categories based on clusters
clusters = apply_clustering(similarity_matrix, num_clusters)
results = assign_categories(df['Text'], clusters)



In [16]:
# Save the labelled documents to an Excel workbook; the DataFrame index is
# left out of the file.
results.to_excel(excel_writer="label_on_text.xlsx", index=False)