# TF-IDF

## Text Preprocessing

In [8]:
import pandas as pd
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources the preprocessing step relies on:
#   - 'punkt' / 'punkt_tab': tokenizer models used by word_tokenize. Recent
#     NLTK releases look up 'punkt_tab' instead of the pickled 'punkt' model,
#     so both are requested to keep the notebook runnable on old and new
#     installations.
#   - 'stopwords': the English stop-word list.
#   - 'wordnet': the data behind WordNetLemmatizer.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Read the CSV file into a DataFrame (path is relative to the notebook's
# working directory)
dataset = pd.read_csv("customer_complaints_1.csv")

# Access the "text" column; every later step works on this Series
text_column = dataset["text"]

# Shared helpers for preprocess_text. They are built once and reused because
# the function is applied to every row of the DataFrame; rebuilding the
# stop-word set and the lemmatizer per row is pure overhead.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_STOP_WORDS = None
_LEMMATIZER = None


def _get_stop_words():
    """Return the English stop-word set, loading it from NLTK on first use."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def _get_lemmatizer():
    """Return a shared WordNetLemmatizer, creating it on first use."""
    global _LEMMATIZER
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()
    return _LEMMATIZER


# Function to preprocess text
def preprocess_text(text):
    """Turn one raw document into a list of cleaned, lemmatized tokens.

    Steps, in order: strip ASCII punctuation, lowercase, drop digit
    characters, tokenize with NLTK, remove English stop words, lemmatize.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value (for example the NaN that
        pandas uses for an empty CSV cell, or None) is treated as an empty
        document.

    Returns
    -------
    list of str
        The remaining tokens; an empty list when there is nothing left
        after cleaning.
    """
    # Missing cells arrive as float NaN / None and have no .translate();
    # treat them as empty documents instead of crashing the whole .apply().
    if not isinstance(text, str):
        return []

    # Remove punctuations
    # NOTE(review): because apostrophes are stripped before stop-word
    # filtering, contractions such as "I'm" / "I've" become "im" / "ive" and
    # survive the filter (visible in the sample output). Confirm whether that
    # is acceptable before changing the step order.
    text = text.translate(_PUNCTUATION_TABLE)
    # Convert to lowercase
    text = text.lower()
    # Remove numbers (str.isdigit also matches non-ASCII digit characters)
    text = ''.join(ch for ch in text if not ch.isdigit())

    # Nothing left to tokenize
    if not text.strip():
        return []

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords, then lemmatize what remains
    stop_words = _get_stop_words()
    lemmatizer = _get_lemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

# Run the cleaning pipeline over every entry of the "text" column; the result
# is a Series holding one token list per document
preprocessed_text_column = text_column.apply(func=preprocess_text)

# Sanity check: show the first five preprocessed entries
print(preprocessed_text_column.head(n=5))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ikmalkamil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ikmalkamil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ikmalkamil/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0    [used, love, comcast, constant, update, intern...
1    [im, comcast, worst, internet, provider, im, t...
2    [could, give, negative, star, star, review, wo...
3    [ive, worst, experience, far, since, install, ...
4    [check, contract, sign, comcast, advertised, o...
Name: text, dtype: object


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Re-join each token list into one space-separated string so the documents
# can be handed to the vectorizer as plain text
preprocessed_text = preprocessed_text_column.apply(' '.join)

# Vectorizer with scikit-learn's default settings
vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights, and build the matrix, in one call
X = vectorizer.fit_transform(preprocessed_text)

# X now holds the TF-IDF matrix used by the clustering step

In [20]:
from sklearn.cluster import KMeans
from tabulate import tabulate

# Define the number of clusters
k = 2

# Fixed seed: KMeans picks its initial centroids at random, so without it the
# cluster assignments (and every number derived from them) can change on each
# Restart & Run All.
RANDOM_STATE = 42

# Initialize KMeans with the number of clusters. n_init is pinned explicitly
# because its default differs between scikit-learn releases.
km = KMeans(n_clusters=k, n_init=10, random_state=RANDOM_STATE)

# Fit KMeans to the TF-IDF matrix and get the cluster of each document
y_pred = km.fit_predict(X)

# Display the document and its predicted cluster in a table
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([doc, cluster] for doc, cluster in zip(preprocessed_text, y_pred))
print(tabulate(table_data, headers="firstrow"))

# Print top terms per cluster
print("\nTop terms per cluster:")
# For each centroid, feature indices sorted by weight, largest first
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(k):
    print("Cluster %d:" % i)
    # The ten highest-weighted terms of this centroid
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])
    print()


Document                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

In [21]:
from collections import Counter

# Calculate purity
def purity_score(labels_true, labels_pred):
    """Return the purity of a clustering against ground-truth labels.

    Each cluster is credited with the size of its most frequent true label;
    purity is the sum of those counts divided by the number of samples.

    Parameters
    ----------
    labels_true : iterable
        Ground-truth label of each sample.
    labels_pred : iterable
        Predicted cluster of each sample, in the same order.

    Returns
    -------
    float
        Purity in the range (0, 1].

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    labels_true = list(labels_true)
    labels_pred = list(labels_pred)
    if len(labels_true) != len(labels_pred):
        raise ValueError("labels_true and labels_pred must have the same length")
    if not labels_pred:
        raise ValueError("purity is undefined for an empty clustering")

    # Count every (cluster, true label) pair, then keep the largest count
    # seen for each cluster.
    pair_counts = Counter(zip(labels_pred, labels_true))
    majority_per_cluster = {}
    for (cluster, _true_label), count in pair_counts.items():
        if count > majority_per_cluster.get(cluster, 0):
            majority_per_cluster[cluster] = count
    return sum(majority_per_cluster.values()) / len(labels_pred)


# Name of the column in `dataset` that holds the ground-truth class of each
# document, or None when the data has no such column. Purity can only be
# computed against ground truth.
TRUE_LABEL_COLUMN = None

total_samples = len(y_pred)
# Number of documents assigned to each predicted cluster
cluster_label_counts = [Counter(y_pred)]
# Fraction of all documents that fall into the biggest cluster. This is what
# the cell used to report as "Purity"; it uses no true labels, so it only
# describes how balanced the clusters are.
largest_cluster_share = max(cluster_label_counts[0].values()) / total_samples

if TRUE_LABEL_COLUMN is not None:
    # Rows of `dataset` and entries of y_pred are in the same order
    purity = purity_score(dataset[TRUE_LABEL_COLUMN], y_pred)
    print("Purity:", purity)
else:
    # `purity` stays bound to the legacy value so later cells that read it
    # keep working, but the printed label no longer claims it is purity.
    purity = largest_cluster_share
    print("Largest-cluster share (not purity; no ground-truth labels configured):", purity)

Purity: 0.8421052631578947
