# In [21]:
# Install third-party packages first so the imports below can resolve
# (IPython shell escapes; run inside a notebook).
!pip install sentence-transformers
!pip install spacy
!pip install nltk

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import spacy
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


# Fetch the NLTK corpora requested by this script
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Greek spaCy pipeline used for tokenization and lemmatization.
# It must be installed beforehand: `!python -m spacy download el_core_news_sm`
nlp = spacy.load("el_core_news_sm")

# Stopword vocabularies shipped with NLTK, one set per language
english_stopwords = {*stopwords.words('english')}
greek_stopwords = {*stopwords.words('greek')}

# Extra, hand-picked stopwords (accented and unaccented spellings)
# NOTE(review): "to" is Latin script - confirm it is not meant to be Greek "το"
custom_stopwords = {
    "θέλω", "θελω", "ο", "to",
    "είμαι", 'ειμαι', "επιθυμώ", "να",
    "για", "μου", "και", "πώς",
    "κάνω", "έχω", "μπορώ", 'ένας',
}

# **Step 1: Tokenization + Lemmatization + Stopword Removal Function**
def preprocess_and_lemmatize(text, *, verbose=True):
    """
    Lowercase, tokenize, lemmatize and stopword-filter one utterance.

    The text is lowercased and passed through the module-level spaCy
    pipeline ``nlp``; every token is replaced by its lemma, and lemmas that
    appear in ``english_stopwords``, ``greek_stopwords`` or
    ``custom_stopwords`` are dropped.

    Args:
        text: The utterance to clean. ``None`` and NaN (what pandas yields
            for empty CSV cells) are treated as an empty utterance; any
            other non-string value is converted with ``str()``.
        verbose: When True (the default, matching the historical behavior)
            the intermediate tokens and the stopword sets are printed.
            Pass ``verbose=False`` to silence the output, e.g. when the
            function is applied to a whole DataFrame column.

    Returns:
        str: The surviving lemmas joined by single spaces, or ``""`` when
        the input is missing.
    """
    # Missing values have no .lower(); return early instead of raising
    # AttributeError. `text != text` is True only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Step 1: Convert text to lowercase (str() is a no-op for real strings)
    text = str(text).lower()

    # Step 2: Tokenize the text using SpaCy
    doc = nlp(text)
    if verbose:
        print(f"Original Tokens: {[token.text for token in doc]}")

    # Step 3: Lemmatization - convert each token to its base form (lemma)
    lemmatized_tokens = [token.lemma_ for token in doc]
    if verbose:
        print(f"Lemmatized Tokens: {lemmatized_tokens}")

        # Show the stopword sets the filter below checks against
        print(f"English Stopwords: {english_stopwords}")
        print(f"Greek Stopwords: {greek_stopwords}")
        print(f"Custom Stopwords: {custom_stopwords}")

    # Step 4: Keep only lemmas that are in none of the stopword sets
    filtered_tokens = [
        lemma for lemma in lemmatized_tokens
        if not (
            lemma in english_stopwords
            or lemma in greek_stopwords
            or lemma in custom_stopwords
        )
    ]
    if verbose:
        print(f"Filtered Tokens (No Stopwords): {filtered_tokens}")

    # Join the remaining lemmas back into a single string
    return ' '.join(filtered_tokens)

# **Step 2: Load the dataset**
# Point `file_path` at your own CSV; it must contain an 'Utterance' column
file_path = 'C:/Users/Katerina/Downloads/Customer Utterances.csv'
data = pd.read_csv(file_path)

# DataFrame that the remaining steps read from and add columns to
df = pd.DataFrame(data)

# **Step 3: Apply Tokenization, Lemmatization, and Stopword Removal to the Entire Dataset**
# Clean every value of the 'Utterance' column and store the result as 'Processed'
processed_utterances = df['Utterance'].apply(preprocess_and_lemmatize)
df['Processed'] = processed_utterances

# **Step 4: Display the Processed DataFrame (After Preprocessing)**
print(
    "Processed Data (After Lemmatization and Stopword Removal):",
    df[['Utterance', 'Processed']],
    sep='\n',
)

# **Step 5: Sentence Embeddings using Sentence-BERT**
# Load the multilingual Sentence-BERT checkpoint used to embed the utterances
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

# Encode every preprocessed utterance; `embeddings` feeds all clustering steps
processed_sentences = df['Processed'].tolist()
embeddings = model.encode(processed_sentences)

# **Step 6: Clustering using KMeans (Example: Finding Optimal Clusters)**
# We will use KMeans clustering to group similar queries together.

# **Step 6a: Elbow Method to Find Optimal Number of Clusters**
# KMeans raises ValueError when n_clusters exceeds the number of samples, so
# the sweep is capped at the dataset size instead of always running to 14.
max_elbow_k = min(14, len(embeddings))
elbow_k_values = range(1, max_elbow_k + 1)

inertia = []  # Inertia (within-cluster sum of squares) for each tested k
for k in elbow_k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(embeddings)
    inertia.append(kmeans.inertia_)

# Plot the Elbow Curve to find the optimal k
plt.plot(elbow_k_values, inertia, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method')
plt.show()

# **Step 6b: Silhouette Score for Cluster Evaluation**
# silhouette_score needs between 2 and n_samples - 1 distinct labels, so the
# sweep starts at 2 and is capped one below the dataset size (at most 14).
max_silhouette_k = min(14, len(embeddings) - 1)
silhouette_k_values = range(2, max_silhouette_k + 1)

silhouette_scores = []  # Mean silhouette coefficient for each tested k
for k in silhouette_k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(embeddings)
    score = silhouette_score(embeddings, kmeans.labels_)
    silhouette_scores.append(score)

# Plot Silhouette Scores to evaluate clustering quality
plt.plot(silhouette_k_values, silhouette_scores, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Method')
plt.show()

# **Step 7: Perform KMeans Clustering (Choose the optimal number of clusters)**
num_clusters = 4  # Based on your elbow/silhouette analysis
# fit() returns the estimator itself, so construction and fitting can be chained
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10).fit(embeddings)

# Record which cluster each utterance was assigned to
df['Cluster'] = kmeans.labels_

# **Step 8: Mapping Clusters to Intents (Example)**

# Manual mapping from cluster id (0..3) to a human-readable intent label.
# NOTE(review): KMeans cluster ids are arbitrary - confirm each label against
# the utterances that actually landed in that cluster.
cluster_labels_mapping = dict(enumerate([
    "Payment Inquiry",
    "Account Information",
    "Data Usage Inquiry",
    "Service Cancellation",
]))

# Print the cluster labels and their corresponding intents, one per line
print('\n'.join(
    f"Cluster {cluster_id}: {intent_label}"
    for cluster_id, intent_label in cluster_labels_mapping.items()
))

# **Step 9: Test the Intent Prediction Function**
def predict_intent(query):
    """
    Map a raw user query to an intent label.

    The query is cleaned with ``preprocess_and_lemmatize``, embedded with the
    module-level ``model``, and assigned to a cluster by the fitted
    ``kmeans``; the cluster id is then looked up in
    ``cluster_labels_mapping``.

    Args:
        query: The raw query text.

    Returns:
        The intent label for the predicted cluster, or 'Unknown Intent' when
        the cluster id has no entry in the mapping.
    """
    cleaned_query = preprocess_and_lemmatize(query)

    # encode() takes a batch, so wrap the single query in a list
    query_vector = model.encode([cleaned_query])

    # predict() returns one cluster id per row; there is exactly one row here
    predicted_clusters = kmeans.predict(query_vector)
    return cluster_labels_mapping.get(predicted_clusters[0], 'Unknown Intent')

# Test the intent prediction function
# Interactive loop: classify queries until the user types 'exit' or input ends.
while True:
    try:
        # Strip surrounding whitespace so ' exit ' is still recognised
        query = input("Enter your query (or type 'exit' to quit): ").strip()
    except EOFError:
        # stdin was closed (Ctrl-D or a non-interactive run): stop cleanly
        break
    if query.lower() == 'exit':
        break
    if not query:
        # Nothing to classify; prompt again instead of predicting on ""
        continue
    intent = predict_intent(query)
    print(f"Predicted Intent: {intent}")


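# Notebook output captured from the original run:
# ModuleNotFoundError: No module named 'spacy'
# (likely raised because `import spacy` ran before `!pip install spacy`)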