In [1]:
# Shell escape: download and install spaCy's small English model package
# (en_core_web_sm) so that spacy.load("en_core_web_sm") can find it.
# The installer's own output advises restarting the kernel/runtime afterwards
# if the package's dependencies fail to load.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m61.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
import spacy
import nltk
import re
import string
import pandas as pd
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from nltk.corpus import stopwords
from transformers import pipeline

# Fetch the NLTK stopword corpus so stopwords.words() below has data to read
nltk.download('stopwords')
# English stopwords kept as a set; preprocess_text tests token membership against it
stop_words = set(stopwords.words('english'))


# Shared spaCy English pipeline, loaded once here and used by preprocess_text for tokenization
nlp = spacy.load("en_core_web_sm")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
def preprocess_text(text):
    """Normalise a text: strip punctuation, lowercase, tokenize, drop stopwords.

    Punctuation characters are replaced by a space rather than deleted, so
    words separated only by punctuation ("refund.Please", "well-known") remain
    separate tokens instead of being fused into one ("refundplease",
    "wellknown").

    Args:
        text: The raw input string.

    Returns:
        A single string with the kept tokens joined by single spaces. A token
        is kept when it is purely alphabetic and not in the module-level
        ``stop_words`` set; all tokens are lowercase.
    """
    # Map every ASCII punctuation character to a space in a single pass
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    text = text.translate(punctuation_to_space)
    # Tokenization using spaCy on the lowercased text
    doc = nlp(text.lower())
    # Keep alphabetic, non-stopword tokens (whitespace tokens fail is_alpha and are dropped)
    tokens = [token.text for token in doc if token.is_alpha and token.text not in stop_words]
    return ' '.join(tokens)

def clean_data(data):
    """Run ``preprocess_text`` over every item of ``data``.

    Args:
        data: Iterable of raw text strings.

    Returns:
        A list of preprocessed strings, in the same order as the input.
    """
    return list(map(preprocess_text, data))


In [4]:
# Zero-shot classification pipeline using the pre-trained facebook/bart-large-mnli
# checkpoint; built once at module level and reused by classify_intent
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

def classify_intent(text, candidate_labels):
    """Pick the most likely intent for ``text`` among ``candidate_labels``.

    Args:
        text: The message to classify.
        candidate_labels: The intent names the classifier may choose from.

    Returns:
        The first entry of the classifier's ``'labels'`` result, i.e. the
        top predicted label.
    """
    predicted_labels = classifier(text, candidate_labels)['labels']
    # The first label is the classifier's top prediction
    return predicted_labels[0]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


In [5]:
def topic_modeling(texts, num_topics=5, num_top_words=10):
    """Fit an LDA topic model on ``texts`` and return the top words per topic.

    Args:
        texts: Iterable of document strings.
        num_topics: Number of LDA components (topics) to fit.
        num_top_words: How many of the highest-weighted words to report for
            each topic. Defaults to 10, the value that was previously
            hard-coded.

    Returns:
        A dict mapping "Topic 1", "Topic 2", ... to a list of words ordered
        from highest to lowest topic weight. A list is shorter than
        ``num_top_words`` when the fitted vocabulary has fewer terms.

    Raises:
        ValueError: If ``num_top_words`` is negative.
    """
    if num_top_words < 0:
        raise ValueError("num_top_words must be non-negative")

    # Vocabulary filter: ignore terms appearing in more than 95% of documents
    # or in fewer than 2 documents, plus scikit-learn's English stop words
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
    tfidf = tfidf_vectorizer.fit_transform(texts)

    # NOTE(review): LDA is fitted on TF-IDF weights here; scikit-learn's LDA
    # examples feed it raw term counts (CountVectorizer) -- consider switching.
    lda = LDA(n_components=num_topics, random_state=42)
    lda.fit(tfidf)

    # Collect the top words for each topic
    feature_names = tfidf_vectorizer.get_feature_names_out()
    topics = {}
    for idx, topic in enumerate(lda.components_, start=1):
        # argsort is ascending, so reverse it to put the heaviest terms first
        top_indices = topic.argsort()[::-1][:num_top_words]
        topics[f"Topic {idx}"] = [feature_names[i] for i in top_indices]
    return topics


In [6]:
def generate_word_cloud(texts, output_path="wordcloud.png"):
    """Generate a word cloud from the given texts and optionally save it.

    Args:
        texts: Iterable of strings; they are joined with single spaces into
            one text before the cloud is generated.
        output_path: File path the image is written to. Defaults to
            "wordcloud.png" (the previously hard-coded location). Pass
            ``None`` to skip writing a file.

    Returns:
        The generated ``WordCloud`` object (800x400, white background).
    """
    text = ' '.join(texts)
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    if output_path is not None:
        wordcloud.to_file(output_path)  # Save the word cloud image
    return wordcloud


In [7]:
def replace_patterns(text, patterns_dict):
    """Apply every regex substitution in ``patterns_dict`` to ``text``.

    Keys are used as regular-expression patterns and values as their
    replacements. Substitutions run one after another in the dict's
    iteration order, so a later pattern also sees the output of earlier ones.

    Args:
        text: The input string.
        patterns_dict: Mapping of regex pattern -> replacement.

    Returns:
        The text after all substitutions have been applied.
    """
    result = text
    for regex, substitute in patterns_dict.items():
        result = re.compile(regex).sub(substitute, result)
    return result


In [8]:
def extract_sales_insights(texts, keywords=None):
    """Select the texts that mention at least one sales-related keyword.

    Matching is a case-insensitive substring test, so "What is the PRICE?"
    matches the keyword "price" (a case-sensitive test would miss it).

    Args:
        texts: Iterable of text strings to scan.
        keywords: Optional iterable of keywords to look for. When omitted,
            the default sales keywords are used: purchase, buy, price, cost,
            offer, deal.

    Returns:
        A list of the original (unmodified) texts that contain at least one
        keyword, in input order.
    """
    if keywords is None:
        keywords = ('purchase', 'buy', 'price', 'cost', 'offer', 'deal')
    lowered_keywords = [keyword.lower() for keyword in keywords]

    sales_texts = []
    for text in texts:
        # Lowercase once per text instead of once per keyword
        lowered_text = text.lower()
        if any(keyword in lowered_text for keyword in lowered_keywords):
            sales_texts.append(text)
    return sales_texts


In [9]:
def main():
    """Run the whole demo on a small hard-coded set of customer messages.

    Steps, in order: clean the messages, classify each one's intent, fit
    topics, build and show a word cloud, apply pattern replacements, and
    filter the sales-related messages. Each result is printed.
    """
    # Demo input: raw customer messages
    sample_data = [
        "How much does the product cost? I want to make a purchase.",
        "Can I get a discount on the service?",
        "The service is not working. I need help with a refund.",
        "What are the shipping fees for this item?",
        "I have a complaint about my order."
    ]

    # Every later step works on the preprocessed messages
    cleaned_data = clean_data(sample_data)

    # Intent per cleaned message
    candidate_labels = ["purchase", "refund", "support", "inquiry"]
    intents = []
    for message in cleaned_data:
        intents.append(classify_intent(message, candidate_labels))
    print("Classified Intents:", intents)

    # Topics
    print("Identified Topics:", topic_modeling(cleaned_data))

    # Word cloud: generated (and saved by the helper), then opened as an image
    generate_word_cloud(cleaned_data).to_image().show()

    # Regex substitutions applied to each cleaned message
    patterns = {"purchase": "buy", "cost": "price"}
    replaced_text = []
    for message in cleaned_data:
        replaced_text.append(replace_patterns(message, patterns))
    print("Text After Pattern Replacement:", replaced_text)

    # Messages mentioning sales keywords
    print("Sales-related Insights:", extract_sales_insights(cleaned_data))

# Run the demo only when this code is executed directly, not when it is imported
if __name__ == "__main__":
    main()


Classified Intents: ['purchase', 'refund', 'refund', 'purchase', 'inquiry']
Identified Topics: {'Topic 1': ['service'], 'Topic 2': ['service'], 'Topic 3': ['service'], 'Topic 4': ['service'], 'Topic 5': ['service']}
Text After Pattern Replacement: ['much product price want make buy', 'get discount service', 'service working need help refund', 'shipping fees item', 'complaint order']
Sales-related Insights: ['much product cost want make purchase']
