In [1]:
import numpy as np
import pandas as pd

In [2]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [3]:
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag
from nltk.chunk import RegexpParser
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /Users/mac/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mac/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
def chunk_by_syntax(text):
    sentences = sent_tokenize(text)
    grammar = r"""
      NP: {<DT>?<JJ>*<NN.*>}
      PP: {<IN><NP>}
      VP: {<VB.*><NP|PP|CLAUSE>+$}
      CLAUSE: {<NP><VP>}
    """
    chunker = RegexpParser(grammar)
    chunks = []
    current_chunk = []
    
    for sentence in sentences:
        words = word_tokenize(sentence)
        tagged = pos_tag(words)
        tree = chunker.parse(tagged)
        
        for subtree in tree.subtrees():
            if subtree.label() in ['NP', 'VP', 'PP']:
                current_chunk.append(" ".join([word for word, tag in subtree.leaves()]))
        
        chunks.append(" ".join(current_chunk))
        current_chunk = []
    
    return chunks

def chunk_by_semantic_similarity(text, threshold=0.7):
    sentences = sent_tokenize(text)
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(X)
    
    chunks = []
    current_chunk = [sentences[0]]
    
    for i in range(1, len(sentences)):
        sim = similarity_matrix[i-1][i]
        if sim < threshold:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentences[i]]
        else:
            current_chunk.append(sentences[i])
    
    chunks.append(" ".join(current_chunk))
    
    return chunks

def chunk_by_topic_modeling(list_sentences, n_topics=5, chunk_size=5):
    sentences = sent_tokenize(text)
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(sentences)
    lda = LDA(n_components=n_topics)
    lda.fit(X)
    
    topic_assignments = lda.transform(X).argmax(axis=1)
    chunks = []
    current_chunk = [sentences[0]]
    current_topic = topic_assignments[0]
    
    for i in range(1, len(sentences)):
        if topic_assignments[i] != current_topic or len(current_chunk) >= chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentences[i]]
            current_topic = topic_assignments[i]
        else:
            current_chunk.append(sentences[i])
    
    chunks.append(" ".join(current_chunk))
    
    return chunks

def optimal_contextual_chunking(text, method='semantic', **kwargs):
    if method == 'syntax':
        return chunk_by_syntax(text)
    elif method == 'semantic':
        return chunk_by_semantic_similarity(text, **kwargs)
    elif method == 'topic':
        return chunk_by_topic_modeling(text, **kwargs)
    elif method == 'auto':
        # Automatically determine best chunking strategy
        if len(text) < 500:  # Short conversation
            return chunk_by_syntax(text)
        elif len(text) < 1500:  # Medium conversation
            return chunk_by_semantic_similarity(text, threshold=kwargs.get('threshold', 0.7))
        else:  # Long conversation
            return chunk_by_topic_modeling(text, n_topics=kwargs.get('n_topics', 5), chunk_size=kwargs.get('chunk_size', 5))
    else:
        raise ValueError("Invalid method specified. Choose from 'syntax', 'semantic', 'topic', or 'auto'.")

In [4]:
conversation = """Agent: Good morning! Thank you for calling XYZ Network Services. My name is Sam. How can I assist you today?

Customer: Hi, Sam. I’m really frustrated right now. I’ve been having constant issues with my internet connection for the past week, and to top it off, my bill this month is way higher than usual. I’m not sure what’s going on, but I need this fixed immediately.

Agent: I’m really sorry to hear that you’ve been having such a tough time, especially with the connection and the billing. I can definitely help you out. Let’s start with the internet connection issue. Could you tell me more about the problems you’ve been experiencing?

Customer: The internet keeps dropping every few minutes. Sometimes it’s just slow, but other times, it disconnects entirely. I work from home, so this is a huge problem for me. I can’t afford to keep losing connection during important meetings.

Agent: I completely understand how disruptive that must be, especially when you’re relying on the internet for work. Let’s run through a few troubleshooting steps first. Have you noticed if the connection drops at specific times of the day, or is it random?

Customer: It seems pretty random, honestly. But it does feel like it’s worse in the evenings, which is when I’m usually trying to finish up work.

Agent: That could be due to network congestion, but we’ll check all possibilities. Let’s start by rebooting your router. Have you tried that recently?

Customer: I’ve rebooted it multiple times, but it only helps for a short while. Then the same thing happens again.

Agent: Thank you for trying that already. It sounds like there might be a deeper issue. I’m going to run a diagnostic on your connection from our end. This will take a minute or two. While that’s running, could you also check if all the cables are securely connected to your router and modem?

Customer: Sure, give me a second… (pause) Everything seems fine with the cables.

Agent: Thanks for checking. I’m seeing some irregularities on our end as well. It looks like there’s a signal issue that could be affecting your service. I’ll need to escalate this to our technical team to investigate further. They might need to send out a technician to check the lines outside your home. I can arrange that for you. What time would be convenient for a visit?

Customer: If they could come tomorrow morning, that would be great. I’m just so tired of dealing with this. It’s been affecting my work, and I’ve even had to use my mobile data as a backup, which is why my bill is so high this month!

Agent: I can imagine how frustrating that must be. I’ll schedule the technician for tomorrow morning between 9 and 11 AM. Regarding your bill, let’s take a closer look at that next. You mentioned it was higher than usual—did you see any unexpected charges on it?

Customer: Yeah, I noticed I was charged extra for data usage, which is ridiculous since I’m already paying for unlimited internet.

Agent: I see why that would be concerning. Let me pull up your billing details… (pause) It looks like the extra charges are indeed for mobile data usage. While your home internet plan is unlimited, mobile data incurs additional charges if you exceed your plan’s limit. However, since this was due to the service issues, I can offer you a credit for the extra charges. Does that sound acceptable?

Customer: Yes, that’s fair. But I still feel like I shouldn’t have to deal with these issues in the first place. And this isn’t the first time I’ve had problems with your service. I’m seriously considering switching providers.

Agent: I completely understand your frustration, and I’m sorry that our service hasn’t met your expectations. Your experience is very important to us, and we’d hate to lose you as a customer. If it’s alright with you, I can check if there are any upgrades or promotions available that might improve your service and save you some money.

Customer: Well, I’m open to hearing about what you have to offer, but honestly, I just need reliable internet.

Agent: Reliability is key, and we want to make sure you’re getting the best possible service. Let me check our current offers… (pause) We have an upgraded plan that includes faster internet speeds and a more advanced router, which could help with the connection stability. There’s also a discount available if you bundle this with your mobile plan. Would you like to hear more details?

Customer: Faster speeds sound good, but I’m concerned about the cost. I don’t want to end up paying more than I already am.

Agent: That’s a valid concern. The upgraded plan would be an additional $10 per month, but with the bundle discount, it would actually be $5 less than what you’re paying right now for both services. Plus, with the new router, we might be able to eliminate the connection issues you’ve been facing.

Customer: Okay, that sounds reasonable. But will this really fix the problem? I don’t want to end up in the same situation after upgrading.

Agent: I understand your hesitation. The upgraded router has better range and stability, which should make a noticeable difference. Also, with the technician coming out tomorrow, we’ll be able to address any underlying issues that might be affecting your current setup. If the problems persist even after the upgrade, we have a 30-day satisfaction guarantee, so you can switch back to your previous plan or explore other options without any penalty.

Customer: Alright, let’s go ahead with the upgrade then. I just hope this finally resolves everything.

Agent: I’m confident this will improve your experience, but I’ll also follow up with you personally after the technician’s visit to ensure everything is working as it should. I’ll process the upgrade now… (pause) The upgrade is complete, and you should see the new equipment delivered within the next two days. Is there anything else I can assist you with today?

Customer: Actually, yes. I’ve been with XYZ Network Services for a long time, but I never really looked into my contract details. Can you tell me if I’m locked into any specific term or if I’m free to cancel if things don’t improve?

Agent: Let me pull up your contract details… (pause) You’re currently on a month-to-month plan, which means you’re not locked into a long-term contract. You’re free to cancel at any time with no penalty. However, I hope the changes we’re making will improve your service so you don’t feel the need to switch.

Customer: That’s good to know. I really hope this works out because, to be honest, I don’t have the time or energy to go through the hassle of switching providers.

Agent: I completely understand, and we’ll do everything we can to ensure you don’t have to go through that. Just to recap, we’ve scheduled a technician for tomorrow morning, processed the upgrade to your plan, and applied a credit for the extra charges on your bill. Is there anything else on your mind that I can help with?

Customer: I think that covers everything for now. I’ll wait to see how the technician’s visit goes and how the new setup works out.

Agent: Great! I’ll follow up with you after the technician’s visit to make sure everything is resolved. If you have any more questions or concerns, don’t hesitate to reach out. Thank you for giving us the chance to make things right, and I hope you have a better experience moving forward.

Customer: Thanks, Sam. I appreciate your help. Have a good day.

Agent: You too! Take care."""


In [5]:
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag, ne_chunk
from collections import Counter

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /Users/mac/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mac/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/mac/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /Users/mac/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [8]:
def chunk_by_semantic_similarity(sentences, threshold=0.7):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(X)
    
    chunks = []
    current_chunk = [sentences[0]]
    
    for i in range(1, len(sentences)):
        sim = similarity_matrix[i-1][i]
        if sim < threshold:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentences[i]]
        else:
            current_chunk.append(sentences[i])
    
    chunks.append(" ".join(current_chunk))
    
    return chunks

def chunk_by_noun_verb_ner(chunks):
    refined_chunks = []
    
    def extract_entities(chunk):
        words = word_tokenize(chunk)
        tagged = pos_tag(words)
        entities = [word for word, pos in tagged if pos.startswith('NN') or pos.startswith('VB')]
        return entities
    
    # Process each chunk and compare with the next chunk based on shared entities
    if chunks:
        current_chunk = chunks[0]
        current_entities = set(extract_entities(current_chunk))
    
        for i in range(1, len(chunks)):
            next_chunk = chunks[i]
            next_entities = set(extract_entities(next_chunk))
            
            # If there are shared entities, merge the chunks
#             print(current_entities.intersection(next_entities))
            if len(current_entities.intersection(next_entities))>6:
                current_chunk += " " + next_chunk
                current_entities.update(next_entities)
            else:
                refined_chunks.append(current_chunk)
                current_chunk = next_chunk
                current_entities = next_entities
        
        # Append the last chunk
        refined_chunks.append(current_chunk)
    
    return refined_chunks


def chunk_by_topic_modeling(chunks, n_topics=10, chunk_size=5):
    refined_chunks = []
    n_topics= int(len(chunks)*0.3)
    # Vectorize the chunks
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(chunks)
    
    # Apply LDA for topic modeling
    lda = LDA(n_components=n_topics)
    lda.fit(X)
    
    # Get topic assignments for each chunk
    topic_assignments = lda.transform(X).argmax(axis=1)
    
    current_chunk = [chunks[0]]
    current_topic = topic_assignments[0]
    
    for i in range(1, len(chunks)):
        if topic_assignments[i] != current_topic or len(current_chunk) >= chunk_size:
            refined_chunks.append(" ".join(current_chunk))
            current_chunk = [chunks[i]]
            current_topic = topic_assignments[i]
        else:
            current_chunk.append(chunks[i])
    
    # Append the last chunk
    refined_chunks.append(" ".join(current_chunk))
    
    return refined_chunks

def chunk_by_length(refined_chunks, min_length=100, max_length=600):
    final_chunks = []
    current_chunk = ""

    for chunk in refined_chunks:
        if len(chunk) > max_length:
            if current_chunk:
                final_chunks.append(current_chunk)
                current_chunk = ""

            # Split large chunks directly into smaller pieces
            while len(chunk) > max_length:
                part = chunk[:max_length]
                final_chunks.append(part)
                chunk = chunk[max_length:]

            if len(chunk) > 0:
                current_chunk = chunk
        elif len(chunk) + len(current_chunk) <= max_length:
            # Add to the current chunk if it won't exceed max_length
            current_chunk += " " + chunk if current_chunk else chunk
        else:
            # If the current chunk is too small, append it to the previous chunk
            if len(current_chunk) < min_length and final_chunks:
                final_chunks[-1] += " " + current_chunk
            else:
                final_chunks.append(current_chunk)
            current_chunk = chunk

    # Handle any remaining chunk
    if current_chunk:
        if len(current_chunk) < min_length and final_chunks:
            final_chunks[-1] += " " + current_chunk
        else:
            final_chunks.append(current_chunk)

    return final_chunks


def optimal_sequential_contextual_chunking(text):
    sentences = [x for x in conversation.split('\n') if len(x)>3]
    refined_chunks = chunk_by_noun_verb_ner(sentences)
    refined_chunks = chunk_by_topic_modeling(refined_chunks)
    refined_chunks = chunk_by_semantic_similarity(refined_chunks, threshold=0.6)
    final_chunks = chunk_by_length(refined_chunks)
    
    for i, chunk in enumerate(final_chunks):
        print(f"Chunk {i+1}:\n{chunk}\n")
    
    return final_chunks

In [None]:
sentences = [x for x in conversation.split('\n') if len(x)>3]
refined_chunks = chunk_by_noun_verb_ner(sentences)
refined_chunks = chunk_by_semantic_similarity(refined_chunks, threshold=0.8)
refined_chunks = chunk_by_topic_modeling(refined_chunks)
final_chunks = chunk_by_length(refined_chunks)
for i, chunk in enumerate(final_chunks):
    print(f"Chunk {i+1}:\n{chunk}\n")

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/mac/nltk_data'
    - '/opt/anaconda3/nltk_data'
    - '/opt/anaconda3/share/nltk_data'
    - '/opt/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
