In [None]:
import spacy
import pandas as pd
from textblob import TextBlob
import re
import logging
import sys
from pathlib import Path

In [None]:
# Configure root logging (INFO and above, "timestamp - level - message" records)
# and create the logger shared by every cell below.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

In [None]:
# Load the small English spaCy pipeline once; every NLP helper below reuses `nlp`.
# spacy.load raises OSError when the model package is not installed, in which
# case the problem is logged with install instructions and execution stops.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    logger.error(
        "spaCy model 'en_core_web_sm' not found. "
        "Please install it using: python -m spacy download en_core_web_sm"
    )
    sys.exit(1)

In [None]:
# Sample customer support tickets: one dict per ticket, holding an integer
# "ticket_id" and the free-text "description" that the NLP helpers analyse.
tickets = [
    {
        "ticket_id": 1,
        "description": "Ugh, fine. You need a return label for that stupid product? Here it is. Just take it",
    },
    {
        "ticket_id": 2,
        "description": "Thank you for the quick response. I'm satisfied with the service.",
    },
    {
        "ticket_id": 3,
        "description": "The website is not loading properly. Please fix it ASAP.",
    },
    {
        "ticket_id": 4,
        "description": "I need to return a product. Can you send me the return label?",
    },
    {
        "ticket_id": 5,
        "description": "Ugh, seriously? This is the best you could come up with?",
    },
]

In [None]:
# Define NLP processing functions
def clean_text(text):
    """
    Normalizes a piece of text for downstream text analytics.

    The text is lowercased and passed through the module-level spaCy pipeline
    (``nlp``). Stop words, punctuation and whitespace-only tokens are dropped,
    and the lemmas of the remaining tokens are joined with single spaces.

    Args:
        text: The raw text to clean. Values that are not ``str`` are rejected.

    Returns:
        str: The space-separated lemmas. An empty string is returned when
        ``text`` is not a string (a warning is logged) or when processing
        raises an exception (the error is logged, not re-raised).
    """
    try:
        if not isinstance(text, str):
            logger.warning(f"Invalid text input: {text}. Returning empty string.")
            return ""
        doc = nlp(text.lower())
        # spaCy emits runs of extra spaces and newlines as tokens of their own.
        # They are neither stop words nor punctuation, so they must be filtered
        # explicitly or they end up in the joined result as stray whitespace.
        cleaned = [
            token.lemma_
            for token in doc
            if not (token.is_stop or token.is_punct or token.is_space)
        ]
        return " ".join(cleaned)
    except Exception as e:
        logger.error(f"Error cleaning text: {str(e)}")
        return ""

In [None]:
def extract_entities(text):
    """
    Runs spaCy named-entity recognition over the given text.

    Returns a list of (entity text, entity label) tuples. The list is empty
    when the input is not a string (a warning is logged) or when processing
    raises an exception (the error is logged).
    """
    try:
        if isinstance(text, str):
            found = []
            for ent in nlp(text).ents:
                found.append((ent.text, ent.label_))
            return found
        logger.warning(f"Invalid text input for entity extraction: {text}. Returning empty list.")
        return []
    except Exception as err:
        logger.error(f"Error extracting entities: {str(err)}")
        return []

In [None]:
def get_sentiment(text):
    """
    Scores the sentiment polarity of the given text with TextBlob.

    Returns the polarity as a float from -1 (negative) to 1 (positive).
    Falls back to 0.0 when the input is not a string (a warning is logged)
    or when TextBlob raises an exception (the error is logged).
    """
    try:
        if isinstance(text, str):
            return TextBlob(text).sentiment.polarity
        logger.warning(f"Invalid text input for sentiment analysis: {text}. Returning 0.0.")
        return 0.0
    except Exception as err:
        logger.error(f"Error computing sentiment: {str(err)}")
        return 0.0

In [None]:
def get_sentiment_label(score):
    """
    Maps a sentiment score onto one of three labels.

    Scores above zero are "Positive", scores below zero are "Negative", and
    everything else is "Neutral". If the score cannot be compared with zero,
    the error is logged and "Neutral" is returned.
    """
    try:
        if score > 0:
            label = "Positive"
        else:
            label = "Negative" if score < 0 else "Neutral"
    except Exception as err:
        logger.error(f"Error labeling sentiment: {str(err)}")
        label = "Neutral"
    return label


In [None]:
def extract_order_number(text):
    """
    Pulls the first order number out of a piece of text.

    An order number is taken to be a '#' immediately followed by one or more
    digits. Returns the digits (without the '#') as a string, or None when no
    such pattern is present, the input is not a string (a warning is logged),
    or an exception occurs (the error is logged).
    """
    try:
        if isinstance(text, str):
            found = re.search(r"#(\d+)", text)
            if found is None:
                return None
            return found.group(1)
        logger.warning(f"Invalid text input for order number extraction: {text}. Returning None.")
        return None
    except Exception as err:
        logger.error(f"Error extracting order number: {str(err)}")
        return None

In [None]:
def main():
    """
    Runs the end-to-end enrichment of the sample ticket dataset.

    Builds a DataFrame from ``tickets``, adds the cleaned description,
    entities, sentiment score, sentiment label and order number columns,
    prints the result, and writes it to ``enriched_tickets.csv`` in the
    current working directory. Any exception is logged and the process
    exits with status 1.
    """
    try:
        logger.info("Creating DataFrame from sample dataset")
        df = pd.DataFrame(tickets)

        logger.info("Applying NLP processing to dataset")
        # (new column, source column, transform). Order matters:
        # 'sentiment_label' reads the 'sentiment_score' column added just before it.
        enrichment_steps = (
            ('cleaned_description', 'description', clean_text),
            ('entities', 'description', extract_entities),
            ('sentiment_score', 'description', get_sentiment),
            ('sentiment_label', 'sentiment_score', get_sentiment_label),
            ('order_number', 'description', extract_order_number),
        )
        for target, source, transform in enrichment_steps:
            df[target] = df[source].apply(transform)

        logger.info("Displaying enriched dataset")
        print("\nEnriched Dataset:")
        print(df)

        # Persist the enriched frame next to the notebook, without the index column.
        output_path = Path("enriched_tickets.csv")
        logger.info(f"Saving enriched dataset to {output_path}")
        df.to_csv(output_path, index=False)
        logger.info(f"Dataset successfully saved to {output_path}")

    except Exception as err:
        logger.error(f"Error in main processing: {str(err)}")
        sys.exit(1)

if __name__ == "__main__":
    main()


Enriched Dataset:
   ticket_id                                        description  \
0          1  Ugh, fine. You need a return label for that st...   
1          2  Thank you for the quick response. I'm satisfie...   
2          3  The website is not loading properly. Please fi...   
3          4  I need to return a product. Can you send me th...   
4          5  Ugh, seriously? This is the best you could com...   

                         cleaned_description       entities  sentiment_score  \
0  ugh fine need return label stupid product             []        -0.191667   
1     thank quick response satisfied service             []         0.416667   
2             website load properly fix asap  [(ASAP, ORG)]         0.000000   
3      need return product send return label             []         0.000000   
4                    ugh seriously good come             []         0.333333   

  sentiment_label order_number  
0        Negative         None  
1        Positive         None 