In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from collections import Counter
from sklearn.model_selection import train_test_split
import re

In [None]:
# Locations of the raw train / test CSV files
train_path = "customer_service/main/train.csv"
test_path =  "customer_service/main/test.csv"

# Load both splits into DataFrames (train first, then test)
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [None]:
# Preview the first rows of the train data.
# The bare expression (instead of print) lets the notebook render the frame with its rich table display.
df_train.head()

In [None]:
# Preview the first rows of the test data.
# The bare expression (instead of print) lets the notebook render the frame with its rich table display.
df_test.head()

In [None]:
# Summarise the train data: column names, dtypes, non-null counts and the number of rows/columns
df_train.info()

In [None]:
# Summarise the test data: column names, dtypes, non-null counts and the number of rows/columns
df_test.info()

In [None]:
# Count missing values per column in the train data
df_train.isnull().sum()

In [None]:
# Count missing values per column in the test data
df_test.isnull().sum()

In [None]:
# Count rows in the train data that exactly repeat an earlier row (all columns compared)
df_train.duplicated().sum()

In [None]:
# Count rows in the test data that exactly repeat an earlier row (all columns compared)
df_test.duplicated().sum()

In [None]:
# Descriptive statistics for the train data (pandas `describe()` with its default column selection)
df_train.describe()

In [None]:
# Descriptive statistics for the test data (pandas `describe()` with its default column selection)
df_test.describe()

In [None]:
# Sentiment distribution in the training data.
# The title names the split so this figure cannot be confused with the test-data one.
ax = sns.countplot(data=df_train, x='customer_sentiment')
ax.set_title('Distribution of Customer Sentiment (Train Data)')
plt.show()

# Relative frequency of each sentiment class (fractions summing to 1)
df_train['customer_sentiment'].value_counts(normalize=True)

In [None]:
# Sentiment distribution in the test data.
# The title names the split so this figure cannot be confused with the train-data one.
ax = sns.countplot(data=df_test, x='customer_sentiment')
ax.set_title('Distribution of Customer Sentiment (Test Data)')
plt.show()

# Relative frequency of each sentiment class (fractions summing to 1)
df_test['customer_sentiment'].value_counts(normalize=True)

In [None]:
# Visualization of the length of the conversations in the train data.
# `.str.len()` gives the character count per conversation and yields NaN (instead of raising)
# for a missing conversation.
df_train['conversation_length'] = df_train['conversation'].str.len()
ax = sns.histplot(df_train['conversation_length'], bins=30, kde=True)
# Name the split in the title so the train and test figures are distinguishable.
ax.set_title('Distribution of Conversation Lengths in Train Data (Character Count)')
plt.show()

In [None]:
# Visualization of the length of the conversations in the test data.
# `.str.len()` gives the character count per conversation and yields NaN (instead of raising)
# for a missing conversation.
df_test['conversation_length'] = df_test['conversation'].str.len()
ax = sns.histplot(df_test['conversation_length'], bins=30, kde=True)
# Name the split in the title so the train and test figures are distinguishable.
ax.set_title('Distribution of Conversation Lengths in Test Data (Character Count)')
plt.show()

In [None]:
# Count plot of every categorical feature in the train data, with bars split by customer sentiment
categorical_columns = ['issue_area', 'product_category', 'issue_complexity', 'agent_experience_level']

for col in categorical_columns:
    # One 10x6 figure per feature, drawn on an explicit Axes
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=df_train, x=col, hue='customer_sentiment', ax=ax)
    ax.set_title(f'Distribution of {col} by Sentiment')
    # Tilt the category labels so long names do not overlap
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# Function to perform chi-square test for each categorical column to see whether there are correlations between the categorical features and the customer_sentiment
def chi_square_test(col, data=None, target='customer_sentiment'):
    """Run a chi-square test of independence between ``col`` and ``target``.

    Parameters
    ----------
    col : str
        Name of the categorical column to test.
    data : pandas.DataFrame, optional
        Frame holding both columns. Defaults to the notebook-level ``df_train``,
        which keeps the original single-argument call working.
    target : str, optional
        Name of the column to test against. Defaults to ``'customer_sentiment'``.

    Returns
    -------
    tuple
        ``(chi2, p)``: the test statistic and its p-value as returned by
        ``scipy.stats.chi2_contingency``.
    """
    frame = df_train if data is None else data
    # Observed frequencies: one row per category of `col`, one column per target class
    contingency_table = pd.crosstab(frame[col], frame[target])
    # Degrees of freedom and expected frequencies are also returned but not needed here
    chi2, p, _dof, _expected = chi2_contingency(contingency_table)
    return chi2, p

# Map each categorical feature name to its (chi2 statistic, p-value) pair
chi2_results = {col: chi_square_test(col) for col in categorical_columns}

chi2_results

Based on the chi-square results, there appears to be a statistically significant association between issue_area and customer_sentiment. The association between agent_experience_level and customer_sentiment is borderline, since its p-value is very close to the 0.05 threshold. There appears to be no significant association between customer_sentiment and either product_category or issue_complexity.

In [None]:
# Join every training conversation into one string and count its whitespace-delimited tokens
text = " ".join(df_train['conversation'])
word_counts = Counter(text.split())
most_common_words = word_counts.most_common(15)

# Report the 15 highest counts, one "token: count" pair per line
print("Most 15 frequent words:")
for token, occurrences in most_common_words:
    print(token, occurrences, sep=": ")

In [None]:
# Selecting the relevant columns for the sentiment-analysis task, splitting the train data into
# training and validation, and saving the new train-val-test data files.
# `.copy()` makes these independent frames, so adding columns to them later does not
# trigger pandas' SettingWithCopyWarning.
df_train_final = df_train[['conversation', 'customer_sentiment']].copy()
df_test_final = df_test[['conversation', 'customer_sentiment']].copy()

# 80/20 split with a fixed seed, stratified on the label so both parts keep the class balance.
# The stratify labels come from the same frame that is being split.
X_train, X_val, y_train, y_val = train_test_split(
    df_train_final['conversation'], df_train_final['customer_sentiment'],
    test_size=0.2, random_state=42, stratify=df_train_final['customer_sentiment'])

df_train_final = pd.DataFrame({'conversation': X_train, 'customer_sentiment': y_train})
df_val_final = pd.DataFrame({'conversation': X_val, 'customer_sentiment': y_val})

# Sanity check: the split must neither drop nor duplicate rows
assert len(df_train_final) + len(df_val_final) == len(df_train)

df_train_final.to_csv("customer_service/conversationOnly/train_final.csv", index=False)
df_val_final.to_csv("customer_service/conversationOnly/val_final.csv", index=False)
df_test_final.to_csv("customer_service/conversationOnly/test_final.csv", index=False)

In [None]:
# Attach a character-count column to each of the three splits
for split_frame in (df_train_final, df_val_final, df_test_final):
    split_frame.loc[:, 'conversation_length'] = split_frame['conversation'].apply(len)

# Visualization of the length of the conversations in the new train, validation and test data,
# in that order (30-bin histogram with a KDE overlay for each)
for split_frame, split_label in (
    (df_train_final, 'Train'),
    (df_val_final, 'Validation'),
    (df_test_final, 'Test'),
):
    sns.histplot(split_frame['conversation_length'], bins=30, kde=True)
    plt.title(f'Distribution of Conversation Lengths in {split_label} Data (Character Count)')
    plt.show()


In [None]:
# Preprocessing data
def order_number_pattern(sentence):
    """Search ``sentence`` for an order-number token.

    An order number is the letters ``bb`` immediately followed by one or more
    digits, standing as a whole word (e.g. ``bb123456``). The match is
    case-sensitive, so callers are expected to lowercase the text first.

    Returns the ``re.Match`` for the first occurrence, or ``None`` if there is none.
    """
    order_pattern = r'\bbb\d+\b'
    return re.search(order_pattern, sentence)


def preprocess_conversation(text):
    """Reduce a raw conversation transcript to the customer's own words.

    Steps:
      1. Lowercase the text.
      2. Remove ``[...]`` and ``(...)`` annotations that open and close on the same line.
      3. Keep only the lines that start with ``customer:`` and strip that speaker prefix.
      4. Drop every customer turn that is empty or mentions an order number.
      5. Join the remaining turns with single spaces.

    Parameters
    ----------
    text : str
        Transcript with one speaker turn per line. Any non-string value
        (e.g. NaN from a missing CSV field) is treated as an empty conversation.

    Returns
    -------
    str
        The cleaned customer text; ``""`` when nothing remains.
    """
    # Missing values read from CSV arrive as float NaN; there is nothing to clean
    if not isinstance(text, str):
        return ""

    # Lowercase the entire text
    text = text.lower()

    # Remove any text within square brackets and parentheses. The match is confined to a
    # single line so an unbalanced bracket (e.g. a ":(" emoticon) cannot swallow later turns.
    text = re.sub(r'\[[^\]\n]*\]', '', text)
    text = re.sub(r'\([^)\n]*\)', '', text)

    speaker_prefix = "customer:"
    customer_lines = []
    for line in text.split("\n"):
        line = line.strip()

        # Keep only the lines that start with "customer:"
        if not line.startswith(speaker_prefix):
            continue

        # Strip the heading from the start only, so a later "customer: " inside the
        # utterance is left alone, and "customer:" without a trailing space is removed too
        utterance = line[len(speaker_prefix):].strip()

        # Drop empty turns and whole turns that contain an order number
        if not utterance or order_number_pattern(utterance):
            continue

        customer_lines.append(utterance)

    # Combine the processed lines back into a single string.
    return " ".join(customer_lines)

# Example: run the preprocessing on a single raw conversation.
# NOTE(review): the sample ends with ",negative", which looks like the sentiment label carried
# over from the raw CSV row — confirm. It sits on a "Manager:" line, so the customer-only
# filter in preprocess_conversation discards it either way.
conversation = """Agent: Thank you for contacting BrownBox customer support. My name is John. How can I assist you today?

Customer: Hi John. I have been trying to order a refrigerator from your website, but it's not available for shipping to my location. Can you help me with this?

Agent: I'm sorry to hear that, sir. May I have your location, please?

Customer: Yes, I am in New York.

Agent: Thank you, sir. I apologize for the inconvenience, but due to some logistic issues, we are unable to ship refrigerators to New York currently. However, we have other products available for shipping to your location. May I suggest some alternatives?

Customer: No, I specifically want a refrigerator. This is unacceptable. Why can't you ship it to New York?

Agent: I understand your frustration, sir. Unfortunately, we are experiencing some issues with our shipping partners, which is causing a delay in delivering certain products to some locations. We are working to resolve this as soon as possible.

Customer: This is ridiculous. I need a refrigerator urgently. Can't you make an exception for me?

Agent: I'm sorry, sir, but we are unable to make an exception in this case. However, I can suggest some local stores in your area where you may be able to purchase a refrigerator.

Customer: I don't have time for that. This is a waste of my time. I want to speak to your manager.

Agent: I apologize for the inconvenience, sir. I will transfer your call to my manager, who will be able to assist you better. Please hold the line.

(Customer is on hold for a few minutes)

Manager: Hi, this is Mark. How can I assist you today?

Customer: Hi Mark. I am really disappointed with your service. I want to order a refrigerator, but your agent informed me that it's not available for shipping to my location.

Manager: I'm sorry to hear that, sir. May I have your location, please?

Customer: I am in New York.

Manager: Yes, sir. I understand the issue. As my colleague informed you, we are experiencing some issues with our shipping partners, which is causing a delay in delivering certain products to some locations. However, I can assure you that we are working to resolve this as soon as possible.

Customer: This is unacceptable. I need a refrigerator urgently.

Manager: I understand your urgency, sir. However, as of now, we are unable to ship refrigerators to your location. I can suggest some local stores in your area where you may be able to purchase a refrigerator.

Customer: I don't have time for that. This is a waste of my time. I am never going to order anything from your website again.

Manager: I apologize for the inconvenience, sir. I understand your frustration, but please know that we are doing everything we can to resolve this issue. Is there anything else I can assist you with?

Customer: No, that's all. Goodbye.

Manager: I'm sorry to hear that, sir. Thank you for your time. Goodbye.,negative"""
    
# Print the cleaned text: only the lowercased customer turns, joined by spaces
processed_text = preprocess_conversation(conversation)
print(processed_text)

This preprocessing approach cleans and structures the text to highlight relevant customer feedback while removing noise. It standardizes the content by lowercasing the text, a common practice in NLP that reduces variation due to case sensitivity. Keeping only the customer's turns ensures that the analysis stays focused on the customer's sentiment, which is the target of the sentiment analysis. Removing text within square brackets and parentheses eliminates non-conversational elements (for example, notes about agent-side actions such as putting the customer on hold) that could distract from the emotional tone. Dropping customer turns that contain an order number removes logistical details that do not contribute to sentiment. In conclusion, this preprocessing method isolates the customer’s expressed emotions, which should make the sentiment signal clearer and improve the accuracy of the sentiment analysis.

In [None]:
# Re-load the conversation-only train/val/test files, clean every conversation with
# preprocess_conversation, and write the results to a separate "preprocessed" location

df_train_final = pd.read_csv("customer_service/conversationOnly/train_final.csv")
df_val_final = pd.read_csv("customer_service/conversationOnly/val_final.csv")
df_test_final = pd.read_csv("customer_service/conversationOnly/test_final.csv")

# Replace each raw conversation by its preprocessed version, split by split
for split_frame in (df_train_final, df_val_final, df_test_final):
    split_frame['conversation'] = split_frame['conversation'].apply(preprocess_conversation)

# Persist the cleaned splits (no index column) in train, val, test order
for split_frame, out_path in (
    (df_train_final, "customer_service/preprocessed/prep_train_final.csv"),
    (df_val_final, "customer_service/preprocessed/prep_val_final.csv"),
    (df_test_final, "customer_service/preprocessed/prep_test_final.csv"),
):
    split_frame.to_csv(out_path, index=False)

# Show the preprocessed validation split as the cell output
df_val_final

In [None]:
# Character count of every preprocessed conversation, stored as a new column on each split
for split_frame in (df_train_final, df_val_final, df_test_final):
    split_frame['conversation_length'] = split_frame['conversation'].str.len()

# Visualization of the length of the conversations in the preprocessed train, validation and
# test data. The titles say "Preprocessed" so these figures are not mistaken for the
# raw-split histograms plotted earlier in the notebook.
for split_frame, split_label in (
    (df_train_final, 'Train'),
    (df_val_final, 'Validation'),
    (df_test_final, 'Test'),
):
    ax = sns.histplot(split_frame['conversation_length'], bins=30, kde=True)
    ax.set_title(f'Distribution of Conversation Lengths in Preprocessed {split_label} Data (Character Count)')
    plt.show()

This approach appears to remove most of the characters in all three data files: it significantly shortens the text sequences while preserving the essential customer sentiment information. This reduction is particularly important for NanoGPT training and GPT-2 fine-tuning, because these models have token length constraints and excessive text could lead to inefficient training or truncation. By filtering out irrelevant content, such as agent responses, speaker headings, and turns containing order numbers, the dataset becomes more focused on the customer's expressions of sentiment.