# In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

# Ensure the tokenizer models needed by word_tokenize are available.
# Older NLTK releases load the 'punkt' resource, while recent releases (3.9+)
# load 'punkt_tab' instead, so both are requested to keep the script working
# regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

def get_all_aspects(data):
    """Group aspect terms by the review sentence they belong to.

    Args:
        data: DataFrame whose rows provide a 'Review Sentence' and an
            'Aspect term' value.

    Returns:
        A dict mapping each review sentence to the list of its aspect terms,
        in row order (repeated terms are kept).
    """
    aspects_by_sentence = {}
    for _, record in data.iterrows():
        # setdefault creates the list the first time a sentence is seen.
        aspects_by_sentence.setdefault(record['Review Sentence'], []).append(
            record['Aspect term']
        )
    return aspects_by_sentence

# Label -> numeric polarity code written next to the current aspect's tokens.
_EMOTION_CODES = {'Positive': 2, 'Neutral': 0, 'Negative': 1}

# Trailing characters removed from tokens before comparing them to aspect terms.
_TRAILING_PUNCTUATION = '.,?!:;'


def _aspect_candidates(aspects, current_aspect):
    """Return (aspect, tokens) pairs ordered by matching priority.

    Aspects that are not strings (e.g. NaN) or that contain no tokens after
    normalization are dropped: an empty token list would match at every
    position without consuming any token. The current aspect is tried first
    so its polarity is always emitted where it occurs; the remaining aspects
    are tried longest-first so a multi-word term is not shadowed by a shorter
    term it starts with.
    """
    seen = set()
    candidates = []
    for aspect in aspects:
        if not isinstance(aspect, str) or aspect in seen:
            continue
        seen.add(aspect)
        # NOTE(review): aspect terms are split on whitespace while sentences go
        # through word_tokenize; terms the tokenizer splits differently will
        # not match - confirm whether that matters for the dataset.
        stripped = (word.rstrip(_TRAILING_PUNCTUATION) for word in aspect.split())
        aspect_tokens = [word for word in stripped if word]
        if aspect_tokens:
            candidates.append((aspect, aspect_tokens))
    # Stable sort: ties keep the order in which the aspects were listed.
    candidates.sort(key=lambda item: (item[0] != current_aspect, -len(item[1])))
    return candidates


def nltk_format_data(row, all_aspects):
    """Format one review row as token-per-line aspect/polarity annotations.

    Every output line has the form "<token> <tag> <polarity>". Tokens that
    belong to an aspect term of the sentence are tagged B-ASP (first token)
    or I-ASP (following tokens); all other tokens are tagged O. The polarity
    is the row's emotion code for the row's own aspect term and -1 everywhere
    else.

    Args:
        row: Mapping/Series with 'Review Sentence', 'Aspect term' and
            'Simplified Emotion' ('Positive', 'Neutral' or 'Negative').
        all_aspects: Dict mapping each review sentence to the list of all
            aspect terms annotated for it.

    Returns:
        The annotated lines joined with newline characters.

    Raises:
        KeyError: If the emotion label is unknown or the sentence is missing
            from all_aspects.
    """
    tokens = word_tokenize(row['Review Sentence'])
    current_aspect = row['Aspect term']
    emotion_code = _EMOTION_CODES[row['Simplified Emotion']]

    # Tokenize every aspect once instead of once per sentence position.
    candidates = _aspect_candidates(all_aspects[row['Review Sentence']], current_aspect)

    # Strip trailing punctuation so tokens compare equal to aspect tokens.
    normalized_tokens = [token.rstrip(_TRAILING_PUNCTUATION) for token in tokens]

    formatted_sentence = []
    i = 0
    while i < len(tokens):
        match = next(
            (
                (aspect, aspect_tokens)
                for aspect, aspect_tokens in candidates
                if normalized_tokens[i:i + len(aspect_tokens)] == aspect_tokens
            ),
            None,
        )
        if match is None:
            formatted_sentence.append(f"{tokens[i]} O -1")
            i += 1
            continue

        matched_aspect, aspect_tokens = match
        polarity = emotion_code if matched_aspect == current_aspect else '-1'
        for j, aspect_token in enumerate(aspect_tokens):
            tag = 'B-ASP' if j == 0 else 'I-ASP'
            formatted_sentence.append(f"{aspect_token} {tag} {polarity}")
        # aspect_tokens is never empty, so the scan always advances.
        i += len(aspect_tokens)

    return "\n".join(formatted_sentence)

# Sample use: one sentence annotated with two aspect terms, one row per aspect.
data = pd.DataFrame({
    'Review Sentence': ['The food was great but the service was slow.'] * 2,
    'Aspect term': ['food', 'service'],
    'Simplified Emotion': ['Positive', 'Negative'],
})

# Collect every aspect term annotated for each sentence.
all_aspects = get_all_aspects(data)

# Format each row and show the annotations for the first and second entries.
example_data_nltk_formatted = data.apply(nltk_format_data, axis=1, args=(all_aspects,))
print(*example_data_nltk_formatted.values[:2], sep="\n")
