In [15]:
import pandas as pd
# Load the annotated ABSA-with-emotions dataset; the file name is relative,
# so the CSV must sit in the notebook's working directory.
data = pd.read_csv("Annotated ABSA with Emotions Dataset.csv")

In [16]:
# Inspect column dtypes and non-null counts of the freshly loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4833 entries, 0 to 4832
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               2415 non-null   float64
 1   Review Sentence  4833 non-null   object 
 2   Aspect term      4832 non-null   object 
 3   polarity         4833 non-null   object 
 4   from             4833 non-null   int64  
 5   to               4833 non-null   int64  
 6   Anger            4831 non-null   float64
 7   Disgust          4832 non-null   float64
 8   Fear             4827 non-null   float64
 9   Joy              4816 non-null   float64
 10  Sadness          4831 non-null   float64
 11  Surprise         4824 non-null   float64
 12  Emotion Class    4833 non-null   object 
dtypes: float64(7), int64(2), object(4)
memory usage: 491.0+ KB


In [17]:
# Drop only the rows whose 'Aspect term' is missing. Nulls in any other column
# (e.g. the emotion score columns) are deliberately left in place.
# Rebinding the name instead of using inplace=True keeps the step explicit.
data = data.dropna(subset=['Aspect term'])

In [18]:
# Re-check the non-null counts after dropping rows without an aspect term.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4832 entries, 0 to 4832
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               2414 non-null   float64
 1   Review Sentence  4832 non-null   object 
 2   Aspect term      4832 non-null   object 
 3   polarity         4832 non-null   object 
 4   from             4832 non-null   int64  
 5   to               4832 non-null   int64  
 6   Anger            4830 non-null   float64
 7   Disgust          4831 non-null   float64
 8   Fear             4826 non-null   float64
 9   Joy              4815 non-null   float64
 10  Sadness          4830 non-null   float64
 11  Surprise         4823 non-null   float64
 12  Emotion Class    4832 non-null   object 
dtypes: float64(7), int64(2), object(4)
memory usage: 528.5+ KB


In [19]:
# Remove every row whose polarity label is "conflict": the polarity mapping in
# nltk_format_data only has codes for positive / neutral / negative.
data = data.loc[data['polarity'].ne('conflict')]

In [20]:
# Confirm which polarity labels remain and how many rows carry each one.
data['polarity'].value_counts()

polarity
positive    2891
negative    1003
neutral      833
Name: count, dtype: int64

In [21]:

from nltk.tokenize import word_tokenize

# Ensure nltk resources are available: on a first run, add `import nltk` and uncomment the download below
#nltk.download('punkt')

def get_all_aspects(data):
    """Group aspect terms by the review sentence they were annotated on.

    Args:
        data: DataFrame with a 'Review Sentence' and an 'Aspect term' column.

    Returns:
        dict mapping each distinct review sentence to the list of its aspect
        terms, in row order. A term is listed once per row it appears in, so
        repeated rows produce repeated entries.
    """
    aspects_by_sentence = {}
    for _, record in data.iterrows():
        # setdefault creates the list the first time a sentence is seen.
        aspects_by_sentence.setdefault(record['Review Sentence'], []).append(record['Aspect term'])
    return aspects_by_sentence
def nltk_format_data(row, all_aspects):
    """Render one annotated row as newline-joined "token tag polarity emotion" lines.

    Every token of the row's sentence becomes one line:

    * tokens outside any aspect term:      "<token> O -1 -1"
    * first token of an aspect term:       "<token> B-ASP <polarity> <emotion>"
    * following tokens of an aspect term:  "<token> I-ASP <polarity> <emotion>"

    Polarity and emotion codes are only written for the row's own aspect term
    (``row['Aspect term']``); other aspect terms of the same sentence get
    ``-1 -1``.

    Args:
        row: mapping-like object (e.g. a DataFrame row) providing
            'Review Sentence', 'Aspect term', 'polarity' and 'Emotion Class'.
        all_aspects: dict mapping a review sentence to the list of all aspect
            terms annotated on it (as built by ``get_all_aspects``).

    Returns:
        str: the formatted lines joined with '\\n'.

    Raises:
        KeyError: if the polarity is not positive/neutral/negative, if the
            sentence is missing from ``all_aspects``, or if the row's own
            aspect is matched and its 'Emotion Class' has no code.
    """
    # Tokenize the sentence while correctly handling punctuation
    tokens = word_tokenize(row['Review Sentence'])
    current_aspect = row['Aspect term']
    polarity_code = {'positive': 2, 'neutral': 0, 'negative': 1}[row['polarity']]

    # Map emotion classes to numerical codes
    emotion_code = {'Anger': 0, 'Disgust': 1, 'Fear': 2, 'Joy': 3, 'Sadness': 4, 'Surprise': 5}

    # Get all aspects for the current sentence
    all_aspects_in_sentence = all_aspects[row['Review Sentence']]

    # Normalize the tokens to ensure consistent matching with aspect terms
    normalized_tokens = [token.rstrip('.,?!:;') for token in tokens]

    # Split each distinct aspect once instead of once per token. Aspects that
    # contain no tokens (empty / whitespace-only) are skipped: they would match
    # an empty slice and advance the cursor by zero tokens, i.e. never finish.
    candidates = []
    seen_aspects = set()
    for aspect in all_aspects_in_sentence:
        if aspect in seen_aspects:
            continue
        seen_aspects.add(aspect)
        aspect_tokens = aspect.split()
        if aspect_tokens:
            candidates.append((aspect, aspect_tokens))

    # Try the row's own aspect first, then longer aspects before shorter ones,
    # so that e.g. "food" cannot shadow "food quality". The sort is stable, so
    # ties keep their original list order.
    candidates.sort(key=lambda candidate: (candidate[0] != current_aspect, -len(candidate[1])))

    formatted_sentence = []
    i = 0
    while i < len(tokens):
        # First candidate whose tokens line up with the sentence at position i.
        match = next(
            (
                candidate
                for candidate in candidates
                if normalized_tokens[i:i + len(candidate[1])] == candidate[1]
            ),
            None,
        )

        if match is None:
            formatted_sentence.append(f"{tokens[i]} O -1 -1")
            i += 1
            continue

        matched_aspect, aspect_tokens = match
        if matched_aspect == current_aspect:
            polarity_label = polarity_code
            # Looked up lazily so an unmapped emotion class only raises when
            # the row's own aspect is actually found in the sentence.
            emotion_label = emotion_code[row['Emotion Class']]
        else:
            polarity_label = emotion_label = '-1'

        for j, aspect_token in enumerate(aspect_tokens):
            tag = 'B-ASP' if j == 0 else 'I-ASP'
            formatted_sentence.append(f"{aspect_token} {tag} {polarity_label} {emotion_label}")
        i += len(aspect_tokens)  # Skip the aspect tokens

    return '\n'.join(formatted_sentence)




# Get all aspects for each sentence
all_aspects = get_all_aspects(data)

# Apply NLTK formatting to every row of the DataFrame; the result is a Series
# holding one newline-joined block of "token tag polarity emotion" lines per row.
example_data_nltk_formatted = data.apply(nltk_format_data, axis=1, all_aspects=all_aspects)
print(example_data_nltk_formatted)  # Prints the (truncated) Series for all rows, not a single entry
# print(example_data_nltk_formatted.values[1])  # Uncomment to display the full formatted text of one row

0       But O -1 -1\nthe O -1 -1\nstaff B-ASP 1 0\nwas...
1       To O -1 -1\nbe O -1 -1\ncompletely O -1 -1\nfa...
2       The O -1 -1\nfood B-ASP 2 3\nis O -1 -1\nunifo...
3       The O -1 -1\nfood B-ASP -1 -1\nis O -1 -1\nuni...
4       The O -1 -1\nfood B-ASP -1 -1\nis O -1 -1\nuni...
                              ...                        
4828    Each O -1 -1\ntable B-ASP -1 -1\nhas O -1 -1\n...
4829    Each O -1 -1\ntable B-ASP -1 -1\nhas O -1 -1\n...
4830    Each O -1 -1\ntable B-ASP -1 -1\nhas O -1 -1\n...
4831    Each O -1 -1\ntable B-ASP -1 -1\nhas O -1 -1\n...
4832    Each O -1 -1\ntable B-ASP -1 -1\nhas O -1 -1\n...
Length: 4727, dtype: object


In [20]:
# Dump every formatted row into a single file, using pandas' default CSV
# options apart from suppressing the row index.
# NOTE(review): this cell's execution count (20) is lower than the preceding
# cell's (21), so it was run out of order -- confirm it still works on a
# fresh top-to-bottom run.
example_data_nltk_formatted.to_csv('example_data_nltk_formatted.dat', index=False)

In [22]:
import csv

# Calculate the number of rows for training and testing (75% / 25%)
train_size = int(0.75 * len(example_data_nltk_formatted))
test_size = len(example_data_nltk_formatted) - train_size

# Split the data into training and testing sets by position (no shuffling).
# NOTE(review): consecutive rows can belong to the same review sentence (the
# printed Series shows several adjacent rows starting with the same text), so
# the cut may place rows of one sentence in both sets -- confirm this is
# acceptable.
train_df = example_data_nltk_formatted.iloc[:train_size]
test_df = example_data_nltk_formatted.iloc[train_size:]


# Add a new line at the end of each sentence
train_df = train_df.apply(lambda x: x + '\n' if isinstance(x, str) else x)
test_df = test_df.apply(lambda x: x + '\n' if isinstance(x, str) else x)

# Save the training and testing data as plain text: one token per line and an
# empty line after every sentence block.
# The blocks are written directly rather than through the CSV writer: with
# QUOTE_NONE the writer must put the escape character (previously a tab) in
# front of every embedded newline and double quote, which pollutes the files.
for output_path, split_blocks in (
    ('Restaurants.atepc.train.dat', train_df),
    ('Restaurants.atepc.test.dat', test_df),
):
    with open(output_path, 'w', encoding='utf-8', newline='\n') as output_file:
        # Each block already ends with '\n'; the extra '\n' yields the blank
        # separator line that the CSV line terminator used to provide.
        output_file.writelines(f"{block}\n" for block in split_blocks)