In [2]:
# Load the annotated ABSA-with-emotions dataset; the CSV is expected to sit in
# the notebook's working directory (relative path).
import pandas as pd
data = pd.read_csv("Annotated ABSA with Emotions Dataset.csv")

In [3]:
# Summarize column names, dtypes and non-null counts of the loaded frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4832 entries, 0 to 4831
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     2414 non-null   float64
 1   Review Sentence        4832 non-null   object 
 2   Aspect term            4832 non-null   object 
 3   polarity               4832 non-null   object 
 4   from                   4832 non-null   int64  
 5   to                     4832 non-null   int64  
 6   Anger                  4830 non-null   float64
 7   Disgust                4831 non-null   float64
 8   Fear                   4826 non-null   float64
 9   Joy                    4815 non-null   float64
 10  Sadness                4830 non-null   float64
 11  Surprise               4823 non-null   float64
 12  Emotion Class          4832 non-null   object 
 13  emotion context words  4671 non-null   object 
dtypes: float64(7), int64(2), object(5)
memory usage: 528.6+ 

In [4]:
def merge_emotions(emotion):
    """Collapse a fine-grained emotion label into one of three classes.

    Parameters
    ----------
    emotion : object
        A single value from the 'Emotion Class' column.

    Returns
    -------
    str
        'Anger' for 'Anger', 'Disgust', 'Fear' and 'Sadness';
        'Joy' for 'Joy';
        'Surprise' for everything else. This is a catch-all: any value that
        is not one of the five labels above (not only the literal 'Surprise')
        ends up as 'Surprise'.
    """
    negative_labels = ('Anger', 'Disgust', 'Fear', 'Sadness')
    if emotion in negative_labels:
        return 'Anger'
    # Everything that is not 'Joy' falls through to 'Surprise'
    return 'Joy' if emotion == 'Joy' else 'Surprise'

# Apply merge_emotions to every value of the 'Emotion Class' column.
# The column is overwritten, so the original fine-grained labels are no longer
# available in `data` after this cell.
data['Emotion Class'] = data['Emotion Class'].apply(merge_emotions)

In [5]:
# Count how many rows carry each 'Emotion Class' label
data['Emotion Class'].value_counts()

Emotion Class
Joy         3132
Anger       1501
Surprise     199
Name: count, dtype: int64

In [6]:
# Drop every row whose polarity is "conflict"; `data` is rebound to the filtered frame
data = data[data['polarity'] != 'conflict']

In [7]:
# Count how many rows carry each polarity label
data['polarity'].value_counts()

polarity
positive    2891
negative    1003
neutral      833
Name: count, dtype: int64

In [8]:

def get_all_aspects(data):
    """Group aspect terms by the review sentence they were annotated on.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Review Sentence' and 'Aspect term'.

    Returns
    -------
    dict
        Maps each distinct review sentence to the list of its aspect terms,
        in row order. Sentences appear in order of first occurrence and
        repeated aspect terms are kept.

    Raises
    ------
    KeyError
        If either required column is missing.
    """
    aspect_dict = {}
    # Walk the two needed columns directly instead of iterrows(), which builds
    # a full Series per row just to read two values.
    for sentence, aspect in zip(data['Review Sentence'], data['Aspect term']):
        aspect_dict.setdefault(sentence, []).append(aspect)
    return aspect_dict


In [9]:
# Quick look at what string.digits expands to when turned into a list
import string
list(string.digits)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

In [10]:
# Display the current state of the frame (bare last expression -> rich display)
data

Unnamed: 0,id,Review Sentence,Aspect term,polarity,from,to,Anger,Disgust,Fear,Joy,Sadness,Surprise,Emotion Class,emotion context words
0,3121.0,But the staff was so horrible to us.,staff,negative,8,13,4.0,1.0,1.0,0.0,3.0,1.0,Anger,horrible
1,2777.0,"To be completely fair, the only redeeming fact...",food,positive,57,61,0.0,0.0,0.0,2.0,0.0,0.0,Joy,which was above average
2,1634.0,"The food is uniformly exceptional, with a very...",food,positive,4,8,0.0,0.0,0.0,4.0,0.0,0.0,Joy,"uniformly exceptional, very capable proudly"
3,1634.0,"The food is uniformly exceptional, with a very...",kitchen,positive,55,62,0.0,0.0,0.0,4.0,0.0,0.0,Joy,uniformly exceptional very capableproudly
4,1634.0,"The food is uniformly exceptional, with a very...",menu,neutral,141,145,0.0,0.0,0.0,4.0,0.0,0.0,Joy,uniformly exceptionalvery capableproudly
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4827,,Each table has a pot of boiling water sunken i...,pot of boiling water,neutral,17,37,2.0,2.0,0.0,1.0,0.0,0.0,Anger,sunken into its surface(1)you get platters of ...
4828,,Each table has a pot of boiling water sunken i...,meats,neutral,99,104,0.0,0.0,0.0,3.0,0.0,1.0,Joy,sunken into its surface(1)you get platters of ...
4829,,Each table has a pot of boiling water sunken i...,vegetables,neutral,114,124,0.0,0.0,0.0,2.0,0.0,1.0,Joy,sunken into its surface(1)you get platters of ...
4830,,Each table has a pot of boiling water sunken i...,rice,neutral,130,134,0.0,0.0,0.0,2.0,0.0,1.0,Joy,sunken into its surface(1)you get platters of ...


In [None]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

# Split the data into training (80%) and testing (20%) sets with a fixed seed
train_df, test_df = train_test_split(data, test_size=0.20, random_state=42, shuffle=True)

# Define the target variable for the training data
y_train = train_df['Emotion Class']

# Instantiate the RandomOverSampler.
# The sampler is seeded like the split above; without a seed the duplicated
# rows differ on every run, so the resampled training set is not reproducible.
ros = RandomOverSampler(random_state=42)

# Resample the training dataset only; the test split is left untouched.
# The whole training frame is passed as X, so X_resampled still contains the
# 'Emotion Class' column alongside the other columns.
X_resampled, y_resampled = ros.fit_resample(train_df, y_train)

# Now, 'X_resampled' is your DataFrame with balanced 'Emotion Class' for the training data


In [13]:
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords



# Tokens that are never treated as emotion context words:
# English stop words plus every single punctuation character and digit.
excluded_words = set(stopwords.words('english') + list(string.punctuation) + list(string.digits))


# new implementation
def nltk_format_data(row, all_aspects):
    """Render one annotated row as token-per-line text.

    Every output line has the form ``<token> <tag> <polarity> <emotion>``:

    * ``tag`` is ``B-ASP`` for the first token of an aspect term, ``I-ASP``
      for its following tokens and ``O`` for everything else.
    * ``polarity`` / ``emotion`` are the numeric codes of this row when the
      matched aspect equals the row's own 'Aspect term', otherwise ``-1``.
    * Non-aspect tokens found among the row's emotion context words get
      ``O -1 <emotion>``; all remaining tokens get ``O -1 -1``.

    Parameters
    ----------
    row : pandas.Series
        One row with 'Review Sentence', 'Aspect term', 'polarity',
        'Emotion Class' and 'emotion context words'.
    all_aspects : dict
        Maps a review sentence to the list of all aspect terms annotated on it
        (as produced by ``get_all_aspects``).

    Returns
    -------
    str
        The formatted lines joined with ``'\\n'``.

    Raises
    ------
    KeyError
        If the polarity is not positive/neutral/negative, if the sentence is
        missing from ``all_aspects``, or if an emotion code is needed for an
        'Emotion Class' other than Anger/Joy/Surprise.
    """
    # Replace NaN values with empty strings so the string operations below work
    row = row.fillna('')

    # Strip parenthesised numeric markers such as "(1)" before tokenizing
    sentence = re.sub(r'\(\d+\)', '', row['Review Sentence'])

    # Tokenize the sentence while correctly handling punctuation
    tokens = word_tokenize(sentence)
    current_aspect = row['Aspect term']
    polarity_code = {'positive': 2, 'neutral': 0, 'negative': 1}[row['polarity']]

    # Map (merged) emotion classes to numerical codes
    emotion_code = {'Anger': 1, 'Joy': 2, 'Surprise': 0}

    # Split every aspect of this sentence once, up front. Aspects that split
    # into zero tokens (empty / whitespace-only) are skipped: they would
    # "match" at every position, hide the aspects listed after them and, when
    # whitespace-only, never advance the cursor (endless loop).
    candidate_aspects = []
    for aspect in all_aspects[row['Review Sentence']]:
        aspect_tokens = aspect.split()
        if aspect_tokens:
            candidate_aspects.append((aspect, aspect_tokens))

    # Normalize the tokens to ensure consistent matching with aspect terms
    normalized_tokens = [token.rstrip('.,?!:;') for token in tokens]

    # Emotion context words of this row, minus stop words / punctuation / digits
    emotion_words = {
        word
        for word in word_tokenize(row['emotion context words'])
        if word not in excluded_words
    }

    formatted_sentence = []
    i = 0
    while i < len(tokens):
        # First aspect (in annotation order) whose tokens start at position i
        matched_aspect, matched_tokens = None, None
        for aspect, aspect_tokens in candidate_aspects:
            if normalized_tokens[i:i + len(aspect_tokens)] == aspect_tokens:
                matched_aspect, matched_tokens = aspect, aspect_tokens
                break

        if matched_tokens:
            # Only the row's own aspect carries labels; other aspects get -1
            if matched_aspect == current_aspect:
                labels = f"{polarity_code} {emotion_code[row['Emotion Class']]}"
            else:
                labels = "-1 -1"
            for j, aspect_token in enumerate(matched_tokens):
                tag = 'B-ASP' if j == 0 else 'I-ASP'
                formatted_sentence.append(f"{aspect_token} {tag} {labels}")
            i += len(matched_tokens)  # Skip the aspect tokens
        elif normalized_tokens[i] in emotion_words:
            formatted_sentence.append(f"{tokens[i]} O -1 {emotion_code[row['Emotion Class']]}")
            i += 1
        else:
            formatted_sentence.append(f"{tokens[i]} O -1 -1")
            i += 1

    return '\n'.join(formatted_sentence)



In [14]:
# Get all aspects for each sentence
all_aspects = get_all_aspects(data)

# Apply NLTK formatting to every row (axis=1); the result is a Series holding
# one formatted multi-line string per row
example_data_nltk_formatted = data.apply(nltk_format_data, axis=1, all_aspects=all_aspects)
print(example_data_nltk_formatted)  # Prints the whole (truncated) Series, not just one entry
# print(example_data_nltk_formatted.values[1])  # Uncomment to display a single formatted sentence

0       But O -1 -1\nthe O -1 -1\nstaff B-ASP 1 1\nwas...
1       To O -1 -1\nbe O -1 -1\ncompletely O -1 -1\nfa...
2       The O -1 -1\nfood B-ASP 2 2\nis O -1 -1\nunifo...
3       The O -1 -1\nfood B-ASP -1 -1\nis O -1 -1\nuni...
4       The O -1 -1\nfood B-ASP -1 -1\nis O -1 -1\nuni...
                              ...                        
4827    Each O -1 -1\ntable B-ASP -1 -1\nhas O -1 -1\n...
4828    Each O -1 -1\ntable B-ASP -1 -1\nhas O -1 -1\n...
4829    Each O -1 -1\ntable B-ASP -1 -1\nhas O -1 -1\n...
4830    Each O -1 -1\ntable B-ASP -1 -1\nhas O -1 -1\n...
4831    Each O -1 -1\ntable B-ASP -1 -1\nhas O -1 -1\n...
Length: 4727, dtype: object


In [51]:
# Number of rows in the oversampled training frame returned by fit_resample
len(X_resampled)

14916

In [11]:
import csv
from sklearn.model_selection import train_test_split


def _write_atepc_file(path, formatted_sentences):
    """Write every formatted sentence block to ``path``, one after another.

    Each block is followed by a single newline. Because the blocks passed in
    below already end with '\\n', consecutive sentences end up separated by
    one blank line.
    """
    with open(path, 'w', encoding='utf-8', newline='\n') as handle:
        for block in formatted_sentences:
            handle.write(f"{block}\n")


# Split the formatted sentences into training (70%) and testing (30%) sets.
# NOTE: train_df / test_df are Series of strings here, not DataFrames.
train_df, test_df = train_test_split(example_data_nltk_formatted, test_size=0.30, random_state=42, shuffle=True)

# Add a new line at the end of each sentence
train_df = train_df.apply(lambda x: x + '\n' if isinstance(x, str) else x)
test_df = test_df.apply(lambda x: x + '\n' if isinstance(x, str) else x)

# Save the training and testing data as plain text.
# The previous to_csv(..., sep='\t', quoting=csv.QUOTE_NONE, escapechar='\t')
# call used the same character as delimiter and escape character, which the
# csv dialect validation of recent Python versions rejects; where it is
# accepted, QUOTE_NONE escaping puts that tab in front of every embedded
# newline. Writing the strings directly avoids both problems.
_write_atepc_file('Restaurants.atepc.train.dat', train_df)
_write_atepc_file('Restaurants.atepc.test.dat', test_df)


In [22]:



from nltk.tokenize import word_tokenize

# Ensure nltk resources are available
#nltk.download('punkt')

def get_all_aspects(data):
    """Group aspect terms by the review sentence they were annotated on.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Review Sentence' and 'Aspect term'.

    Returns
    -------
    dict
        Maps each distinct review sentence to the list of its aspect terms,
        in row order. Sentences appear in order of first occurrence and
        repeated aspect terms are kept.

    Raises
    ------
    KeyError
        If either required column is missing.
    """
    aspect_dict = {}
    # Walk the two needed columns directly instead of iterrows(), which builds
    # a full Series per row just to read two values.
    for sentence, aspect in zip(data['Review Sentence'], data['Aspect term']):
        aspect_dict.setdefault(sentence, []).append(aspect)
    return aspect_dict
def nltk_format_data(row, all_aspects):
    """Render one annotated row as token-per-line text (six-emotion variant).

    Every output line has the form ``<token> <tag> <polarity> <emotion>``:
    ``B-ASP`` / ``I-ASP`` mark the first / following tokens of an aspect term
    and ``O`` marks every other token. The numeric polarity and emotion codes
    of the row are written only on the row's own 'Aspect term'; other aspects
    and all ``O`` tokens get ``-1 -1``.

    Parameters
    ----------
    row : pandas.Series
        One row with 'Review Sentence', 'Aspect term', 'polarity' and
        'Emotion Class'.
    all_aspects : dict
        Maps a review sentence to the list of all aspect terms annotated on it.

    Returns
    -------
    str
        The formatted lines joined with ``'\\n'``.

    Raises
    ------
    KeyError
        If the polarity is not one of positive/neutral/negative/conflict, if
        the sentence is missing from ``all_aspects``, or if the row's own
        aspect is matched and its 'Emotion Class' is not one of the six
        labels below.
    """
    # Tokenize the sentence while correctly handling punctuation
    tokens = word_tokenize(row['Review Sentence'])
    current_aspect = row['Aspect term']
    polarity_code = {'positive': 2, 'neutral': 0, 'negative': 1,'conflict':3}[row['polarity']]

    # Map emotion classes to numerical codes
    emotion_code = {'Anger': 0, 'Disgust': 1, 'Fear': 2, 'Joy': 3, 'Sadness': 4, 'Surprise': 5}

    # Split every aspect of this sentence once, up front. Aspects that split
    # into zero tokens (empty / whitespace-only) are skipped: they would
    # "match" at every position, hide the aspects listed after them and, when
    # whitespace-only, never advance the cursor (endless loop).
    candidate_aspects = []
    for aspect in all_aspects[row['Review Sentence']]:
        aspect_tokens = aspect.split()
        if aspect_tokens:
            candidate_aspects.append((aspect, aspect_tokens))

    # Normalize the tokens to ensure consistent matching with aspect terms
    normalized_tokens = [token.rstrip('.,?!:;') for token in tokens]

    formatted_sentence = []
    i = 0
    while i < len(tokens):
        # First aspect (in annotation order) whose tokens start at position i
        matched_aspect, matched_tokens = None, None
        for aspect, aspect_tokens in candidate_aspects:
            if normalized_tokens[i:i + len(aspect_tokens)] == aspect_tokens:
                matched_aspect, matched_tokens = aspect, aspect_tokens
                break

        if matched_tokens:
            # Only the row's own aspect carries labels; other aspects get -1
            if matched_aspect == current_aspect:
                labels = f"{polarity_code} {emotion_code[row['Emotion Class']]}"
            else:
                labels = "-1 -1"
            for j, aspect_token in enumerate(matched_tokens):
                tag = 'B-ASP' if j == 0 else 'I-ASP'
                formatted_sentence.append(f"{aspect_token} {tag} {labels}")
            i += len(matched_tokens)  # Skip the aspect tokens
        else:
            formatted_sentence.append(f"{tokens[i]} O -1 -1")
            i += 1

    return '\n'.join(formatted_sentence)




# Get all aspects for each sentence
all_aspects = get_all_aspects(data)

# Apply NLTK formatting to every row (axis=1), producing one formatted string per row.
# NOTE(review): the recorded run of this cell ended with KeyError: 'Negative',
# so the data apparently held a label missing from one of the lookup tables
# used by nltk_format_data -- confirm which column it came from.
example_data_nltk_formatted = data.apply(nltk_format_data, axis=1, all_aspects=all_aspects)
print(example_data_nltk_formatted)  # Prints the whole Series, not just one entry
# print(example_data_nltk_formatted.values[1])  # Uncomment to display a single formatted sentence

KeyError: 'Negative'

In [1]:
from transformers import BertTokenizer

# Load the vocabulary of the uncased BERT base checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


# Hard-coded id sequence to inspect
token_ids = [101, 2348, 2027, 2079, 1996, 5171, 2054, 2785, 1997, 2300, 2052, 2017, 2066, 3980, 1996, 2326, 2001, 2204, 1998, 3452, 2200, 19613, 2000, 2173, 102]

# Convert token IDs to tokens
tokens = tokenizer.convert_ids_to_tokens(token_ids)

# Join the tokens back into one string; the special [CLS] / [SEP] tokens are
# kept, as the printed output shows
original_text = tokenizer.convert_tokens_to_string(tokens)

print(original_text)

[CLS] although they do the typical what kind of water would you like questions the service was good and overall very relaxing to place [SEP]


In [4]:
# Round-trip check: encoding the decoded sentence gives back the same id list,
# including the leading 101 and trailing 102 seen in the printed output
print(tokenizer.encode("although they do the typical what kind of water would you like questions the service was good and overall very relaxing to place"))

[101, 2348, 2027, 2079, 1996, 5171, 2054, 2785, 1997, 2300, 2052, 2017, 2066, 3980, 1996, 2326, 2001, 2204, 1998, 3452, 2200, 19613, 2000, 2173, 102]
