In [5]:
import pandas as pd
import xml.etree.ElementTree as ET

# Define paths based on your structure
RAW_DIR = "Raw_Data"
PROCESSED_DIR = "Processed_Data"

# Ensure processed directory exists
os.makedirs(PROCESSED_DIR, exist_ok=True)

print(f"Directories initialized. Ready to process files from {RAW_DIR}")

Directories initialized. Ready to process files from Raw_Data


In [2]:
def process_semeval_2014(folder_name, output_filename):
    unified_rows = []
    global_idx = 1
    
    # Target folder: Raw_Data/res_2014
    target_path = os.path.join(RAW_DIR, folder_name)
    
    # We will look for all .xml files in that folder (test, train, val)
    files = [f for f in os.listdir(target_path) if f.endswith('.xml')]
    
    for file_name in files:
        file_path = os.path.join(target_path, file_name)
        print(f"Parsing: {file_name}")
        
        tree = ET.parse(file_path)
        root = tree.getroot()
        
        for sentence in root.findall('.//sentence'):
            text_node = sentence.find('text')
            if text_node is None or text_node.text is None:
                continue
                
            text = text_node.text.strip()
            aspect_terms = sentence.find('aspectTerms')
            
            # If the sentence has aspects, create a row for each
            if aspect_terms is not None:
                for at in aspect_terms.findall('aspectTerm'):
                    unified_rows.append({
                        "sentence_id": f"2014_{global_idx:04d}",
                        "sentence": text,
                        "aspect": at.get('term'),
                        "polarity": at.get('polarity').lower().strip(),
                        "from": int(at.get('from')),
                        "to": int(at.get('to'))
                    })
            
            # Increment the ID for every sentence processed
            global_idx += 1

    # Convert to DataFrame
    df = pd.DataFrame(unified_rows)
    
    # Save to Processed_Data
    final_path = os.path.join(PROCESSED_DIR, output_filename)
    df.to_csv(final_path, index=False, encoding='utf-8')
    
    return df

print("2014 Processor function defined.")

2014 Processor function defined.


In [3]:
# Run the processor for the res_2014 folder
df_2014 = process_semeval_2014("res_2014", "2014_rest_reviews.csv")

print(f"\nProcessing Complete!")
print(f"Total Rows Extracted: {len(df_2014)}")
print("-" * 30)

# Display the first 5 rows to check the 'Perfect Number' ID and formatting
df_2014.head()

Parsing: test.xml
Parsing: val.xml
Parsing: train.xml

Processing Complete!
Total Rows Extracted: 4923
------------------------------


Unnamed: 0,sentence_id,sentence,aspect,polarity,from,to
0,2014_0001,The bread is top notch as well.,bread,positive,4,9
1,2014_0002,I have to say they have one of the fastest del...,delivery times,positive,43,57
2,2014_0003,Food is always fresh and hot- ready to eat!,Food,positive,0,4
3,2014_0004,Did I mention that the coffee is OUTSTANDING?,coffee,positive,23,29
4,2014_0005,"Certainly not the best sushi in New York, howe...",sushi,conflict,23,28


In [4]:
import pandas as pd
import os

# Path to the file we just created
file_path = os.path.join("Processed_Data", "2014_rest_reviews.csv")

if os.path.exists(file_path):
    df = pd.read_csv(file_path)
    
    total_rows = len(df)
    unique_ids = df['sentence_id'].nunique()
    multi_aspect_count = total_rows - unique_ids
    
    print("--- 2014 Data Verification ---")
    print(f"Total rows (Aspect-level): {total_rows}")
    print(f"Unique sentence IDs:       {unique_ids}")
    print(f"Sentences with >1 aspect:  {multi_aspect_count}")
    
    # Calculate average aspects per sentence for analysis
    avg_aspects = total_rows / unique_ids
    print(f"Average aspects/sentence:  {avg_aspects:.2f}")
    
    # Verify that IDs are consistent with the text
    unique_text = df['sentence'].nunique()
    if unique_ids == unique_text:
        print("\n✅ Verification Passed: Each unique ID corresponds to exactly one unique sentence.")
    else:
        print("\n⚠️ Note: Unique IDs and unique text counts differ.")
        print("This usually happens if the exact same sentence appears twice in different files.")
else:
    print("Error: 2014_rest_reviews.csv not found in Processed_Data.")

--- 2014 Data Verification ---
Total rows (Aspect-level): 4923
Unique sentence IDs:       2681
Sentences with >1 aspect:  2242
Average aspects/sentence:  1.84

⚠️ Note: Unique IDs and unique text counts differ.
This usually happens if the exact same sentence appears twice in different files.


In [14]:
import json
import ast

def process_semeval_2015(folder_name, output_filename):
    unified_rows = []
    global_idx = 1
    target_path = os.path.join(RAW_DIR, folder_name)
    files = [f for f in os.listdir(target_path) if f.endswith('.jsonl')]
    
    for file_name in files:
        file_path = os.path.join(target_path, file_name)
        print(f"Reading: {file_name}")
        
        with open(file_path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                if not line.strip(): continue
                item = json.loads(line)
                
                # Based on your structure: "input": ["Sentence Text"]
                if 'input' in item and isinstance(item['input'], list) and len(item['input']) > 0:
                    text = item['input'][0].strip()
                else:
                    continue

                # Get the output string
                output_str = item.get('output', '[]')
                try:
                    # Converts "[['Al Di La', 'rest general', 'positive']]" to list
                    opinions = ast.literal_eval(output_str)
                except:
                    opinions = []

                for op in opinions:
                    # Your format is [Aspect, Category, Polarity]
                    # We only need Aspect (op[0]) and Polarity (op[2])
                    aspect_term = op[0]
                    polarity = op[2] 
                    
                    if str(aspect_term).upper() == "NULL" or not aspect_term:
                        final_aspect, start_idx, end_idx = "[ASPECT]", 0, 0
                    else:
                        final_aspect = aspect_term
                        start_idx = text.find(aspect_term)
                        if start_idx == -1:
                            start_idx, end_idx = 0, 0
                        else:
                            end_idx = start_idx + len(aspect_term)
                    
                    unified_rows.append({
                        "sentence_id": f"2015_{global_idx:04d}",
                        "sentence": text,
                        "aspect": final_aspect,
                        "from": start_idx,
                        "to": end_idx,
                        "polarity": polarity.lower().strip()
                    })
                
                global_idx += 1

    df = pd.DataFrame(unified_rows)
    output_path = os.path.join(PROCESSED_DIR, output_filename)
    df.to_csv(output_path, index=False, encoding='utf-8')
    return df

print("2015 Processor fixed for your specific JSON structure.")

2015 Processor fixed for your specific JSON structure.


In [15]:
# Run the processor for the res_2015 folder
df_2015 = process_semeval_2015("res_2015", "2015_rest_reviews.csv")

print(f"\n2015 Processing Complete!")
print(f"Total Rows Extracted: {len(df_2015)}")
print("-" * 30)

# Display a sample including potential [ASPECT] placeholders
df_2015.head()

Reading: test.jsonl
Reading: train.jsonl
Reading: val.jsonl

2015 Processing Complete!
Total Rows Extracted: 2838
------------------------------


Unnamed: 0,sentence_id,sentence,aspect,from,to,polarity
0,2015_0001,Love Al Di La,Al Di La,5,13,positive
1,2015_0002,I recommend this place to everyone.,place,17,22,positive
2,2015_0003,Great food.,food,6,10,positive
3,2015_0004,One of my favorite places in Brooklyn.,[ASPECT],0,0,positive
4,2015_0005,"The pastas are incredible, the risottos (parti...",pastas,4,10,positive


In [11]:
# Check the first line of the 2015 test file
with open(os.path.join(RAW_DIR, "res_2015", "test.jsonl"), 'r') as f:
    first_line = f.readline()
    print("Actual JSON structure:", first_line)

Actual JSON structure: {"task_type": "generation", "dataset": "semeval-2015", "input": ["Love Al Di La"], "output": "[['Al Di La', 'restaurant general', 'positive']]", "situation": "none", "label": "", "extra": "", "instruction": "Task: Extracting aspect terms, aspect categories and their corresponding sentiment polarities. Input: A sentence. Output: A list of 3-tuples where each tuple contains the extracted aspect term , aspect category their corresponding sentiment polarity. Supplement: \"Null\" means that there is no occurrence in the sentence. Example:  Input: \"Delicate spices, onions, eggs and a kick-ass roti.\"  Output: [['spices', 'food quality', 'positive'], ['onions', 'food quality', 'positive'], ['eggs', 'food quality', 'positive'], ['roti', 'food quality', 'positive']] "}



In [16]:
# Cell 9: 2015 Data Verification
if 'df_2015' in locals():
    total_rows_2015 = len(df_2015)
    unique_ids_2015 = df_2015['sentence_id'].nunique()
    avg_aspects_2015 = total_rows_2015 / unique_ids_2015 if unique_ids_2015 > 0 else 0
    
    print("--- 2015 Data Verification ---")
    print(f"Total rows (Aspect-level): {total_rows_2015}")
    print(f"Unique sentence IDs:       {unique_ids_2015}")
    print(f"Average aspects/sentence:  {avg_aspects_2015:.2f}")
    
    # Check for multi-aspect sentences
    multi_aspect_2015 = total_rows_2015 - unique_ids_2015
    print(f"Sentences with >1 aspect:  {multi_aspect_2015}")
    
    # Check for [ASPECT] placeholders
    null_count = len(df_2015[df_2015['aspect'] == '[ASPECT]'])
    print(f"Implicit aspects ([ASPECT]): {null_count}")
else:
    print("df_2015 not found. Please run the processing cell first.")

--- 2015 Data Verification ---
Total rows (Aspect-level): 2838
Unique sentence IDs:       1929
Average aspects/sentence:  1.47
Sentences with >1 aspect:  909
Implicit aspects ([ASPECT]): 698


In [17]:
import xml.etree.ElementTree as ET
import pandas as pd
import os

def process_semeval_2016_sb1(folder_name, output_filename):
    unified_rows = []
    global_idx = 1
    
    target_path = os.path.join(RAW_DIR, folder_name)
    
    # NEW FILTER: Captures anything with "SB1" in the name, 
    # covering .xml, .xml.gold, and trial files.
    files = [f for f in os.listdir(target_path) if 'SB1' in f.upper()]
    
    for file_name in files:
        file_path = os.path.join(target_path, file_name)
        print(f"Reading 2016 File: {file_name}")
        
        try:
            tree = ET.parse(file_path)
            root = tree.getroot()
            
            # Use './/sentence' to find sentences even if they are deep in a <Review> tag
            for sentence in root.findall('.//sentence'):
                text_node = sentence.find('text')
                if text_node is None or text_node.text is None:
                    continue
                    
                text = text_node.text.strip()
                opinions = sentence.find('Opinions')
                
                if opinions is not None:
                    for op in opinions.findall('Opinion'):
                        target = op.get('target')
                        
                        # Handle NULL Aspects
                        if not target or str(target).upper() == "NULL":
                            final_aspect, start, end = "[ASPECT]", 0, 0
                        else:
                            final_aspect = target
                            start = int(op.get('from', 0))
                            end = int(op.get('to', 0))
                        
                        unified_rows.append({
                            "sentence_id": f"2016_{global_idx:04d}",
                            "sentence": text,
                            "aspect": final_aspect,
                            "from": start,
                            "to": end,
                            "polarity": op.get('polarity').lower().strip()
                        })
                global_idx += 1
        except Exception as e:
            print(f"Error skipping {file_name}: {e}")

    df = pd.DataFrame(unified_rows)
    output_path = os.path.join(PROCESSED_DIR, output_filename)
    df.to_csv(output_path, index=False)
    return df

print("Final 2016 Processor ready. It will capture Train, Test (.gold), and Trial files.")

Final 2016 Processor ready. It will capture Train, Test (.gold), and Trial files.


In [18]:
# Execute the call for 2016
df_2016 = process_semeval_2016_sb1("res_2016", "2016_rest_reviews.csv")

# Quick Verification
print(f"\nTotal rows captured for 2016: {len(df_2016)}")
df_2016.head()

Reading 2016 File: ABSA16_Restaurants_Train_SB1_v2.xml
Reading 2016 File: EN_REST_SB1_TEST.xml.gold

Total rows captured for 2016: 3366


Unnamed: 0,sentence_id,sentence,aspect,from,to,polarity
0,2016_0001,Judging from previous posts this used to be a ...,place,51,56,negative
1,2016_0002,"We, there were four of us, arrived at noon - t...",staff,75,80,negative
2,2016_0003,"They never brought us complimentary noodles, i...",[ASPECT],0,0,negative
3,2016_0004,The food was lousy - too sweet or too salty an...,food,4,8,negative
4,2016_0004,The food was lousy - too sweet or too salty an...,portions,52,60,negative


In [None]:
# Check if the 2016 dataframe exists in memory
if 'df_2016' in locals():
    total_rows_2016 = len(df_2016)
    unique_ids_2016 = df_2016['sentence_id'].nunique()
    avg_aspects_2016 = total_rows_2016 / unique_ids_2016 if unique_ids_2016 > 0 else 0
    
    print("--- 2016 Data Verification ---")
    print(f"Total Aspect-Level Rows: {total_rows_2016}")
    print(f"Unique Sentence IDs:      {unique_ids_2016}")
    print(f"Average Aspects/Sentence: {avg_aspects_2016:.2f}")
    
    # Check for multi-aspect density
    multi_aspect_2016 = total_rows_2016 - unique_ids_2016
    print(f"Sentences with >1 aspect: {multi_aspect_2016}")
    
    # Implicit vs Explicit check
    implicit_count = len(df_2016[df_2016['aspect'] == '[ASPECT]'])
    explicit_count = total_rows_2016 - implicit_count
    print(f"Explicit Aspects:         {explicit_count}")
    print(f"Implicit ([ASPECT]):      {implicit_count}")
    
   
else:
    print("Error: df_2016 not found. Please run the 2016 processing cell first.")

--- 2016 Data Verification ---
Total Aspect-Level Rows: 3366
Unique Sentence IDs:      2295
Average Aspects/Sentence: 1.47
Sentences with >1 aspect: 1071
Explicit Aspects:         2530
Implicit ([ASPECT]):      836

--- Sentiment Distribution (2016) ---
polarity
positive    2268
negative     953
neutral      145
Name: count, dtype: int64
