### Converter

In [None]:
import json
import os
import random

def _token_spans(text):
    """Return ``(start, end)`` character offsets of each whitespace-separated token.

    Tokens are exactly those produced by ``text.split()``; offsets are located
    in the original string so runs of spaces, tabs or newlines do not shift them.
    """
    spans = []
    cursor = 0
    for token in text.split():
        # A token never starts with whitespace, so the first match at or after
        # the cursor is the token's real position in the text.
        start = text.index(token, cursor)
        cursor = start + len(token)
        spans.append((start, cursor))
    return spans


def _span_to_word_indices(token_spans, start_char, end_char):
    """Return indices of tokens overlapping the range ``[start_char, end_char)``."""
    return [
        index
        for index, (tok_start, tok_end) in enumerate(token_spans)
        if tok_start < end_char and tok_end > start_char
    ]


def convert_label_studio_to_pyabsa(input_file, output_dir, seed=None):
    """
    Convert Label Studio export format to PyABSA EMCGCN format.

    Every annotation that yields at least one (aspect, opinion, sentiment)
    triplet becomes one line of the form ``<text>####[<triplets>]``, where each
    triplet holds the aspect word indices, the opinion word indices and the
    relation's first label (``'NEU'`` when the relation has no labels).
    The lines are shuffled, split 70/15/15 and written to ``train.txt``,
    ``valid.txt`` and ``test.txt`` inside *output_dir*.

    Args:
        input_file: Path to the Label Studio JSON export (a list of items).
        output_dir: Directory for the three output files; created if missing.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` state is used, as before.

    Returns:
        None. A summary of the split sizes is printed.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Read the Label Studio export file
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    converted_data = []

    for item in data:
        annotations = item.get('annotations')
        if not annotations:
            continue

        # Prefer data.text, fall back to a top-level text field.
        text = (item.get('data') or {}).get('text', '') or item.get('text', '')
        if not text:
            # Use .get so an item without an id produces a warning, not a KeyError.
            print(f"Warning: No text found for item {item.get('id', '<unknown>')}")
            continue

        # Tokenise once per item; every labelled span reuses these offsets.
        token_spans = _token_spans(text)
        # Collapse whitespace in the emitted sentence: the output is one sample
        # per line, so an embedded newline would split a record. Word indices
        # are unaffected because they refer to whitespace-separated tokens.
        sentence = ' '.join(text.split())

        for annotation in annotations:
            aspects = {}    # span id -> word indices
            opinions = {}   # span id -> word indices
            relations = []  # (from_id, to_id, sentiment)

            for result in annotation.get('result') or []:
                result_type = result.get('type')

                if result_type == 'labels':
                    value = result.get('value') or {}
                    span_id = result.get('id')
                    required = ('text', 'labels', 'start', 'end')
                    if span_id is None or any(key not in value for key in required):
                        continue

                    word_indices = _span_to_word_indices(
                        token_spans, value['start'], value['end']
                    )
                    if not word_indices:
                        # Span covers no token (e.g. whitespace only); an empty
                        # index list would be a meaningless triplet member.
                        continue

                    # setdefault keeps the first span for a repeated id.
                    if 'aspect' in value['labels']:
                        aspects.setdefault(span_id, word_indices)
                    elif 'opinion' in value['labels']:
                        opinions.setdefault(span_id, word_indices)

                elif result_type == 'relation':
                    labels = result.get('labels') or []
                    relations.append((
                        result.get('from_id'),
                        result.get('to_id'),
                        labels[0] if labels else 'NEU',
                    ))

            # Generate PyABSA format triplets; relations are expected to run
            # from an aspect span to an opinion span, others are dropped.
            triplets = []
            for from_id, to_id, sentiment in relations:
                aspect_indices = aspects.get(from_id)
                opinion_indices = opinions.get(to_id)
                if aspect_indices is not None and opinion_indices is not None:
                    triplets.append((aspect_indices, opinion_indices, sentiment))

            if triplets:
                pyabsa_format = f"{sentence}####[{', '.join(str(t) for t in triplets)}]"
                converted_data.append(pyabsa_format)

    # Shuffle data for randomness (seeded when requested for reproducibility)
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(converted_data)

    # Split data into train, validation, and test sets (70/15/15 split)
    n = len(converted_data)
    train_size = int(0.7 * n)
    val_size = int(0.15 * n)

    train_data = converted_data[:train_size]
    valid_data = converted_data[train_size:train_size + val_size]
    test_data = converted_data[train_size + val_size:]

    # Write to output files
    splits = (
        ('train.txt', train_data),
        ('valid.txt', valid_data),
        ('test.txt', test_data),
    )
    for filename, lines in splits:
        with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as f:
            f.write('\n'.join(lines))

    print(f"Conversion complete. Created {len(train_data)} train, {len(valid_data)} validation, and {len(test_data)} test samples.")

# Run the conversion only when executed as a script or notebook cell, so that
# importing this module does not trigger file I/O on machine-specific paths.
if __name__ == "__main__":
    # Define input and output paths
    input_file = r"C:\Users\gungi\OneDrive\Desktop\Tugas Akhir\Code\Coba ABSA\2. Coba ABSA\data12_2500_NEWEST\absa_2500data_newest.json"
    output_dir = r"C:\Users\gungi\OneDrive\Desktop\Tugas Akhir\Code\Coba ABSA\2. Coba ABSA\data12_2500_NEWEST"

    # Execute conversion
    convert_label_studio_to_pyabsa(input_file, output_dir)

Conversion complete. Created 1757 train, 376 validation, and 378 test samples.
