### 1. Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
import jieba
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib

### 2. Load data and preprocess

In [None]:
def preprocess_text(text):
    """Segment ``text`` with jieba and return the tokens joined by single spaces."""
    return ' '.join(jieba.cut(text))

def convert_sentiment(score):
    """Map a numeric aspect score to its sentiment label.

    Args:
        score: One of -2, -1, 0 or 1 (an int or an equal-valued float).

    Returns:
        'not_mentioned' for -2, 'negative' for -1, 'neutral' for 0 and
        'positive' for 1.

    Raises:
        ValueError: If ``score`` is none of the four known codes. NaN also
            lands here because it compares unequal to every number.
    """
    if score == -2:
        return 'not_mentioned'
    if score == -1:
        return 'negative'
    if score == 0:
        return 'neutral'
    if score == 1:
        return 'positive'
    # Fail loudly instead of silently counting corrupt or missing scores as
    # 'positive', which would skew both the EDA and any trained model.
    raise ValueError(f"Unexpected sentiment score: {score!r}")

def load_and_preprocess_data(file_path):
    """Read one dataset split from CSV and prepare reviews and labels.

    Every column other than 'id', 'review' and 'star' is treated as an
    aspect column (e.g. Food#Appearance, Service#Price).

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A 3-tuple ``(reviews, labels, aspect_columns)``:
        the 'processed_review' Series produced by ``preprocess_text``,
        a DataFrame of aspect columns whose scores were converted with
        ``convert_sentiment``, and the list of aspect column names.
    """
    frame = pd.read_csv(file_path)

    non_aspect = ('id', 'review', 'star')
    aspect_columns = [name for name in frame.columns if name not in non_aspect]

    # Cast to object first, presumably so the string labels can replace the
    # numeric scores column by column without a dtype clash.
    labels = frame[aspect_columns].astype('object')
    for name in labels.columns:
        labels.loc[:, name] = labels[name].map(convert_sentiment)

    # Tokenised review text is stored alongside the raw columns.
    frame['processed_review'] = frame['review'].map(preprocess_text)

    return frame['processed_review'], labels, aspect_columns

# CSV location of each split, relative to the notebook's working directory.
train_path, dev_path, test_path = (
    "../data/train.csv",
    "../data/dev.csv",
    "../data/test.csv",
)

# The aspect column list is taken from the training split only; the lists
# returned for dev and test are discarded.
X_train, y_train, aspect_columns = load_and_preprocess_data(train_path)
X_dev, y_dev, _ = load_and_preprocess_data(dev_path)
X_test, y_test, _ = load_and_preprocess_data(test_path)

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.288 seconds.
Prefix dict has been built successfully.


In [None]:
print(f"Train shape: {X_train.shape}, {y_train.shape}")
print(f"Dev shape: {X_dev.shape}, {y_dev.shape}")
print(f"Test shape: {X_test.shape}, {y_test.shape}\n")

# Show the first few training examples: the first 20 characters of the
# processed review plus its per-aspect labels. The count is capped at the
# split size so a tiny or debug dataset does not raise IndexError.
print("Example of training data:")
for i in range(min(5, len(X_train))):
    print(f"Review: {X_train.iloc[i][:20]}, Labels: {y_train.iloc[i].to_dict()}")

### 3. Exploratory data analysis

In [None]:
# Directory that receives every EDA figure; created if it does not exist yet.
OUTPUT_DIR = "eda_plots"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Fixed bar color per sentiment category. The insertion order of this
# mapping is also the left-to-right order used when plotting.
SENTIMENT_COLORS = {
    'not_mentioned': '#808080',  # Gray
    'negative': '#FF0000',       # Red
    'neutral': '#1F77B4',        # Blue
    'positive': '#2CA02C'        # Green
}
SENTIMENTS = list(SENTIMENT_COLORS)

def annotate_bars(ax, fmt=None):
    """Write each bar's height just above it, skipping bars that are not above zero.

    Args:
        ax: Axes-like object exposing ``patches`` and ``annotate``.
        fmt: Optional format spec (e.g. ``'.1%'``) applied to every height.
            When omitted, whole-number heights are printed as integers and
            fractional heights with two decimals, so proportion charts are
            no longer labeled "0" on every bar.
    """
    for bar in ax.patches:
        height = bar.get_height()
        if not height > 0:  # zero, negative and NaN heights get no label
            continue

        if fmt is not None:
            label = format(height, fmt)
        elif float(height).is_integer():
            label = f'{int(height)}'
        else:
            label = f'{height:.2f}'

        ax.annotate(label,
                    (bar.get_x() + bar.get_width() / 2., height),
                    ha='center', va='bottom', xytext=(0, 5), textcoords='offset points')

def _save_and_close(fig, filename):
    """Tighten the layout of ``fig``, save it under OUTPUT_DIR and close it."""
    fig.tight_layout()
    fig.savefig(os.path.join(OUTPUT_DIR, filename))
    plt.close(fig)


def perform_eda(y, dataset_name):
    """Save EDA plots for one split's aspect labels and print summary statistics.

    Three kinds of figures are written to OUTPUT_DIR: the share of reviews
    mentioning each aspect, one sentiment count plot per aspect, and the
    distribution of the number of aspects mentioned per review.

    Args:
        y: DataFrame with one column per aspect holding sentiment label
            strings; 'not_mentioned' marks an absent aspect.
        dataset_name: Split name used in plot titles and output file names.
    """
    mentioned = y != 'not_mentioned'

    # 1. Aspect Mention Frequency
    mention_freq = mentioned.mean()
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(x=mention_freq.index, y=mention_freq.values, ax=ax)
    ax.set_title(f"Aspect Mention Frequency in {dataset_name} Dataset")
    ax.set_ylabel("Proportion of Reviews Mentioning Aspect")
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    annotate_bars(ax)
    _save_and_close(fig, f"aspect_mention_frequency_{dataset_name}.png")

    # 2. Sentiment Distribution for Each Aspect with fixed order and colors
    for aspect in y.columns:
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.countplot(data=y, x=aspect, order=SENTIMENTS, hue=aspect,
                      palette=SENTIMENT_COLORS, legend=False, ax=ax)
        ax.set_title(f"Sentiment Distribution for {aspect} in {dataset_name} Dataset")
        ax.set_xlabel("Sentiment")
        ax.set_ylabel("Count")
        annotate_bars(ax)
        _save_and_close(fig, f"sentiment_distribution_{aspect}_{dataset_name}.png")

    # 3. Distribution of Number of Aspects Mentioned per Review
    num_aspects_mentioned = mentioned.sum(axis=1)
    fig, ax = plt.subplots(figsize=(10, 6))
    # discrete=True draws one unit-wide bar centered on every integer count.
    # Integer bin edges 0..n would leave the last bin closed on both sides,
    # merging reviews that mention n-1 aspects with those that mention all n.
    sns.histplot(num_aspects_mentioned, discrete=True, kde=False, ax=ax)
    ax.set_title(f"Distribution of Number of Aspects Mentioned per Review in {dataset_name} Dataset")
    ax.set_xlabel("Number of Aspects Mentioned")
    ax.set_ylabel("Number of Reviews")
    annotate_bars(ax)
    _save_and_close(fig, f"num_aspects_mentioned_{dataset_name}.png")

    # 4. Summary Statistics
    print(f"{dataset_name.capitalize()}")
    print(f"Total reviews: {len(y)}")
    avg_mentions = num_aspects_mentioned.mean()
    print(f"Average number of aspects mentioned per review: {avg_mentions:.2f}\n")

In [None]:
# Run the same EDA on every split, in train -> dev -> test order.
for split_labels, split_name in ((y_train, "train"), (y_dev, "dev"), (y_test, "test")):
    perform_eda(split_labels, split_name)