### 1. Import libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
import jieba
import joblib

In [4]:
def preprocess_text(text):
    """Segment a review into space-separated tokens using jieba.

    Parameters
    ----------
    text : str, bytes, or a missing value
        Raw review text. Missing values (``None``, ``NaN``, ``pd.NA``) are
        treated as an empty review; any other non-text scalar is converted
        with ``str`` before segmentation.

    Returns
    -------
    str
        The tokens yielded by ``jieba.cut`` joined with single spaces, or
        ``''`` when the review is missing.
    """
    if not isinstance(text, (str, bytes)):
        # pd.read_csv represents an empty cell as float NaN, which is not
        # text that can be segmented; map it to an empty document instead.
        if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            return ''
        text = str(text)
    return ' '.join(jieba.cut(text))

def convert_sentiment(score):
    """Translate a numeric sentiment score into its categorical label.

    Parameters
    ----------
    score : int (or an equal-valued numeric scalar)
        One of ``-2`` (not mentioned), ``-1`` (negative), ``0`` (neutral)
        or ``1`` (positive).

    Returns
    -------
    str
        ``'not_mentioned'``, ``'negative'``, ``'neutral'`` or ``'positive'``.

    Raises
    ------
    ValueError
        If ``score`` is not one of the four known values (for example NaN
        or a string), rather than silently labelling it ``'positive'``.
    """
    labels = {
        -2: 'not_mentioned',
        -1: 'negative',
        0: 'neutral',
        1: 'positive',
    }
    try:
        return labels[score]
    except (KeyError, TypeError):
        # KeyError: unknown value (including NaN); TypeError: unhashable input.
        raise ValueError(f"Unexpected sentiment score: {score!r}") from None

def load_and_preprocess_data(file_path):
    """Load one dataset split from CSV and build model inputs and labels.

    Parameters
    ----------
    file_path : str or path-like
        CSV file containing a ``review`` column plus one sentiment-score
        column per aspect. Columns named ``id``, ``review`` and ``star`` are
        not treated as aspects.

    Returns
    -------
    tuple
        ``(reviews, labels, aspect_columns)`` where ``reviews`` is a Series
        named ``processed_review`` holding each review after
        ``preprocess_text``, ``labels`` is an object-dtype DataFrame with one
        column per aspect holding the output of ``convert_sentiment``, and
        ``aspect_columns`` is the list of aspect column names in file order.
    """
    df = pd.read_csv(file_path)

    # Every column other than the identifier, text and star rating is an
    # aspect, e.g. Food#Appearance, Service#Queue, etc.
    aspect_columns = [col for col in df.columns if col not in ('id', 'review', 'star')]

    # Convert sentiment scores to categorical labels, one column at a time.
    # astype('object') keeps the label dtype the same as before.
    y = (
        df[aspect_columns]
        .apply(lambda scores: scores.map(convert_sentiment))
        .astype('object')
    )

    # Tokenise the reviews without adding a column to the loaded frame.
    processed_reviews = df['review'].apply(preprocess_text).rename('processed_review')

    return processed_reviews, y, aspect_columns

# Locations of the three dataset splits, relative to the notebook.
train_path, dev_path, test_path = (
    "../data/train.csv",
    "../data/dev.csv",
    "../data/test.csv",
)

# Only the training split's aspect column list is kept; the lists returned
# for the dev and test splits are discarded.
X_train, y_train, aspect_columns = load_and_preprocess_data(file_path=train_path)
X_dev, y_dev, _ = load_and_preprocess_data(file_path=dev_path)
X_test, y_test, _ = load_and_preprocess_data(file_path=test_path)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.322 seconds.
Prefix dict has been built successfully.


In [7]:
# Report the sizes of the training inputs and the label matrix.
print(f"Training data shape: {X_train.shape}, {y_train.shape}\n")

# Show the first five samples: a 20-character preview of the tokenised
# review next to its per-aspect labels.
print("Example of training data:")
for i in range(5):
    review_preview = X_train.iloc[i][:20]
    aspect_labels = y_train.iloc[i].to_dict()
    print(f"Review: {review_preview}, Labels: {aspect_labels}")

Training data shape: (36850,), (36850, 18)

Example of training data:
Review: 状元 楼 饭店 第一次 去 ， 因为 地, Labels: {'Location#Transportation': 'positive', 'Location#Downtown': 'positive', 'Location#Easy_to_find': 'positive', 'Service#Queue': 'not_mentioned', 'Service#Hospitality': 'positive', 'Service#Parking': 'not_mentioned', 'Service#Timely': 'not_mentioned', 'Price#Level': 'not_mentioned', 'Price#Cost_effective': 'not_mentioned', 'Price#Discount': 'not_mentioned', 'Ambience#Decoration': 'positive', 'Ambience#Noise': 'not_mentioned', 'Ambience#Space': 'not_mentioned', 'Ambience#Sanitary': 'not_mentioned', 'Food#Portion': 'not_mentioned', 'Food#Taste': 'positive', 'Food#Appearance': 'not_mentioned', 'Food#Recommend': 'not_mentioned'}
Review: 我 最 爱 他们 家 的 猪手 ， 麻辣, Labels: {'Location#Transportation': 'positive', 'Location#Downtown': 'not_mentioned', 'Location#Easy_to_find': 'not_mentioned', 'Service#Queue': 'not_mentioned', 'Service#Hospitality': 'positive', 'Service#Parking': 'not_mentioned'