In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
import string

In [2]:
def replace_polish_lowercase(text):
    polish_to_latin = {
        'ą': 'a', 'ć': 'c', 'ę': 'e', 'ł': 'l', 'ń': 'n', 
        'ó': 'o', 'ś': 's', 'ź': 'z', 'ż': 'z'
    }
    return ''.join(polish_to_latin.get(char, char) for char in text)

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove polish letters
    text = replace_polish_lowercase(text)
    # Tokenize (split by whitespace)
    tokens = text.split()
    return ' '.join(tokens)  # Join tokens back into a string for TF-IDF

In [3]:
FILENAME_DEV = './data/dataset_conll/all.sentence.dev.txt'
FILENAME_TRAIN = './data/dataset_conll/all.sentence.train.txt'
FILENAME_TEST = './data/dataset_conll/all.sentence.test.txt'

LABELS = {
    "__label__z_minus_m": "Negative sentiment", 
    "__label__z_plus_m": "Positive sentiment",
    "__label__z_zero": "No sentiment",
    "__label__z_amb": "Unsure",
    }


In [4]:
def load_df(filename):
    with open(filename, 'r', encoding="utf8") as f:
        lines = f.readlines()
        data = [line.strip() for line in lines]
        # get last word from each item as a label and match with data
        labeled_data = []
        for item in data:
            label = item.split()[-1]
            sentence_data = ' '.join(item.split()[:-1])
            labeled_data.append((sentence_data, LABELS[label]))

        # create a dataframe
        df = pd.DataFrame(labeled_data, columns=['sentence', 'label'])
        return df

# Read data

In [5]:
df_train = load_df(FILENAME_TRAIN)
df_test = load_df(FILENAME_TEST)

Early preprocessing

In [6]:
df_train['processed_sentence'] = df_train['sentence'].apply(preprocess_text)
df_test['processed_sentence'] = df_test['sentence'].apply(preprocess_text)

In [7]:
df_train.sample(5)

Unnamed: 0,sentence,label,processed_sentence
34228,Na plus położenie pensjonatu blisko deptaka i ...,Positive sentiment,na plus polozenie pensjonatu blisko deptaka i ...
10002,Pobyt w hotelu zaliczam do bardzo nieudanego .,Negative sentiment,pobyt w hotelu zaliczam do bardzo nieudanego
8308,Operację przeprowadzono przy pomocy robota da ...,No sentiment,operacje przeprowadzono przy pomocy robota da ...
32896,Zgadzam się w zupełności z powyższymi opiniami .,No sentiment,zgadzam sie w zupelnosci z powyzszymi opiniami
16361,Na plus w pokoju codziennie uzupełniana woda .,Positive sentiment,na plus w pokoju codziennie uzupelniana woda


In [8]:
# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(df_train['processed_sentence'])  # Fit on train data
X_test = vectorizer.transform(df_test['processed_sentence'])  # Transform test data

In [17]:
# One-hot encode the labels
encoder = OneHotEncoder(sparse=False)

y_train_original = df_train[['label']]
y_test_original = df_test[['label']]

y_train = encoder.fit_transform(y_train_original)  # Fit on train data labels
y_test = encoder.transform(y_test_original)       # Transform test data labels

# Classification

In [10]:
# Decision Tree Classifier
classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)  # Fit only on train data

DecisionTreeClassifier()

In [11]:
y_pred = classifier.predict(X_test)

In [12]:
# Get bit mask where prediction isn't valid one hot encoding
valid_mask = y_pred.sum(axis=1) == 1
y_pred = y_pred[valid_mask]
y_test = y_test[valid_mask]
print(f"Couldn't predict {len(df_test) - len(y_pred)} samples")

Couldn't predict 4 samples


In [13]:
y_pred_labels = encoder.inverse_transform(y_pred)
y_test_labels = encoder.inverse_transform(y_test)

In [14]:
accuracy = accuracy_score(y_test_labels, y_pred_labels)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.53


# Gradient boosting

In [15]:
from sklearn.ensemble import GradientBoostingClassifier

In [18]:
gradient_classifier = GradientBoostingClassifier()
gradient_classifier.fit(X_train, y_train_original)

  return f(*args, **kwargs)


GradientBoostingClassifier()

In [19]:
y_pred_gradient = gradient_classifier.predict(X_test)

In [20]:
accuracy_gradient = accuracy_score(y_test_original, y_pred_gradient)
print(f"Gradient Boosting Accuracy: {accuracy_gradient:.2f}")

Gradient Boosting Accuracy: 0.57
