In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
import string

In [16]:
def replace_polish_lowercase(text):
    """Swap lowercase Polish diacritic letters for their plain Latin look-alikes.

    Only the nine lowercase letters ą ć ę ł ń ó ś ź ż are mapped; every other
    character (uppercase Polish letters included) is passed through unchanged.

    Args:
        text: The string to transliterate.

    Returns:
        A new string of the same length with the mapped letters replaced.
    """
    # Position i of the first string is replaced by position i of the second.
    diacritics_table = str.maketrans('ąćęłńóśźż', 'acelnoszz')
    return text.translate(diacritics_table)

def preprocess_text(text):
    """Normalise a raw sentence into the form fed to the TF-IDF vectorizer.

    Steps, in order: lowercase, delete every character listed in
    ``string.punctuation``, replace lowercase Polish diacritic letters via
    ``replace_polish_lowercase``, then collapse whitespace.

    Args:
        text: The raw sentence.

    Returns:
        The cleaned sentence with tokens separated by single spaces.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = replace_polish_lowercase(text.lower().translate(strip_punctuation))
    # split() without arguments discards leading/trailing whitespace and treats
    # any run of whitespace as one separator, so the re-join normalises spacing.
    return ' '.join(cleaned.split())

In [17]:
# Relative paths to the dev / train / test sentence files.
FILENAME_DEV = './data/dataset_conll/all.sentence.dev.txt'
FILENAME_TRAIN = './data/dataset_conll/all.sentence.train.txt'
FILENAME_TEST = './data/dataset_conll/all.sentence.test.txt'

# Maps the raw label token found in the data files to a human-readable
# class name; these readable names are what end up in the 'label' column.
LABELS = {
    "__label__z_minus_m": "Negative sentiment", 
    "__label__z_plus_m": "Positive sentiment",
    "__label__z_zero": "No sentiment",
    "__label__z_amb": "Unsure",
    }


In [18]:
def load_df(filename):
    """Read a labelled sentence file into a DataFrame.

    Each non-blank line is treated as whitespace-separated tokens whose last
    token is a key of ``LABELS``; the preceding tokens, re-joined with single
    spaces, form the sentence.

    Args:
        filename: Path to a UTF-8 encoded text file.

    Returns:
        A DataFrame with columns ``sentence`` and ``label``, where ``label``
        holds the human-readable name looked up in ``LABELS``.

    Raises:
        KeyError: If the last token of a line is not a key of ``LABELS``.
    """
    labeled_data = []
    with open(filename, 'r', encoding="utf8") as f:
        # Iterate the file lazily instead of materialising every line first.
        for line in f:
            tokens = line.split()
            # Blank or whitespace-only lines (e.g. a trailing empty line) have
            # no label token; skip them rather than fail with IndexError.
            if not tokens:
                continue
            *words, label = tokens
            labeled_data.append((' '.join(words), LABELS[label]))

    return pd.DataFrame(labeled_data, columns=['sentence', 'label'])

# Read data

In [19]:
# Load the train and test files into their own frames (train first, then test)
df_train, df_test = map(load_df, (FILENAME_TRAIN, FILENAME_TEST))

Early preprocessing

In [20]:
# Add the cleaned text as a new column on both frames, train first
for frame in (df_train, df_test):
    frame['processed_sentence'] = frame['sentence'].apply(preprocess_text)

In [21]:
# Seeded so the same five rows are shown on every Restart & Run All
df_train.sample(5, random_state=42)

Unnamed: 0,sentence,label,processed_sentence
31563,Jak takie częste psucie ma się do ekologii dro...,Negative sentiment,jak takie czeste psucie ma sie do ekologii dro...
44009,"Wejście po schodach , budynek i okolica w porz...",Positive sentiment,wejscie po schodach budynek i okolica w porzadku
43068,""" Jeśli zmiany łuszczycowe występują na twarzy...",No sentiment,jesli zmiany luszczycowe wystepuja na twarzy c...
15229,"Zastanawiała m się nad tabletkami , póżniej na...",No sentiment,zastanawiala m sie nad tabletkami pozniej nad ...
31540,Poszła m po rozum do głowy i zarejestrowała m ...,No sentiment,poszla m po rozum do glowy i zarejestrowala m ...


In [22]:
# TF-IDF features: the vocabulary and IDF weights are learned from the
# training text only and then reused unchanged for the test text.
train_corpus = df_train['processed_sentence']
test_corpus = df_test['processed_sentence']

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_corpus)
X_test = vectorizer.transform(test_corpus)

In [30]:
# One-hot encode the labels.
# The keyword that requests dense output was renamed from `sparse` to
# `sparse_output` and the old spelling was later removed, so neither keyword
# works on every scikit-learn release. Use the default (sparse) output and
# densify explicitly instead, which gives the same dense arrays everywhere.
encoder = OneHotEncoder()
y_train = encoder.fit_transform(df_train[['label']]).toarray()  # Fit on train data labels
y_test = encoder.transform(df_test[['label']]).toarray()        # Transform test data labels

# Classification

In [24]:
# Decision Tree Classifier
# Tree construction involves randomness (feature order / tie-breaking), so a
# fixed seed is needed for the fitted model and the metrics to be repeatable.
RANDOM_STATE = 42
classifier = DecisionTreeClassifier(random_state=RANDOM_STATE)
classifier.fit(X_train, y_train)  # Fit only on train data

DecisionTreeClassifier()

In [25]:
# Run the fitted classifier on the TF-IDF features of the test split
y_pred = classifier.predict(X_test)

In [32]:
# A predicted row is a usable one-hot vector only if its entries sum to one.
# NOTE: rows failing that check are removed from BOTH y_pred and y_test, so
# anything computed from these arrays afterwards ignores those samples.
row_totals = y_pred.sum(axis=1)
valid_mask = row_totals == 1
y_pred, y_test = y_pred[valid_mask], y_test[valid_mask]
print(f"Couldn't predict {len(df_test) - len(y_pred)} samples")

Couldn't predict 5 samples


In [33]:
# Convert the one-hot rows back into label values (predictions first, then truth)
y_pred_labels, y_test_labels = map(encoder.inverse_transform, (y_pred, y_test))

In [34]:
# Fraction of samples whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test_labels, y_pred=y_pred_labels)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.53
