In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
import string

In [2]:
def replace_polish_lowercase(text):
    polish_to_latin = {
        'ą': 'a', 'ć': 'c', 'ę': 'e', 'ł': 'l', 'ń': 'n', 
        'ó': 'o', 'ś': 's', 'ź': 'z', 'ż': 'z'
    }
    return ''.join(polish_to_latin.get(char, char) for char in text)

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove polish letters
    text = replace_polish_lowercase(text)
    # Tokenize (split by whitespace)
    tokens = text.split()
    return ' '.join(tokens)  # Join tokens back into a string for TF-IDF

In [3]:
FILENAME_DEV = './data/dataset_conll/all.sentence.dev.txt'
FILENAME_TRAIN = './data/dataset_conll/all.sentence.train.txt'
FILENAME_TEST = './data/dataset_conll/all.sentence.test.txt'

LABELS = {
    "__label__z_minus_m": "Negative sentiment", 
    "__label__z_plus_m": "Positive sentiment",
    "__label__z_zero": "No sentiment",
    "__label__z_amb": "Unsure",
    }


In [4]:
def load_df(filename):
    with open(filename, 'r', encoding="utf8") as f:
        lines = f.readlines()
        data = [line.strip() for line in lines]
        # get last word from each item as a label and match with data
        labeled_data = []
        for item in data:
            label = item.split()[-1]
            sentence_data = ' '.join(item.split()[:-1])
            labeled_data.append((sentence_data, LABELS[label]))

        # create a dataframe
        df = pd.DataFrame(labeled_data, columns=['sentence', 'label'])
        return df

# Read data

In [5]:
df_train = load_df(FILENAME_TRAIN)
df_test = load_df(FILENAME_TEST)

Early preprocessing

In [6]:
df_train['processed_sentence'] = df_train['sentence'].apply(preprocess_text)
df_test['processed_sentence'] = df_test['sentence'].apply(preprocess_text)

In [7]:
df_train.sample(5)

Unnamed: 0,sentence,label,processed_sentence
17279,Po tygodniu pytanie ponowił em .,No sentiment,po tygodniu pytanie ponowil em
4164,Niestety nie można też otwierać okien w pokoja...,Negative sentiment,niestety nie mozna tez otwierac okien w pokojach
11742,Inny endokrynolog od razu stwierdził że przy t...,Negative sentiment,inny endokrynolog od razu stwierdzil ze przy t...
13645,"Leżaki od rana pozajmowane , nie ma gdzie się ...",Negative sentiment,lezaki od rana pozajmowane nie ma gdzie sie po...
6680,""" Ich badania pomogły w zrozumieniu wielu proc...",No sentiment,ich badania pomogly w zrozumieniu wielu proces...


# Classification

In [8]:
import google.generativeai as genai
import os
import enum
from typing_extensions import TypedDict

class Choice(enum.Enum):
    Negative = "Negative sentiment"
    Positive = "Positive sentiment"
    NoSentiment = "No sentiment"
    Unsure = "Unsure"
class Sentiment(TypedDict):
    sentence_id: int
    sentiment: Choice
    
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
model = genai.GenerativeModel("gemini-1.5-flash")
generation_config={"response_mime_type": "application/json",
                   "response_schema": list[Sentiment]}




In [9]:
output_dir = "output_chunks"
os.makedirs(output_dir, exist_ok=True)

chunks = [df_test.iloc[i:i + 400] for i in range(0, len(df_test), 400)]

# for i, chunk in enumerate(chunks):
#     query = '\n'.join(f"{index}: {row['processed_sentence']}" for index, row in chunk.iterrows())
#     response = model.generate_content(
#         ["Classify the sentiment for each sentence:", query],
#         generation_config=generation_config
#     )
    
#     output_file = os.path.join(output_dir, f"chunk_{i + 1}.txt")
#     with open(output_file, "w", encoding="utf-8") as f:
#         f.write(response.text)

In [10]:
import json

combined_data = []

for filename in os.listdir(output_dir):
    if filename.endswith('.txt'):
        filepath = os.path.join(output_dir, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            data = json.load(file)
            combined_data.extend(data)

output_filepath = os.path.join(output_dir, 'combined_data.json')
with open(output_filepath, 'w', encoding='utf-8') as output_file:
    json.dump(combined_data, output_file, indent=4)

In [12]:
df = pd.DataFrame(combined_data)
df.head()

Unnamed: 0,sentence_id,sentiment
0,0,Negative sentiment
1,1,Negative sentiment
2,2,No sentiment
3,3,Negative sentiment
4,4,Positive sentiment


In [14]:
from sklearn.metrics import confusion_matrix, accuracy_score

df_test['sentence_id'] = df_test.index
df_merged = pd.merge(df_test, df, on='sentence_id')
df_merged['match'] = df_merged['label'] == df_merged['sentiment']
accuracy = accuracy_score(df_merged['label'], df_merged['sentiment'])
print(f'Accuracy: {accuracy:.2f}')
cm = confusion_matrix(df_merged['label'], df_merged['sentiment'], labels=df_test['label'].unique())
print('Confusion Matrix:')
print(cm)

Accuracy: 0.75
Confusion Matrix:
[[1937   56  125    4]
 [  24 1430   67    1]
 [ 210  269  937    1]
 [ 280  231  161    9]]
