In [1]:
import pandas as pd
import re
import unicodedata
import contractions
import requests
from transformers import pipeline, AutoTokenizer
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

2025-03-19 20:30:49.368148: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-19 20:30:49.863424: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742391050.025752   13591 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742391050.075572   13591 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-19 20:30:50.405977: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
# Cleaning dulu sebelum dikasih ke model HuggingFace
# Please, CLEANING INI KEWAJIBAN karena main ke model HuggingFace orang lain

english_words = set(words.words())
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def loadCustomDict(path):
    with open(path, 'r') as file:
        return set(line.strip().lower() for line in file if line.strip())

def normalizeWhitespace(text):
    text = unicodedata.normalize('NFKC', text)
    text = contractions.fix(text)
    text = re.sub(r'[\t\r]+', ' ', text) # Menghapus tab
    text = re.sub(r'\b\d+\b', '', text) # Menghilangkan angka
    text = re.sub(r'[-‐‑‒–—―]+', '', text)
    text = re.sub(r'[_﹍﹎＿]', '', text)
    text = re.sub(r'[^\w\s]', '', text) # Hilangkan symbol punctuation
    text = re.sub(r'\b(\w+)(?:\s+\1\b)+', r'\1', text)
    text = re.sub(r'\s+', ' ', text).strip().lower()
    return text

def removeNonEnglish(text_series, custom_dict):
    pattern = r'\b(?:' + '|'.join(re.escape(word) for word in custom_dict) + r')\b'
    temp_series = text_series.str.replace(pattern, '', case=False, regex=True)
    split_words = temp_series.str.split()
    exploded = split_words.explode()
    exploded = exploded[exploded.str.lower().isin(english_words)]
    filtered = exploded[~exploded.str.lower().isin(stop_words)]
    lemmatized = filtered.apply(lambda word: lemmatizer.lemmatize(word.lower()))
    cleaned_text_series = lemmatized.groupby(level=0).agg(' '.join)
    pattern2 = r'\b(\w+)(?:\s+\1\b)+' #, r'\1', text)
    ser = cleaned_text_series.reindex(text_series.index, fill_value='')
    text = ser.str.replace(pattern2, r'\1', case=False, regex=True)
    return text

def removeOtherLanguage(text):
    phrase = ' translated'
    pos = text.find(phrase)
    if pos != -1:
        text = text[:pos].rstrip()
    text = re.sub(r'\b\w*[^\x00-\x7F]\w*\b', '', text)
    text = re.sub(r'\s+', ' ', text).strip().lower()
    return text

def simpleCleaning():
    custom_dict = loadCustomDict('custom_vocab.txt')
    df = pd.read_csv('./dataset/poem_dataset.csv')
    copyDf = df.copy()
    copyDf['poem'] = copyDf['poem'].apply(normalizeWhitespace)
    copyDf['poem'] = copyDf['poem'].apply(removeOtherLanguage)
    copyDf['poem'] = removeNonEnglish(copyDf['poem'], custom_dict)
    return df, copyDf


In [3]:
# Cleaning dataset
df, cleanDf = simpleCleaning()

# Labelling Using HuggingFace Model J-Hartmann Distilroberta-base

In [4]:
emotion_classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base")
tokenizer = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")

I0000 00:00:1739943398.443653   38238 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5563 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at j-hartmann/emotion-english-distilroberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
Device set to use 0


In [None]:
def predictSentimentMergeHartmann(df, cleanDf):
    labels = []
    scores = []
    for idx, poem in enumerate(cleanDf.values):
        tokens = tokenizer(poem[0], max_length=512, truncation=True, return_tensors="tf")
        result = emotion_classifier(tokenizer.decode(tokens['input_ids'][0], truncation=True))[0]
        print(f"Predict ke-{idx}. Label : {result['label']}. Score : {round(result['score'], 5)}")
        scores.append(round(result['score'], 5))
        labels.append(result['label'])

    print("Merging")
    pd.concat([df, pd.DataFrame({'label':labels, 'score':scores})], axis=1).to_csv("./dataset/labelled_poem_hartmann.csv", index=False)
    

In [None]:
final = predictSentimentMergeHartmann(df, cleanDf)

Predict ke-0. Label : surprise. Score : 0.83405
Predict ke-1. Label : fear. Score : 0.57097
Predict ke-2. Label : sadness. Score : 0.9278
Predict ke-3. Label : sadness. Score : 0.77741
Predict ke-4. Label : anger. Score : 0.58882
Predict ke-5. Label : sadness. Score : 0.98263
Predict ke-6. Label : joy. Score : 0.79546
Predict ke-7. Label : fear. Score : 0.96828
Predict ke-8. Label : fear. Score : 0.86059
Predict ke-9. Label : anger. Score : 0.96589
Predict ke-10. Label : fear. Score : 0.64629
Predict ke-11. Label : anger. Score : 0.92384
Predict ke-12. Label : sadness. Score : 0.92641
Predict ke-13. Label : anger. Score : 0.93367
Predict ke-14. Label : sadness. Score : 0.98694
Predict ke-15. Label : fear. Score : 0.36186
Predict ke-16. Label : sadness. Score : 0.62557
Predict ke-17. Label : fear. Score : 0.93475
Predict ke-18. Label : sadness. Score : 0.81315
Predict ke-19. Label : sadness. Score : 0.92017
Predict ke-20. Label : anger. Score : 0.66822
Predict ke-21. Label : joy. Score 

Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors


Predict ke-26. Label : neutral. Score : 0.54312
Predict ke-27. Label : joy. Score : 0.9262
Predict ke-28. Label : sadness. Score : 0.88566
Predict ke-29. Label : sadness. Score : 0.64181
Predict ke-30. Label : sadness. Score : 0.41925
Predict ke-31. Label : fear. Score : 0.6899
Predict ke-32. Label : sadness. Score : 0.80871
Predict ke-33. Label : fear. Score : 0.87352
Predict ke-34. Label : joy. Score : 0.58043
Predict ke-35. Label : sadness. Score : 0.97432
Predict ke-36. Label : neutral. Score : 0.32475
Predict ke-37. Label : joy. Score : 0.49126
Predict ke-38. Label : joy. Score : 0.95761
Predict ke-39. Label : fear. Score : 0.99117
Predict ke-40. Label : sadness. Score : 0.98941
Predict ke-41. Label : anger. Score : 0.30843
Predict ke-42. Label : joy. Score : 0.489
Predict ke-43. Label : fear. Score : 0.48945
Predict ke-44. Label : sadness. Score : 0.97948
Predict ke-45. Label : sadness. Score : 0.96537
Predict ke-46. Label : disgust. Score : 0.52583
Predict ke-47. Label : sadness

# Labelling Using HuggingFace Model Bhadresh-Savani Distilbert-base

In [23]:
emotion_classifier = pipeline("text-classification", model="bhadresh-savani/distilbert-base-uncased-emotion")
tokenizer = AutoTokenizer.from_pretrained("bhadresh-savani/distilbert-base-uncased-emotion")

config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

I0000 00:00:1740012489.139793    2040 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5563 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


tokenizer_config.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use 0


In [None]:
def predictSentimentMergeSavani(df, cleanDf):
    labels = []
    scores = []
    for idx, poem in enumerate(cleanDf.values):
        tokens = tokenizer(poem[0], max_length=512, truncation=True, return_tensors="tf")
        result = emotion_classifier(tokenizer.decode(tokens['input_ids'][0], truncation=True))[0]
        print(f"Predict ke-{idx}. Label : {result['label']}. Score : {round(result['score'], 5)}")
        scores.append(round(result['score'], 5))
        labels.append(result['label'])

    print("Merging")
    pd.concat([df, pd.DataFrame({'label':labels, 'score':scores})], axis=1).to_csv("./dataset/labelled_poem_savani.csv", index=False)
    

In [None]:
final = predictSentimentMergeSavani(df, cleanDf)

Predict ke-0. Label : joy. Score : 0.95379
Predict ke-1. Label : sadness. Score : 0.68797
Predict ke-2. Label : joy. Score : 0.44294
Predict ke-3. Label : love. Score : 0.65365
Predict ke-4. Label : fear. Score : 0.78596
Predict ke-5. Label : love. Score : 0.43306
Predict ke-6. Label : joy. Score : 0.76008
Predict ke-7. Label : fear. Score : 0.99217
Predict ke-8. Label : love. Score : 0.92609
Predict ke-9. Label : anger. Score : 0.87743
Predict ke-10. Label : joy. Score : 0.46513
Predict ke-11. Label : sadness. Score : 0.99594
Predict ke-12. Label : sadness. Score : 0.55402
Predict ke-13. Label : joy. Score : 0.95946
Predict ke-14. Label : sadness. Score : 0.73611
Predict ke-15. Label : fear. Score : 0.50621
Predict ke-16. Label : sadness. Score : 0.9784
Predict ke-17. Label : sadness. Score : 0.98938
Predict ke-18. Label : sadness. Score : 0.9985
Predict ke-19. Label : sadness. Score : 0.99349
Predict ke-20. Label : sadness. Score : 0.51373
Predict ke-21. Label : joy. Score : 0.48168


Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors


Predict ke-26. Label : love. Score : 0.78738
Predict ke-27. Label : joy. Score : 0.94901
Predict ke-28. Label : joy. Score : 0.41169
Predict ke-29. Label : anger. Score : 0.72846
Predict ke-30. Label : sadness. Score : 0.85425
Predict ke-31. Label : fear. Score : 0.53317
Predict ke-32. Label : anger. Score : 0.65419
Predict ke-33. Label : anger. Score : 0.37558
Predict ke-34. Label : joy. Score : 0.84776
Predict ke-35. Label : sadness. Score : 0.43464
Predict ke-36. Label : anger. Score : 0.77608
Predict ke-37. Label : joy. Score : 0.9458
Predict ke-38. Label : anger. Score : 0.82863
Predict ke-39. Label : anger. Score : 0.61781
Predict ke-40. Label : sadness. Score : 0.62551
Predict ke-41. Label : fear. Score : 0.56249
Predict ke-42. Label : joy. Score : 0.41387
Predict ke-43. Label : joy. Score : 0.82728
Predict ke-44. Label : joy. Score : 0.97224
Predict ke-45. Label : sadness. Score : 0.95176
Predict ke-46. Label : sadness. Score : 0.90746
Predict ke-47. Label : joy. Score : 0.6811

## Attention

HuggingFace model used in this project return 2 things:

1. The `label` indicates the label predicted by model
2. The `score` indicates the confident of model predicting the data

# Labelling Using DeepSeek-Coder-v2-Lite-Instruct via LM Studio

In [2]:
# Cleaning dulu sebelum dikasih ke model HuggingFace
# Please, CLEANING INI KEWAJIBAN karena main ke model HuggingFace orang lain

def loadCustomDict(path):
    with open(path, 'r') as file:
        return set(line.strip().lower() for line in file if line.strip())

def normalizeWhitespace(text):
    text = unicodedata.normalize('NFKC', text)
    text = contractions.fix(text)
    text = re.sub(r'[\t\r]+', ' ', text) # Menghapus tab
    text = re.sub(r'\b\d+\b', '', text) # Menghilangkan angka
    text = re.sub(r'[^\w\s]', '', text) # Hilangkan symbol punctuation
    text = re.sub(r'\b(\w+)(?:\s+\1\b)+', r'\1', text) #hilangin double
    text = re.sub(r'\s+', ' ', text).strip().lower()
    return text

def removeOtherLanguage(text):
    phrase = ' translated'
    pos = text.find(phrase)
    if pos != -1:
        text = text[:pos].rstrip()
    text = re.sub(r'\b\w*[^\x00-\x7F]\w*\b', '', text)
    text = re.sub(r'\s+', ' ', text).strip().lower()
    return text

def simpleCleaning():
    df = pd.read_csv('./dataset/poem_dataset.csv')
    copyDf = df.copy()
    copyDf['poem'] = copyDf['poem'].apply(normalizeWhitespace)
    copyDf['poem'] = copyDf['poem'].apply(removeOtherLanguage)
    return df, copyDf

# Cleaning dataset
df, cleanDf = simpleCleaning()


In [3]:
def truncate_text(text, max_length):
    if len(text) <= max_length:
        return text
    return text[:max_length]

def predictSentimentMergeDeepSeek(df, cleanDf):
    labels = []
    for idx, poem in enumerate(cleanDf.values):
        url = "http://192.168.1.10:55500/v1/chat/completions"
        truncated_poem = truncate_text(poem, 4000)
        headers = {
            "Content-Type": "application/json"
        }
        payload = {
            "model": "deepseek-coder-v2-lite-instruct",
            "messages": [
                {
                    "role": "system", 
                    "content": "You are a poem analysis assistant. Your task is to analyze the emotion of a given poem and assign it one of the following labels: love, joy, sadness, hope, or other. You must only respond with one of these labels and nothing else. Do not provide any additional information, context, nor starting and opening sentence."
                },
                {
                    "role": "user",
                    "content": f"Analyze the following poem and assign it one of the following labels: love, joy, sadness, hope, or other. Respond with only the label and nothing else. Poem : {truncated_poem}",
                }
            ],
            "temperature": 0.5, 
            "max_tokens": -1 
        }
        response = requests.post(url, headers=headers, json=payload)
        if response.status_code == 200:
            response_data = response.json()
            reply = response_data['choices'][0]['message']['content']
            print(f"Predict ke-{idx}. Reply : {reply.lower()}")
            labels.append(reply.lower())
        else:
            print(f"Error: {response.status_code} - {response.text}")
            labels.append(' none')

    print("Merging")
    pd.concat([df, pd.DataFrame({'label':labels})], axis=1).to_csv("./dataset/labelled_poem_deepseek2.csv", index=False)
    

In [4]:
final = predictSentimentMergeDeepSeek(df, cleanDf)

Predict ke-0. Reply :  love
Predict ke-1. Reply :  other
Predict ke-2. Reply :  other.
Predict ke-3. Reply :  love.
Predict ke-4. Reply :  sad
Predict ke-5. Reply :  hope.
Predict ke-6. Reply :  love
Predict ke-7. Reply :  other
Predict ke-8. Reply :  hope.
Predict ke-9. Reply :  sadness
Predict ke-10. Reply :  other.
Predict ke-11. Reply :  other
Predict ke-12. Reply :  sadness
Predict ke-13. Reply :  other
Predict ke-14. Reply :  sadness
Predict ke-15. Reply :  hope.
Predict ke-16. Reply :  other
Predict ke-17. Reply :  other
Predict ke-18. Reply :  sadness
Predict ke-19. Reply :  sadness
Predict ke-20. Reply :  hope.
Predict ke-21. Reply :  other.
Predict ke-22. Reply :  other
Predict ke-23. Reply :  other.
Predict ke-24. Reply :  joy
Predict ke-25. Reply :  joy.
Predict ke-26. Reply :  hope.
Predict ke-27. Reply :  hope.
Predict ke-28. Reply :  sadness.
Predict ke-29. Reply :  joy.
Predict ke-30. Reply :  other
Predict ke-31. Reply :  sad
Predict ke-32. Reply :  hope.
Predict ke-33

In [4]:
tes = ' uio'
tes.replace('.','').replace(' ','')

'uio'

In [2]:

import pandas as pd
zz = pd.read_csv('./dataset/labelled_poem_deepseek2.csv')
zz['label'].str.replace('.', '').replace(' ', '').value_counts() #.to_csv('./dataset/tes.csv', index=False)

label
other               5634
joy                 3695
sadness             3512
hope                2591
love                1229
sad                  546
none                 157
sorrow                12
melancholy             3
anger                  3
valley                 2
proud                  2
cat                    1
scunge                 1
sorrowful              1
jazz                   1
rose                   1
cruelty                1
dancing girl           1
desolate               1
democracy              1
land                   1
war                    1
loneliness             1
innocence              1
joyful                 1
rhymerustler           1
joyless                1
despair                1
river                  1
horror                 1
wave                   1
patriotism             1
rage                   1
prophet                1
pioneer                1
blues and greens       1
insomnia               1
Name: count, dtype: int64

In [11]:
zz = pd.read_csv('./dataset/tes.csv')
zz.value_counts()

label     
melancholy    4860
hope          1828
joy           1031
sad            988
mourn          719
              ... 
dank             1
death            1
deceit           1
languid          1
marvelous        1
Name: count, Length: 656, dtype: int64