In [1]:
import pandas as pd
import re
import unicodedata
import contractions
import requests
from transformers import pipeline, AutoTokenizer
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

  from .autonotebook import tqdm as notebook_tqdm
2025-03-25 12:32:25.067725: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-25 12:32:25.081915: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742880745.097485   35975 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742880745.101668   35975 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742880745.114418   35975 computation_placer.cc:177] computation placer already r

In [2]:
# Clean the text first, before it is given to the HuggingFace model.
# This cleaning step is MANDATORY because we are relying on someone else's HuggingFace model.

# Set built from NLTK's `words` corpus; removeNonEnglish keeps only tokens found in it.
english_words = set(words.words())
# WordNet lemmatizer applied to every surviving token in removeNonEnglish.
lemmatizer = WordNetLemmatizer()
# NLTK English stop words; removeNonEnglish drops tokens found in this set.
stop_words = set(stopwords.words('english'))

def loadCustomDict(path):
    """Load a custom vocabulary file into a set of lowercase entries.

    Each non-blank line of the file is one entry; surrounding whitespace is
    stripped and the entry is lowercased.

    Args:
        path: Path of the text file to read, one entry per line.

    Returns:
        A set of the stripped, lowercased, non-empty lines.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with open(path, 'r', encoding='utf-8') as file:
        return {line.strip().lower() for line in file if line.strip()}

def normalizeWhitespace(text):
    """Normalise a raw poem string into lowercase, punctuation-free text.

    Steps, in order: NFKC Unicode normalisation, contraction expansion via
    ``contractions.fix``, tab / carriage-return replacement, removal of
    standalone numbers, dashes, underscores and remaining punctuation,
    collapsing of immediately repeated words, then whitespace collapsing and
    lowercasing.

    Args:
        text: Raw poem text.

    Returns:
        The cleaned, lowercased string with single spaces between words.
    """
    text = unicodedata.normalize('NFKC', text)
    text = contractions.fix(text)
    text = re.sub(r'[\t\r]+', ' ', text)  # Replace tabs / carriage returns with a space
    text = re.sub(r'\b\d+\b', '', text)  # Remove standalone numbers
    text = re.sub(r'[-‐‑‒–—―]+', '', text)  # Remove hyphens and dash variants (joins hyphenated words)
    text = re.sub(r'[_﹍﹎＿]', '', text)  # Remove underscores, which \w would otherwise keep
    text = re.sub(r'[^\w\s]', '', text)  # Remove remaining punctuation / symbols
    # Collapse immediately repeated words. The text is only lowercased on the
    # next line, so match case-insensitively here; otherwise "Love love" would
    # survive as "love love" in the final output.
    text = re.sub(r'\b(\w+)(?:\s+\1\b)+', r'\1', text, flags=re.IGNORECASE)
    text = re.sub(r'\s+', ' ', text).strip().lower()
    return text

def removeNonEnglish(text_series, custom_dict):
    """Filter a Series of texts down to lemmatised English content words.

    Custom-dictionary terms are deleted first (case-insensitively, on word
    boundaries). The remaining text is split into tokens; a token is kept only
    if its lowercase form is in ``english_words`` and not in ``stop_words``.
    Kept tokens are lowercased and lemmatised, re-joined per original row, and
    immediately repeated words are collapsed.

    Args:
        text_series: pandas Series of strings to filter.
        custom_dict: Iterable of terms to strip out before token filtering.

    Returns:
        A Series indexed like ``text_series``; rows with no surviving tokens
        hold the empty string.
    """
    escaped_terms = [re.escape(word) for word in custom_dict]
    custom_pattern = r'\b(?:' + '|'.join(escaped_terms) + r')\b'

    # One token per row, keeping the original row label as the index.
    tokens = (
        text_series
        .str.replace(custom_pattern, '', case=False, regex=True)
        .str.split()
        .explode()
    )
    tokens = tokens[tokens.str.lower().isin(english_words)]
    tokens = tokens[~tokens.str.lower().isin(stop_words)]
    lemmas = tokens.apply(lambda word: lemmatizer.lemmatize(word.lower()))

    # Re-assemble each row, then restore rows that lost every token.
    rejoined = lemmas.groupby(level=0).agg(' '.join)
    restored = rejoined.reindex(text_series.index, fill_value='')
    return restored.str.replace(r'\b(\w+)(?:\s+\1\b)+', r'\1', case=False, regex=True)

def removeOtherLanguage(text):
    """Cut trailing translation notes and drop words containing non-ASCII characters.

    Everything from the first occurrence of ``' translated'`` onward is
    discarded. Any word containing a character outside the ASCII range is then
    removed, whitespace is collapsed and the result is lowercased.

    Args:
        text: Input string.

    Returns:
        The filtered, lowercased string with single spaces between words.
    """
    head, marker, _ = text.partition(' translated')
    if marker:
        # The marker was found: keep only what precedes it.
        text = head.rstrip()
    ascii_only = re.sub(r'\b\w*[^\x00-\x7F]\w*\b', '', text)
    return re.sub(r'\s+', ' ', ascii_only).strip().lower()

def simpleCleaning(dataset_path='./dataset/poem_dataset.csv', vocab_path='custom_vocab.txt'):
    """Load the poem dataset and build a cleaned copy of it.

    The ``poem`` column of the copy is passed through ``normalizeWhitespace``,
    ``removeOtherLanguage`` and ``removeNonEnglish`` in that order.

    Args:
        dataset_path: CSV file containing a ``poem`` column. Defaults to the
            path this notebook has always used.
        vocab_path: Custom vocabulary file handed to ``loadCustomDict``.
            Defaults to the path this notebook has always used.

    Returns:
        A ``(df, copyDf)`` tuple: the frame as read from disk and the copy
        whose ``poem`` column has been cleaned.
    """
    custom_dict = loadCustomDict(vocab_path)
    df = pd.read_csv(dataset_path)
    copyDf = df.copy()
    copyDf['poem'] = copyDf['poem'].apply(normalizeWhitespace)
    copyDf['poem'] = copyDf['poem'].apply(removeOtherLanguage)
    copyDf['poem'] = removeNonEnglish(copyDf['poem'], custom_dict)
    return df, copyDf


In [3]:
# Cleaning dataset: `df` is the frame as read from disk, `cleanDf` is the copy
# whose `poem` column has been cleaned.
df, cleanDf = simpleCleaning()

# Labelling Using HuggingFace Model J-Hartmann Distilroberta-base

In [None]:
# Load the text-classification pipeline and the tokenizer for the j-hartmann
# checkpoint as notebook-level globals.
emotion_classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base")
tokenizer = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")

In [None]:
def predictSentimentMergeHartmann(df, cleanDf):
    """Label every cleaned poem with the global ``emotion_classifier`` and save the result.

    Each poem in ``cleanDf['poem']`` is classified; the top label and its score
    (rounded to 5 decimals) are collected, appended to ``df`` as ``label`` and
    ``score`` columns, and written to ``./dataset/labelled_poem_hartmann.csv``.

    Args:
        df: Original DataFrame; its columns are written out unchanged.
        cleanDf: Cleaned copy of ``df`` whose ``poem`` column is classified.

    Returns:
        The merged DataFrame that was written to disk.
    """
    labels = []
    scores = []
    for idx, poem in enumerate(cleanDf['poem']):
        # Let the pipeline's own tokenizer truncate to 512 tokens. Tokenising,
        # decoding and re-tokenising by hand re-inserts the special-token text
        # into the string, so the re-encoded input can exceed the limit again,
        # and it needs TensorFlow tensors just to truncate.
        result = emotion_classifier(poem, truncation=True, max_length=512)[0]
        score = round(result['score'], 5)
        print(f"Predict ke-{idx}. Label : {result['label']}. Score : {score}")
        scores.append(score)
        labels.append(result['label'])

    print("Merging")
    merged = pd.concat([df, pd.DataFrame({'label':labels, 'score':scores})], axis=1)
    merged.to_csv("./dataset/labelled_poem_hartmann.csv", index=False)
    # Callers assign the result, so return the merged frame rather than None.
    return merged
    

In [None]:
# Label every poem with the Hartmann model; the function writes
# ./dataset/labelled_poem_hartmann.csv.
final = predictSentimentMergeHartmann(df, cleanDf)

# Labelling Using HuggingFace Model Bhadresh-Savani Distilbert-base

In [None]:
# Rebind the notebook-level `emotion_classifier` and `tokenizer` globals to the
# bhadresh-savani checkpoint; this overwrites the Hartmann objects bound under
# the same names in the earlier cell.
emotion_classifier = pipeline("text-classification", model="bhadresh-savani/distilbert-base-uncased-emotion")
tokenizer = AutoTokenizer.from_pretrained("bhadresh-savani/distilbert-base-uncased-emotion")

In [None]:
def predictSentimentMergeSavani(df, cleanDf):
    """Label every cleaned poem with the global ``emotion_classifier`` and save the result.

    Each poem in ``cleanDf['poem']`` is classified; the top label and its score
    (rounded to 5 decimals) are collected, appended to ``df`` as ``label`` and
    ``score`` columns, and written to ``./dataset/labelled_poem_savani.csv``.

    Args:
        df: Original DataFrame; its columns are written out unchanged.
        cleanDf: Cleaned copy of ``df`` whose ``poem`` column is classified.

    Returns:
        The merged DataFrame that was written to disk.
    """
    labels = []
    scores = []
    for idx, poem in enumerate(cleanDf['poem']):
        # Let the pipeline's own tokenizer truncate to 512 tokens. Tokenising,
        # decoding and re-tokenising by hand re-inserts the special-token text
        # into the string, so the re-encoded input can exceed the limit again,
        # and it needs TensorFlow tensors just to truncate.
        result = emotion_classifier(poem, truncation=True, max_length=512)[0]
        score = round(result['score'], 5)
        print(f"Predict ke-{idx}. Label : {result['label']}. Score : {score}")
        scores.append(score)
        labels.append(result['label'])

    print("Merging")
    merged = pd.concat([df, pd.DataFrame({'label':labels, 'score':scores})], axis=1)
    merged.to_csv("./dataset/labelled_poem_savani.csv", index=False)
    # Callers assign the result, so return the merged frame rather than None.
    return merged
    

In [None]:
# Label every poem with the Savani model; the function writes
# ./dataset/labelled_poem_savani.csv.
final = predictSentimentMergeSavani(df, cleanDf)

## Attention

The HuggingFace models used in this project return two things:

1. The `label` is the emotion label predicted by the model.
2. The `score` is the model's confidence in that prediction.

# Labelling Using DeepSeek-Coder-v2-Lite-Instruct via LM Studio

In [2]:
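# Clean the text first, before it is given to the model.
# This cleaning step is MANDATORY because we are relying on someone else's model.
# NOTE(review): the original comment names HuggingFace, but this section labels with DeepSeek via LM Studio.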

def loadCustomDict(path):
    """Load a custom vocabulary file into a set of lowercase entries.

    Each non-blank line of the file is one entry; surrounding whitespace is
    stripped and the entry is lowercased.

    Args:
        path: Path of the text file to read, one entry per line.

    Returns:
        A set of the stripped, lowercased, non-empty lines.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with open(path, 'r', encoding='utf-8') as file:
        return {line.strip().lower() for line in file if line.strip()}

def normalizeWhitespace(text):
    """Normalise a raw poem string into lowercase, punctuation-free text.

    Steps, in order: NFKC Unicode normalisation, contraction expansion via
    ``contractions.fix``, tab / carriage-return replacement, removal of
    standalone numbers and punctuation, collapsing of immediately repeated
    words, then whitespace collapsing and lowercasing.

    Args:
        text: Raw poem text.

    Returns:
        The cleaned, lowercased string with single spaces between words.
    """
    text = unicodedata.normalize('NFKC', text)
    text = contractions.fix(text)
    text = re.sub(r'[\t\r]+', ' ', text)  # Replace tabs / carriage returns with a space
    text = re.sub(r'\b\d+\b', '', text)  # Remove standalone numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation / symbols
    # Collapse immediately repeated words. The text is only lowercased on the
    # next line, so match case-insensitively here; otherwise "Love love" would
    # survive as "love love" in the final output.
    text = re.sub(r'\b(\w+)(?:\s+\1\b)+', r'\1', text, flags=re.IGNORECASE)
    text = re.sub(r'\s+', ' ', text).strip().lower()
    return text

def removeOtherLanguage(text):
    """Cut trailing translation notes and drop words containing non-ASCII characters.

    Everything from the first occurrence of ``' translated'`` onward is
    discarded. Any word containing a character outside the ASCII range is then
    removed, whitespace is collapsed and the result is lowercased.

    Args:
        text: Input string.

    Returns:
        The filtered, lowercased string with single spaces between words.
    """
    head, marker, _ = text.partition(' translated')
    if marker:
        # The marker was found: keep only what precedes it.
        text = head.rstrip()
    ascii_only = re.sub(r'\b\w*[^\x00-\x7F]\w*\b', '', text)
    return re.sub(r'\s+', ' ', ascii_only).strip().lower()

def simpleCleaning(dataset_path='./dataset/poem_dataset.csv'):
    """Load the poem dataset and build a cleaned copy of it.

    The ``poem`` column of the copy is passed through ``normalizeWhitespace``
    and then ``removeOtherLanguage``.

    Args:
        dataset_path: CSV file containing a ``poem`` column. Defaults to the
            path this notebook has always used.

    Returns:
        A ``(df, copyDf)`` tuple: the frame as read from disk and the copy
        whose ``poem`` column has been cleaned.
    """
    df = pd.read_csv(dataset_path)
    copyDf = df.copy()
    copyDf['poem'] = copyDf['poem'].apply(normalizeWhitespace)
    copyDf['poem'] = copyDf['poem'].apply(removeOtherLanguage)
    return df, copyDf

def simpleCleaning2(dataset_path='./dataset/labelled_poem_deepseek.csv'):
    """Load the DeepSeek-labelled dataset and build a cleaned copy of it.

    The ``poem`` column of the copy is passed through ``normalizeWhitespace``
    and then ``removeOtherLanguage``; other columns are left untouched.

    Args:
        dataset_path: CSV file containing a ``poem`` column. Defaults to the
            path this notebook has always used.

    Returns:
        A ``(df, copyDf)`` tuple: the frame as read from disk and the copy
        whose ``poem`` column has been cleaned.
    """
    df = pd.read_csv(dataset_path)
    copyDf = df.copy()
    copyDf['poem'] = copyDf['poem'].apply(normalizeWhitespace)
    copyDf['poem'] = copyDf['poem'].apply(removeOtherLanguage)
    return df, copyDf


In [None]:
# Cleaning dataset: `df` is the frame as read from disk, `cleanDf` is the copy
# whose `poem` column has been cleaned.
df, cleanDf = simpleCleaning()

In [None]:
def truncate_text(text, max_length):
    """Return ``text`` cut to at most ``max_length`` characters.

    Args:
        text: String to shorten.
        max_length: Maximum number of characters to keep.

    Returns:
        ``text`` itself when it already fits, otherwise its first
        ``max_length`` characters.
    """
    return text if len(text) <= max_length else text[:max_length]

def predictSentimentMergeDeepSeek(df, cleanDf):
    """Label every cleaned poem through the chat-completions endpoint and save the result.

    One request is sent per poem in ``cleanDf['poem']``. On HTTP 200 the
    lowercased reply is stored as the label; on any other status the error is
    printed and ``' none'`` is stored. The labels are appended to ``df`` as a
    ``label`` column and written to ``./dataset/labelled_poem_deepseek.csv``.

    Args:
        df: Original DataFrame; its columns are written out unchanged.
        cleanDf: Cleaned copy of ``df`` whose ``poem`` column is sent to the model.

    Returns:
        The merged DataFrame that was written to disk.
    """
    # Request parts that do not change between poems are built once.
    url = "http://192.168.1.10:55500/v1/chat/completions"
    headers = {
        "Content-Type": "application/json"
    }
    system_message = {
        "role": "system",
        "content": "You are a poem analysis assistant. Your task is to analyze the emotion of a given poem and assign it one of the following labels: love, joy, sadness, hope, or other. You must only respond with one of these labels and nothing else. Do not provide any additional information, context, nor starting and opening sentence."
    }

    labels = []
    # Iterate over the poem strings. Iterating over `cleanDf.values` yields a
    # whole row array, which was never truncated (its length is the column
    # count) and was rendered into the prompt as an array repr.
    for idx, poem in enumerate(cleanDf['poem']):
        truncated_poem = truncate_text(poem, 4000)
        payload = {
            "model": "deepseek-coder-v2-lite-instruct",
            "messages": [
                system_message,
                {
                    "role": "user",
                    "content": f"Analyze the following poem and assign it one of the following labels: love, joy, sadness, hope, or other. Respond with only the label and nothing else. Poem : {truncated_poem}",
                }
            ],
            "temperature": 0.5,
            "max_tokens": -1
        }
        response = requests.post(url, headers=headers, json=payload)
        if response.status_code == 200:
            response_data = response.json()
            reply = response_data['choices'][0]['message']['content']
            print(f"Predict ke-{idx}. Reply : {reply.lower()}")
            labels.append(reply.lower())
        else:
            print(f"Error: {response.status_code} - {response.text}")
            labels.append(' none')

    print("Merging")
    merged = pd.concat([df, pd.DataFrame({'label':labels})], axis=1)
    merged.to_csv("./dataset/labelled_poem_deepseek.csv", index=False)
    return merged
    

In [None]:
# final = predictSentimentMergeDeepSeek(df, cleanDf)

---

In [3]:
# Reload the DeepSeek-labelled CSV: `df` is the frame as read from disk,
# `cleanDf` is the copy whose `poem` column has been cleaned.
df, cleanDf = simpleCleaning2()

In [17]:
# Chat-completions endpoint and JSON header, read as globals by rePredict.
# NOTE(review): this host (192.168.1.8) differs from the one hard-coded inside
# predictSentimentMergeDeepSeek (192.168.1.10) — confirm which address is current.
url = "http://192.168.1.8:55500/v1/chat/completions"
headers = {
            "Content-Type": "application/json"
        }

def truncate_text(text, max_length):
    """Return ``text`` cut to at most ``max_length`` characters.

    Args:
        text: String to shorten.
        max_length: Maximum number of characters to keep.

    Returns:
        ``text`` itself when it already fits, otherwise its first
        ``max_length`` characters.
    """
    return text if len(text) <= max_length else text[:max_length]

def normalizeLabel(text):
    """Lowercase a label and delete every period and space character from it.

    Args:
        text: Raw label string.

    Returns:
        The lowercased label without ``.`` or space characters.
    """
    # A deletion table removes both characters in a single pass.
    return text.lower().translate(str.maketrans('', '', '. '))
    

def rePredict(df, cleanDf):
    """Re-query the model for every row whose label is not an accepted label.

    Rows are read positionally from ``cleanDf.values``: element 0 is used as
    the poem text and element 1 as the existing label. Rows with an accepted
    label are kept as-is. For every other row the endpoint is queried
    repeatedly, with no retry limit, until the normalised reply is an accepted
    label. The result (``df['poem']`` plus the new ``label`` column) is written
    to ``./dataset/labelled_poem_deepseek_clean.csv``.

    Relies on the notebook-level ``url``, ``headers`` and ``truncate_text``.

    Args:
        df: DataFrame whose ``poem`` column is written to the output file.
        cleanDf: Cleaned DataFrame providing the poem text and current label.
    """
    arr = ['love', 'joy', 'sad', 'sadness', 'hope', 'other']
    system_message = {
        "role": "system",
        "content": "You are a poem analysis assistant. Your task is to analyze the emotion of a given poem and assign it one of the following labels: love, joy, sadness, hope, or other. You must only respond with one of these labels and nothing else. Do not provide any additional information, context, nor starting and opening sentence."
    }
    labels = []

    for idx, poem in enumerate(cleanDf.values):
        current_label = poem[1]
        if current_label in arr:
            # Already an accepted label: keep it and move on.
            print(f"Predict ke-{idx}. Label : {current_label}. Safe Skip")
            labels.append(current_label)
            continue

        # The existing label never changes, so retry until the model answers
        # with an accepted label.
        while True:
            truncated_poem = truncate_text(poem[0], 4000)
            payload = {
                "model": "deepseek-coder-v2-lite-instruct",
                "messages": [
                    system_message,
                    {
                        "role": "user",
                        "content": f"Analyze the following poem and assign it one of the following labels: love, joy, sadness, hope, or other. Respond with only the label and nothing else. Poem : {truncated_poem}",
                    }
                ],
                "temperature": 0.7,
                "max_tokens": -1
            }
            response = requests.post(url, headers=headers, json=payload)
            if response.status_code != 200:
                print(f"Error: {response.status_code} - {response.text}. Try again")
                continue

            raw_reply = response.json()['choices'][0]['message']['content']
            reply = raw_reply.lower().replace('.', '').replace(' ','')
            if reply in arr:
                labels.append(reply)
                print(f"Predict ke-{idx}. Reply : {reply}. Safe Next")
                break
            print(f"Predict ke-{idx}. Reply : {reply}. Try Again")
        print("Next======================================")

    print("Merging")
    relabelled = pd.DataFrame({'label':labels})
    pd.concat([df['poem'], relabelled], axis=1).to_csv("./dataset/labelled_poem_deepseek_clean.csv", index=False)

In [5]:
# Normalise the existing labels in place with normalizeLabel before re-prediction.
# NOTE(review): this cell's execution count (In [5]) precedes the cell that
# defines normalizeLabel (In [17]); on a fresh kernel it must run after that definition.
cleanDf['label'] = cleanDf['label'].apply(normalizeLabel)

In [18]:
# Re-query rows whose label is not accepted; the function writes
# ./dataset/labelled_poem_deepseek_clean.csv.
rePredict(df, cleanDf)

Predict ke-0. Label : love. Safe Skip
Predict ke-1. Label : other. Safe Skip
Predict ke-2. Label : other. Safe Skip
Predict ke-3. Label : love. Safe Skip
Predict ke-4. Label : sad. Safe Skip
Predict ke-5. Label : hope. Safe Skip
Predict ke-6. Label : love. Safe Skip
Predict ke-7. Label : other. Safe Skip
Predict ke-8. Label : hope. Safe Skip
Predict ke-9. Label : sadness. Safe Skip
Predict ke-10. Label : other. Safe Skip
Predict ke-11. Label : other. Safe Skip
Predict ke-12. Label : sadness. Safe Skip
Predict ke-13. Label : other. Safe Skip
Predict ke-14. Label : sadness. Safe Skip
Predict ke-15. Label : hope. Safe Skip
Predict ke-16. Label : other. Safe Skip
Predict ke-17. Label : other. Safe Skip
Predict ke-18. Label : sadness. Safe Skip
Predict ke-19. Label : sadness. Safe Skip
Predict ke-20. Label : hope. Safe Skip
Predict ke-21. Label : other. Safe Skip
Predict ke-22. Label : other. Safe Skip
Predict ke-23. Label : other. Safe Skip
Predict ke-24. Label : joy. Safe Skip
Predict ke-

In [20]:
# Reload the re-predicted labels written by rePredict.
new_df = pd.read_csv('./dataset/labelled_poem_deepseek_clean.csv')

In [25]:
# Fold the 'sad' variant into 'sadness' so both map to one label value.
new_df['label'] = new_df['label'].replace('sad', 'sadness')

In [27]:
# Overwrite the cleaned label file with the unified labels.
new_df.to_csv("./dataset/labelled_poem_deepseek_clean.csv", index=False)