# Basics

In [None]:
# !pip3 install pythainlp
# !pip3 install https://github.com/PyThaiNLP/thai_sentiment_analysis/archive/master.zip
# !pip3 install kenlm==0.2.0
# !pip3 install pypdf==3.17.1
# !pip3 install pytesseract==0.3.10
# !pip3 install PyMuPDF==1.23.6
# !pip3 install transformers==4.35.2

In [None]:
from pythainlp import word_tokenize, Tokenizer

# Sample Thai sentence tokenized twice below to compare two engines.
text = "‡∏™‡∏°‡∏ä‡∏≤‡∏¢‡πÄ‡∏´‡πá‡∏ô‡∏ä‡∏≠‡∏ö‡∏Å‡∏•‡∏ö‡∏ó‡∏ö‡∏≤‡∏ó‡∏ô‡∏µ‡πâ"

# Called without an `engine` argument; the output label says "newmm"
# (presumably the library default — confirm against the installed version).
print("newmm  :", word_tokenize(text))
# Same text with the engine explicitly set to "longest".
print("longest:", word_tokenize(text, engine="longest"))

# Computational Linguistics

## Reverse Dictionary

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Toy dictionary with three parallel lists (aligned by position): a headword,
# its part-of-speech tag and a whitespace-separated definition.
data = {
    "Word": ["‡πÅ‡∏•", "‡πÄ‡∏ö‡∏¥‡πà‡∏á", "‡∏ú‡πà‡∏≠"],
    "POS" : ["‡∏Å.", "‡∏Å.", "‡∏Å."],
    "Definition": ["‡∏î‡∏π ‡∏°‡∏≠‡∏á", "‡∏î‡∏π ‡∏°‡∏≠‡∏á ‡πÄ‡∏´‡∏•‡∏µ‡∏¢‡∏ß‡∏î‡∏π", "‡∏î‡∏π ‡∏î‡∏π‡πÅ‡∏• ‡∏°‡∏≠‡∏á"]
}

# One row per dictionary entry; columns are "Word", "POS" and "Definition".
df = pd.DataFrame(data)

def calculate_cosine_similarity(definitions):
    """Return the pairwise cosine-similarity matrix of the given texts.

    Args:
        definitions: Iterable of text strings, one per entry.

    Returns:
        The result of ``cosine_similarity`` applied to the TF-IDF matrix of
        ``definitions``; element ``[i, j]`` compares entries ``i`` and ``j``.
    """
    # A fresh vectorizer is fitted on every call, so the vocabulary is
    # limited to the texts passed in.
    tfidf = TfidfVectorizer().fit_transform(definitions)
    return cosine_similarity(tfidf)

def pos_similarity(pos_list):
    """Return the fraction of unordered pairs that carry the same POS tag.

    Args:
        pos_list: Indexable, sized sequence of part-of-speech tags.

    Returns:
        Number of matching pairs divided by the number of pairs, or ``0``
        when there are fewer than two tags (no pair to compare).
    """
    size = len(pos_list)
    # One 0/1 flag per unordered pair (a < b).
    pair_matches = [
        1 if pos_list[a] == pos_list[b] else 0
        for a in range(size)
        for b in range(a + 1, size)
    ]
    if not pair_matches:
        return 0
    return sum(pair_matches) / len(pair_matches)

def definition_similarity(definitions):
    """Return the mean pairwise cosine similarity of the definitions.

    Args:
        definitions: Iterable of definition strings.

    Returns:
        Average of the strictly-upper-triangular entries of the matrix
        returned by ``calculate_cosine_similarity``, or ``0`` when that
        matrix has fewer than two rows.
    """
    sim = calculate_cosine_similarity(definitions)
    size = sim.shape[0]
    # Each unordered pair once: entries above the main diagonal, row by row.
    upper = [sim[r, c] for r in range(size) for c in range(r + 1, size)]
    return sum(upper) / len(upper) if upper else 0

# Weights of the POS score and the definition score in the blended result.
alpha, beta = 0.5, 0.5

# Inputs taken from the dictionary frame.
pos = df["POS"].values
definitions = df["Definition"]

pos_sim = pos_similarity(pos)
def_sim = definition_similarity(definitions)

# Weighted combination of the two partial scores.
similarity = alpha * pos_sim + beta * def_sim

for line in (
    f"POS Similarity: {pos_sim:.2f}",
    f"Definition Similarity: {def_sim:.2f}",
    f"Overall Similarity: {similarity:.2f}",
):
    print(line)

## TF-IDF

In [None]:
import math
from collections import Counter
from typing import List, Dict

class TFIDFCalculator:
    """Compute TF-IDF scores over a fixed collection of tokenized documents.

    Term counts and document lengths are snapshotted at construction time;
    every score is derived from that snapshot.

    * TF  = occurrences of the term in a document / number of tokens in it
    * IDF = ``log2(N / df)`` where ``N`` is the number of documents and
      ``df`` the number of documents containing the term (0.0 when ``df`` is 0)
    """

    def __init__(self, documents: List[List[str]]):
        """Index ``documents`` (a list of token lists)."""
        self.documents = documents
        self.doc_count = len(documents)
        # Per-document token counts and lengths, computed once up front.
        self.term_freq = [Counter(doc) for doc in documents]
        self.doc_lengths = [len(doc) for doc in documents]

    def calculate_tf(self, term: str, doc_idx: int) -> float:
        """Return the relative frequency of ``term`` in document ``doc_idx``.

        Returns 0.0 for an empty document. Raises ``IndexError`` when
        ``doc_idx`` is out of range.
        """
        doc_length = self.doc_lengths[doc_idx]
        if doc_length == 0:
            return 0.0
        # Counter returns 0 for a missing term without inserting the key.
        return self.term_freq[doc_idx][term] / doc_length

    def calculate_idf(self, term: str) -> float:
        """Return ``log2(N / df)`` for ``term``, or 0.0 if no document has it."""
        # Use the Counter snapshot: a hash lookup per document instead of a
        # linear scan of every token list, and consistent with calculate_tf
        # even if a caller mutates ``documents`` afterwards.
        doc_with_term = sum(1 for counts in self.term_freq if term in counts)
        if doc_with_term == 0:
            return 0.0
        return math.log2(self.doc_count / doc_with_term)

    def calculate_tfidf(self, terms: List[str]) -> Dict[str, List[float]]:
        """Return, for each term, its TF-IDF score in every document.

        Args:
            terms: Terms to score.

        Returns:
            Mapping ``term -> [score for document 0, document 1, ...]`` with
            each score rounded to four decimal places.
        """
        results = {}
        for term in terms:
            # IDF does not depend on the document, so compute it once per term.
            idf = self.calculate_idf(term)
            results[term] = [
                round(self.calculate_tf(term, doc_idx) * idf, 4)
                for doc_idx in range(self.doc_count)
            ]
        return results

def main():
    """Print the TF-IDF scores of a few target words over ten sample documents.

    Builds ten pre-tokenized documents, scores every word in ``target_words``
    with ``TFIDFCalculator`` and prints, per word, only the documents whose
    score is strictly positive. Documents are labelled D1..D10 (1-based).
    """
    # Each inner list is one document, already split into tokens.
    documents = [
        ["‡∏ô‡∏ß‡∏±‡∏ï‡∏Å‡∏£‡∏£‡∏°", "‡∏û‡∏•‡∏±‡∏á‡∏á‡∏≤‡∏ô", "‡∏™‡∏∞‡∏≠‡∏≤‡∏î", "‡πÄ‡∏û‡∏∑‡πà‡∏≠", "‡πÇ‡∏•‡∏Å",
        "‡∏¢‡∏±‡πà‡∏á‡∏¢‡∏∑‡∏ô", "‡∏û‡∏•‡∏±‡∏á‡∏á‡∏≤‡∏ô", "‡πÅ‡∏™‡∏á‡∏≠‡∏≤‡∏ó‡∏¥‡∏ï‡∏¢‡πå", "‡πÅ‡∏•‡∏∞", "‡∏•‡∏°",
        "‡∏Å‡∏≥‡∏•‡∏±‡∏á", "‡πÄ‡∏õ‡πá‡∏ô‡∏ó‡∏µ‡πà‡∏ô‡∏¥‡∏¢‡∏°", "‡πÉ‡∏ô", "‡∏õ‡∏£‡∏∞‡πÄ‡∏ó‡∏®‡πÑ‡∏ó‡∏¢", "‡∏ô‡∏±‡∏Å‡∏ß‡∏¥‡∏ó‡∏¢‡∏≤‡∏®‡∏≤‡∏™‡∏ï‡∏£‡πå",
        "‡∏Ñ‡∏≤‡∏î‡∏ß‡πà‡∏≤", "‡∏à‡∏∞", "‡∏ä‡πà‡∏ß‡∏¢", "‡∏•‡∏î", "‡∏Å‡∏≤‡∏£‡∏õ‡∏•‡πà‡∏≠‡∏¢",
        "‡∏Å‡πä‡∏≤‡∏ã", "‡πÄ‡∏£‡∏∑‡∏≠‡∏ô‡∏Å‡∏£‡∏∞‡∏à‡∏Å", "‡πÑ‡∏î‡πâ", "‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏°‡∏≤‡∏Å"],
        ["‡πÄ‡∏®‡∏£‡∏©‡∏ê‡∏Å‡∏¥‡∏à", "‡πÑ‡∏ó‡∏¢", "‡∏ü‡∏∑‡πâ‡∏ô‡∏ï‡∏±‡∏ß", "‡∏´‡∏•‡∏±‡∏á", "‡πÇ‡∏Ñ‡∏ß‡∏¥‡∏î",
        "‡∏Å‡∏≤‡∏£‡∏ó‡πà‡∏≠‡∏á‡πÄ‡∏ó‡∏µ‡πà‡∏¢‡∏ß", "‡πÅ‡∏•‡∏∞", "‡∏Å‡∏≤‡∏£‡∏™‡πà‡∏á‡∏≠‡∏≠‡∏Å", "‡πÄ‡∏õ‡πá‡∏ô", "‡∏õ‡∏±‡∏à‡∏à‡∏±‡∏¢",
        "‡∏™‡∏≥‡∏Ñ‡∏±‡∏ç", "‡πÉ‡∏ô", "‡∏Å‡∏≤‡∏£", "‡∏Ç‡∏±‡∏ö‡πÄ‡∏Ñ‡∏•‡∏∑‡πà‡∏≠‡∏ô", "‡πÄ‡∏®‡∏£‡∏©‡∏ê‡∏Å‡∏¥‡∏à",
        "‡∏£‡∏±‡∏ê‡∏ö‡∏≤‡∏•", "‡πÄ‡∏£‡πà‡∏á", "‡∏≠‡∏≠‡∏Å", "‡∏°‡∏≤‡∏ï‡∏£‡∏Å‡∏≤‡∏£", "‡∏Å‡∏£‡∏∞‡∏ï‡∏∏‡πâ‡∏ô"],
        ["‡∏ô‡∏ß‡∏±‡∏ï‡∏Å‡∏£‡∏£‡∏°", "‡∏õ‡∏±‡∏ç‡∏ç‡∏≤‡∏õ‡∏£‡∏∞‡∏î‡∏¥‡∏©‡∏ê‡πå", "‡πÉ‡∏ô", "‡∏ß‡∏á‡∏Å‡∏≤‡∏£", "‡πÅ‡∏û‡∏ó‡∏¢‡πå",
        "AI", "‡∏ä‡πà‡∏ß‡∏¢", "‡∏ß‡∏¥‡∏ô‡∏¥‡∏à‡∏â‡∏±‡∏¢", "‡πÇ‡∏£‡∏Ñ", "‡πÑ‡∏î‡πâ",
        "‡πÅ‡∏°‡πà‡∏ô‡∏¢‡∏≥", "‡∏Ç‡∏∂‡πâ‡∏ô", "‡πÇ‡∏£‡∏á‡∏û‡∏¢‡∏≤‡∏ö‡∏≤‡∏•", "‡πÉ‡∏ô", "‡∏õ‡∏£‡∏∞‡πÄ‡∏ó‡∏®‡πÑ‡∏ó‡∏¢",
        "‡πÄ‡∏£‡∏¥‡πà‡∏°", "‡∏ô‡∏≥", "‡∏°‡∏≤", "‡πÉ‡∏ä‡πâ"],
        ["‡∏Å‡∏≤‡∏£‡πÄ‡∏õ‡∏•‡∏µ‡πà‡∏¢‡∏ô‡πÅ‡∏õ‡∏•‡∏á", "‡∏™‡∏†‡∏≤‡∏û", "‡∏†‡∏π‡∏°‡∏¥‡∏≠‡∏≤‡∏Å‡∏≤‡∏®", "‡∏Å‡∏£‡∏∞‡∏ó‡∏ö", "‡∏†‡∏≤‡∏Ñ",
        "‡πÄ‡∏Å‡∏©‡∏ï‡∏£", "‡πÄ‡∏Å‡∏©‡∏ï‡∏£‡∏Å‡∏£", "‡πÑ‡∏ó‡∏¢", "‡∏õ‡∏£‡∏±‡∏ö‡∏ï‡∏±‡∏ß", "‡∏£‡∏±‡∏ö‡∏°‡∏∑‡∏≠",
        "‡∏†‡∏±‡∏¢‡πÅ‡∏•‡πâ‡∏á", "‡πÅ‡∏•‡∏∞", "‡∏ô‡πâ‡∏≥‡∏ó‡πà‡∏ß‡∏°", "‡∏ô‡∏±‡∏Å‡∏ß‡∏¥‡∏ó‡∏¢‡∏≤‡∏®‡∏≤‡∏™‡∏ï‡∏£‡πå", "‡πÄ‡∏£‡πà‡∏á",
         "‡∏Ñ‡∏¥‡∏î‡∏Ñ‡πâ‡∏ô", "‡∏û‡∏±‡∏ô‡∏ò‡∏∏‡πå‡∏û‡∏∑‡∏ä", "‡∏ó‡∏ô‡∏ó‡∏≤‡∏ô"],
        ["‡∏û‡∏•‡∏±‡∏á‡∏á‡∏≤‡∏ô", "‡∏ô‡∏¥‡∏ß‡πÄ‡∏Ñ‡∏•‡∏µ‡∏¢‡∏£‡πå", "‡∏ó‡∏≤‡∏á‡πÄ‡∏•‡∏∑‡∏≠‡∏Å", "‡∏´‡∏£‡∏∑‡∏≠", "‡∏ó‡∏≤‡∏á‡∏ï‡∏±‡∏ô",
        "‡∏õ‡∏£‡∏∞‡πÄ‡∏ó‡∏®‡πÑ‡∏ó‡∏¢", "‡∏¢‡∏±‡∏á", "‡∏•‡∏±‡∏á‡πÄ‡∏•", "‡πÉ‡∏ô", "‡∏Å‡∏≤‡∏£‡∏û‡∏±‡∏í‡∏ô‡∏≤",
        "‡πÇ‡∏£‡∏á‡πÑ‡∏ü‡∏ü‡πâ‡∏≤", "‡∏ô‡∏¥‡∏ß‡πÄ‡∏Ñ‡∏•‡∏µ‡∏¢‡∏£‡πå", "‡∏Ç‡∏ì‡∏∞‡∏ó‡∏µ‡πà", "‡∏´‡∏•‡∏≤‡∏¢", "‡∏õ‡∏£‡∏∞‡πÄ‡∏ó‡∏®",
        "‡πÄ‡∏î‡∏¥‡∏ô‡∏´‡∏ô‡πâ‡∏≤", "‡πÄ‡∏ï‡πá‡∏°‡∏ó‡∏µ"],
        ["‡∏Å‡∏≤‡∏£‡∏û‡∏±‡∏í‡∏ô‡∏≤", "‡πÄ‡∏°‡∏∑‡∏≠‡∏á", "‡∏≠‡∏±‡∏à‡∏â‡∏£‡∏¥‡∏¢‡∏∞", "‡πÉ‡∏ô", "‡∏õ‡∏£‡∏∞‡πÄ‡∏ó‡∏®‡πÑ‡∏ó‡∏¢",
        "‡∏Å‡∏£‡∏∏‡∏á‡πÄ‡∏ó‡∏û‡∏Ø", "‡πÅ‡∏•‡∏∞", "‡πÄ‡∏°‡∏∑‡∏≠‡∏á", "‡πÉ‡∏´‡∏°‡πà", "‡πÄ‡∏£‡πà‡∏á",
        "‡∏õ‡∏£‡∏±‡∏ö‡∏ï‡∏±‡∏ß", "‡∏™‡∏π‡πà", "Smart City", "‡πÉ‡∏ä‡πâ", "‡πÄ‡∏ó‡∏Ñ‡πÇ‡∏ô‡πÇ‡∏•‡∏¢‡∏µ",
        "IoT", "‡πÄ‡∏û‡∏∑‡πà‡∏≠", "‡∏¢‡∏Å‡∏£‡∏∞‡∏î‡∏±‡∏ö", "‡∏Ñ‡∏∏‡∏ì‡∏†‡∏≤‡∏û", "‡∏ä‡∏µ‡∏ß‡∏¥‡∏ï"],
        ["‡∏ß‡∏¥‡∏Å‡∏§‡∏ï", "‡∏Ç‡∏¢‡∏∞", "‡∏û‡∏•‡∏≤‡∏™‡∏ï‡∏¥‡∏Å", "‡πÉ‡∏ô", "‡∏ó‡∏∞‡πÄ‡∏•‡πÑ‡∏ó‡∏¢",
        "‡∏ô‡∏±‡∏Å‡∏ß‡∏¥‡∏ó‡∏¢‡∏≤‡∏®‡∏≤‡∏™‡∏ï‡∏£‡πå", "‡πÄ‡∏ï‡∏∑‡∏≠‡∏ô", "‡∏ú‡∏•‡∏Å‡∏£‡∏∞‡∏ó‡∏ö", "‡∏ï‡πà‡∏≠", "‡∏£‡∏∞‡∏ö‡∏ö‡∏ô‡∏¥‡πÄ‡∏ß‡∏®",
        "‡∏£‡∏±‡∏ê‡∏ö‡∏≤‡∏•", "‡∏≠‡∏≠‡∏Å", "‡∏°‡∏≤‡∏ï‡∏£‡∏Å‡∏≤‡∏£", "‡∏•‡∏î", "‡∏Å‡∏≤‡∏£‡πÉ‡∏ä‡πâ",
        "‡∏û‡∏•‡∏≤‡∏™‡∏ï‡∏¥‡∏Å"],
        ["5G", "‡πÄ‡∏õ‡∏•‡∏µ‡πà‡∏¢‡∏ô", "‡πÇ‡∏â‡∏°", "‡∏≠‡∏∏‡∏ï‡∏™‡∏≤‡∏´‡∏Å‡∏£‡∏£‡∏°", "‡πÑ‡∏ó‡∏¢",
        "‡∏ú‡∏π‡πâ‡∏õ‡∏£‡∏∞‡∏Å‡∏≠‡∏ö‡∏Å‡∏≤‡∏£", "‡πÄ‡∏£‡πà‡∏á", "‡∏õ‡∏£‡∏±‡∏ö‡∏ï‡∏±‡∏ß", "‡∏£‡∏±‡∏ö", "‡πÄ‡∏ó‡∏Ñ‡πÇ‡∏ô‡πÇ‡∏•‡∏¢‡∏µ",
        "‡πÉ‡∏´‡∏°‡πà",  "‡∏Ñ‡∏≤‡∏î", "‡∏ä‡πà‡∏ß‡∏¢", "‡πÄ‡∏û‡∏¥‡πà‡∏°", "‡∏õ‡∏£‡∏∞‡∏™‡∏¥‡∏ó‡∏ò‡∏¥‡∏†‡∏≤‡∏û",
        "‡∏Å‡∏≤‡∏£‡∏ú‡∏•‡∏¥‡∏ï"],
        ["‡∏Å‡∏≤‡∏£‡∏ó‡πà‡∏≠‡∏á‡πÄ‡∏ó‡∏µ‡πà‡∏¢‡∏ß", "‡πÄ‡∏ä‡∏¥‡∏á‡∏ô‡∏¥‡πÄ‡∏ß‡∏®", "‡∏ö‡∏π‡∏°", "‡πÉ‡∏ô", "‡πÑ‡∏ó‡∏¢",
        "‡∏ô‡∏±‡∏Å‡∏ó‡πà‡∏≠‡∏á‡πÄ‡∏ó‡∏µ‡πà‡∏¢‡∏ß", "‡∏ï‡πà‡∏≤‡∏á‡∏ä‡∏≤‡∏ï‡∏¥", "‡∏™‡∏ô‡πÉ‡∏à", "‡∏ò‡∏£‡∏£‡∏°‡∏ä‡∏≤‡∏ï‡∏¥", "‡πÅ‡∏•‡∏∞",
        "‡∏ß‡∏±‡∏í‡∏ô‡∏ò‡∏£‡∏£‡∏°", "‡∏ó‡πà‡∏≠‡∏á‡∏ñ‡∏¥‡πà‡∏ô", "‡∏ä‡πà‡∏ß‡∏¢", "‡∏Å‡∏£‡∏∞‡∏à‡∏≤‡∏¢", "‡∏£‡∏≤‡∏¢‡πÑ‡∏î‡πâ",
        "‡∏™‡∏π‡πà", "‡∏ä‡∏∏‡∏°‡∏ä‡∏ô"],
        ["‡∏û‡∏•‡∏±‡∏á‡∏á‡∏≤‡∏ô", "‡∏™‡∏∞‡∏≠‡∏≤‡∏î", "‡∏Å‡∏±‡∏ö", "‡∏Å‡∏≤‡∏£‡∏û‡∏±‡∏í‡∏ô‡∏≤", "‡∏ó‡∏µ‡πà",
        "‡∏¢‡∏±‡πà‡∏á‡∏¢‡∏∑‡∏ô", "‡∏õ‡∏£‡∏∞‡πÄ‡∏ó‡∏®‡πÑ‡∏ó‡∏¢", "‡∏ï‡∏±‡πâ‡∏á‡πÄ‡∏õ‡πâ‡∏≤", "‡πÄ‡∏û‡∏¥‡πà‡∏°", "‡∏™‡∏±‡∏î‡∏™‡πà‡∏ß‡∏ô",
        "‡∏û‡∏•‡∏±‡∏á‡∏á‡∏≤‡∏ô", "‡∏´‡∏°‡∏∏‡∏ô‡πÄ‡∏ß‡∏µ‡∏¢‡∏ô", "‡∏ô‡∏±‡∏Å‡∏•‡∏á‡∏ó‡∏∏‡∏ô", "‡∏™‡∏ô‡πÉ‡∏à","‡∏•‡∏á‡∏ó‡∏∏‡∏ô",
        "‡πÉ‡∏ô", "‡πÇ‡∏Ñ‡∏£‡∏á‡∏Å‡∏≤‡∏£", "‡∏û‡∏•‡∏±‡∏á‡∏á‡∏≤‡∏ô", "‡πÅ‡∏™‡∏á‡∏≠‡∏≤‡∏ó‡∏¥‡∏ï‡∏¢‡πå", "‡πÅ‡∏•‡∏∞",
        "‡∏•‡∏°"]
    ]
    # Words whose TF-IDF score is reported for every document.
    target_words = ["‡∏û‡∏•‡∏±‡∏á‡∏á‡∏≤‡∏ô", "‡∏ô‡∏ß‡∏±‡∏ï‡∏Å‡∏£‡∏£‡∏°", "‡πÄ‡∏®‡∏£‡∏©‡∏ê‡∏Å‡∏¥‡∏à", "‡∏õ‡∏£‡∏∞‡πÄ‡∏ó‡∏®‡πÑ‡∏ó‡∏¢", "‡πÄ‡∏ó‡∏Ñ‡πÇ‡∏ô‡πÇ‡∏•‡∏¢‡∏µ"]
    calculator = TFIDFCalculator(documents)
    results = calculator.calculate_tfidf(target_words)

    for term, scores in results.items():
        print(f"\n‡∏Ñ‡∏≥‡∏ß‡πà‡∏≤ '{term}':")
        # Number documents from 1 and skip those with a zero score.
        for doc_idx, score in enumerate(scores, 1):
            if score > 0:
                print(f"D{doc_idx}: {score:.4f}")

# Run the demo only when this code is executed as the main module.
if __name__ == "__main__":
    main()

## Cosine Similarity

In [None]:
import numpy as np
from numpy.linalg import norm

A = np.array([
  2, 1, 2, 3, 2, 9
  ])

B = np.array([
  3, 4, 2, 4, 5, 5
  ])

print("A:", A)
print("B:", B)

cosine = np.dot(A,B)/(norm(A)*norm(B))
print(f"Cosine Similarity: {cosine:.4f}")

## Word Embedding

In [None]:
# cosine similarity and distance

import pandas as pd
import numpy as np

# Row labels of the table. Later cells treat these as nouns.
rows = ["‡∏ô‡πâ‡∏≥", "‡∏Ç‡πâ‡∏≤‡∏ß", "‡∏ú‡∏•‡πÑ‡∏°‡πâ", "‡∏à‡∏≤‡∏ô", "‡πÅ‡∏Å‡πâ‡∏ß", "‡πÄ‡∏ô‡∏∑‡πâ‡∏≠", "‡∏õ‡∏•‡∏≤", "‡∏ú‡∏±‡∏Å"]
# Column labels (18 context words). Later cells pick subsets of them as
# verbs, classifiers and prepositions.
columns = ["‡∏Å‡∏¥‡∏ô", "‡∏î‡∏∑‡πà‡∏°", "‡∏ã‡∏∑‡πâ‡∏≠", "‡∏•‡πâ‡∏≤‡∏á", "‡πÄ‡∏Å‡πá‡∏ö", "‡∏õ‡∏£‡∏∏‡∏á", "‡∏´‡∏±‡πà‡∏ô", "‡πÅ‡∏ä‡πà", "‡∏Ç‡∏ß‡∏î",
           "‡∏ñ‡∏≤‡∏î", "‡∏ä‡∏≤‡∏°", "‡∏ñ‡πâ‡∏ß‡∏¢", "‡∏ä‡∏¥‡πâ‡∏ô", "‡∏ú‡∏•", "‡πÉ‡∏ô", "‡∏ö‡∏ô", "‡∏Å‡∏±‡∏ö", "‡πÅ‡∏•‡∏∞"]

# One list per row label, one value per column label
# (presumably co-occurrence counts or association scores — confirm the source).
data = [[25, 95, 42, 38, 12, 0, 0, 85, 90, 0, 0, 65, 0, 0, 75, 0, 85, 45],
        [82, 0, 35, 0, 45, 58, 0, 0, 0, 85, 90, 75, 0, 0, 65, 0, 78, 55],
        [68, 52, 73, 45, 38, 0, 75, 65, 0, 25, 0, 0, 85, 95, 45, 0, 65, 85],
        [0, 0, 28, 92, 85, 0, 0, 0, 0, 0, 72, 0, 0, 0, 0, 95, 45, 55],
        [0, 88, 32, 75, 62, 0, 0, 0, 87, 0, 0, 0, 0, 0, 0, 85, 35, 45],
        [81, 0, 60, 68, 56, 72, 85, 55, 0, 75, 65, 0, 95, 0, 45, 75, 85, 65],
        [85, 0, 65, 72, 48, 78, 82, 62, 0, 78, 68, 0, 92, 0, 52, 72, 88, 58],
        [75, 0, 70, 85, 52, 65, 88, 45, 0, 65, 55, 0, 0, 0, 35, 65, 92, 75]]

# Rebinds the module-level name `df` to this table.
df = pd.DataFrame(data, index=rows, columns=columns)

def cosine_calculation(word_pair, mode = "similarity"):
    """Describe the cosine similarity or distance between two rows of ``df``.

    Args:
        word_pair: Indexable pair of row labels of the module-level ``df``.
        mode: ``"similarity"`` reports the cosine similarity; any other
            value reports ``1 - similarity`` and is echoed in the message.

    Returns:
        A formatted message with the value rounded to four decimals.
    """
    vec_a = df.loc[word_pair[0]].values
    vec_b = df.loc[word_pair[1]].values
    cos_sim = np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))

    if mode == "similarity":
        result = cos_sim
    else:
        result = 1 - cos_sim

    return f"cosine {mode} ‡∏£‡∏∞‡∏´‡∏ß‡πà‡∏≤‡∏á '{word_pair[0]}' ‡πÅ‡∏•‡∏∞ '{word_pair[1]}' = {result:.4f}"

# print(cosine_calculation(("‡∏à‡∏≤‡∏ô", "‡∏õ‡∏•‡∏≤")))
# print(cosine_calculation(("‡πÄ‡∏ô‡∏∑‡πâ‡∏≠", "‡∏ô‡πâ‡∏≥")))
# print(cosine_calculation(("‡∏ú‡∏±‡∏Å", "‡∏Ç‡πâ‡∏≤‡∏ß")))
# print(cosine_calculation(("‡∏ú‡∏•‡πÑ‡∏°‡πâ", "‡πÅ‡∏Å‡πâ‡∏ß"), mode = "distance"))
# print(cosine_calculation(("‡∏Ç‡πâ‡∏≤‡∏ß", "‡πÅ‡∏Å‡πâ‡∏ß"), mode = "distance"))
# print(cosine_calculation(("‡∏õ‡∏•‡∏≤", "‡πÅ‡∏Å‡πâ‡∏ß"), mode = "distance"))

In [None]:
# Analyse the relationship between nouns (rows) and classifiers (columns).

# Columns of `df` that are treated as classifiers in this cell.
classifiers = ["‡∏Ç‡∏ß‡∏î", "‡∏ñ‡∏≤‡∏î", "‡∏ä‡∏≤‡∏°", "‡∏ñ‡πâ‡∏ß‡∏¢", "‡∏ä‡∏¥‡πâ‡∏ô", "‡∏ú‡∏•"]
# NOTE: this value is never read; `nouns` is reassigned inside the loop below.
nouns = df.index

classifier_df = df[classifiers]

for classifier in classifiers:
    # Highest value in this classifier's column ...
    relation = classifier_df[classifier].max()
    # ... and every row label that reaches it (ties are all kept).
    nouns = classifier_df[classifier_df[classifier] == relation].index
    print(f"‡∏•‡∏±‡∏Å‡∏©‡∏ì‡∏ô‡∏≤‡∏° '{classifier}' ‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏±‡∏°‡∏û‡∏±‡∏ô‡∏ò‡πå‡∏Å‡∏±‡∏ö \"{','.join(nouns)}\" ‡∏™‡∏π‡∏á‡∏™‡∏∏‡∏î - {relation}")

In [None]:
# Compare how the two preposition columns listed below are used with each noun.

prepositions = ["‡πÉ‡∏ô", "‡∏ö‡∏ô"]
for prep in prepositions:
    # Rank the rows by this column, largest value first.
    sorted_values = df[prep].sort_values(ascending=False)
    print(f"\n‡∏Å‡∏≤‡∏£‡πÉ‡∏ä‡πâ '{prep}':")
    for noun, value in sorted_values.items():
        # Rows with a zero value are not printed.
        if value > 0:
            print(f"{noun}: {value}")

In [None]:
# ‡∏Å‡∏≤‡∏£‡∏à‡∏±‡∏î‡∏Å‡∏•‡∏∏‡πà‡∏°‡∏Ñ‡∏≥‡∏ô‡∏≤‡∏°‡∏ó‡∏µ‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏±‡∏°‡∏û‡∏±‡∏ô‡∏ò‡πå‡πÉ‡∏Å‡∏•‡πâ‡πÄ‡∏Ñ‡∏µ‡∏¢‡∏á

from itertools import combinations

# Cosine similarity between two rows of `df`.
def mini_cosine(word1, word2):
    """Return the cosine similarity of the ``df`` rows labelled ``word1`` and ``word2``."""
    first, second = (df.loc[label].values for label in (word1, word2))
    scale = np.linalg.norm(first) * np.linalg.norm(second)
    return np.dot(first, second) / scale

# Score every unordered pair of row labels.
word_pairs = list(combinations(df.index, 2))

# Build (word, word, similarity) triples and order them from most to least similar.
similarities = sorted(
    [(first, second, mini_cosine(first, second)) for first, second in word_pairs],
    key=lambda entry: entry[2],
    reverse=True,
)

# Show the five most similar pairs.
print("‡∏Ñ‡∏π‡πà‡∏Ñ‡∏≥‡∏ó‡∏µ‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏±‡∏°‡∏û‡∏±‡∏ô‡∏ò‡πå‡πÉ‡∏Å‡∏•‡πâ‡πÄ‡∏Ñ‡∏µ‡∏¢‡∏á‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î:")
for pair in similarities[:5]:
    print(f"{pair[0]} - {pair[1]}: {pair[2]:.4f}")

In [None]:
# Compare each noun's usage pattern over the four verb columns below with the
# average pattern of all the other nouns (cosine distance, leave-one-out).

verbs = ["‡∏Å‡∏¥‡∏ô", "‡∏î‡∏∑‡πà‡∏°", "‡∏õ‡∏£‡∏∏‡∏á", "‡∏´‡∏±‡πà‡∏ô"]
verb_df = df[verbs]
distances = []

for noun in df.index:
    current_vec = verb_df.loc[noun].values

    # Mean verb vector of every noun except the current one.
    other_nouns = [n for n in df.index if n != noun]
    other_vectors = verb_df.loc[other_nouns].values
    mean_vec = np.mean(other_vectors, axis=0)

    # Cosine distance is undefined when either vector has zero length (a row
    # that is 0 for all four verbs). Record NaN explicitly instead of
    # dividing by zero, which would emit a RuntimeWarning.
    norm_product = np.linalg.norm(current_vec) * np.linalg.norm(mean_vec)
    if norm_product > 0:
        distance = 1 - np.dot(current_vec, mean_vec) / norm_product
    else:
        distance = float("nan")
    distances.append((noun, distance))

# Largest distance first. NaN never compares as smaller or larger, so sorting
# on it directly gives an unreliable order; put undefined entries last.
distances.sort(key=lambda x: (np.isnan(x[1]), -x[1]))

print("\n‡∏Ñ‡∏ß‡∏≤‡∏°‡πÅ‡∏ï‡∏Å‡∏ï‡πà‡∏≤‡∏á‡∏Ç‡∏≠‡∏á‡∏£‡∏π‡∏õ‡πÅ‡∏ö‡∏ö‡∏Å‡∏≤‡∏£‡πÉ‡∏ä‡πâ‡∏Å‡∏£‡∏¥‡∏¢‡∏≤ (‡πÄ‡∏£‡∏µ‡∏¢‡∏á‡∏à‡∏≤‡∏Å‡∏°‡∏≤‡∏Å‡πÑ‡∏õ‡∏ô‡πâ‡∏≠‡∏¢):")
for noun, dist in distances:
    print(f"{noun}: {dist:.4f}")
    # Raw values of this noun for the four verbs, kept for optional inspection.
    pattern = verb_df.loc[noun]

In [None]:
# Fill-in-the-blank sentence: rank candidate verbs that fit both of the two
# nouns looked up in the loop below.

target_verbs = df.columns[:8]  # the first eight columns are treated as the candidate verbs
scores = []

for verb in target_verbs:
    # Score = product of the two nouns' values for this verb, so a verb that
    # is 0 for either noun scores 0.
    score = df.loc["‡∏ô‡πâ‡∏≥", verb] * df.loc["‡πÅ‡∏Å‡πâ‡∏ß", verb]
    scores.append((verb, score))

# Highest score first.
scores.sort(key=lambda x: x[1], reverse=True)
print("‡∏Ñ‡∏∞‡πÅ‡∏ô‡∏ô‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏õ‡πá‡∏ô‡πÑ‡∏õ‡πÑ‡∏î‡πâ‡∏Ç‡∏≠‡∏á‡πÅ‡∏ï‡πà‡∏•‡∏∞‡∏Å‡∏£‡∏¥‡∏¢‡∏≤:")
for verb, score in scores:
    if score > 0:  # only verbs with a non-zero score are shown
        print(f"{verb}: {score}")

In [None]:
# Relative likelihood of each candidate classifier for one noun (the row
# label used in the lookups below).

classifiers = ["‡∏ä‡∏¥‡πâ‡∏ô", "‡∏ñ‡∏≤‡∏î", "‡∏ä‡∏≤‡∏°"] # only the classifiers considered relevant; rebinds `classifiers`

# Show the raw values before turning them into probabilities.
print("‡∏Ñ‡πà‡∏≤‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏±‡∏°‡∏û‡∏±‡∏ô‡∏ò‡πå‡∏£‡∏∞‡∏´‡∏ß‡πà‡∏≤‡∏á‡∏Ñ‡∏≥‡∏ß‡πà‡∏≤ '‡πÄ‡∏ô‡∏∑‡πâ‡∏≠' ‡∏Å‡∏±‡∏ö‡∏•‡∏±‡∏Å‡∏©‡∏ì‡∏ô‡∏≤‡∏°‡∏ï‡πà‡∏≤‡∏á‡πÜ:")
for clf in classifiers:
    print(f"{clf}: {df.loc['‡πÄ‡∏ô‡∏∑‡πâ‡∏≠', clf]}")

# Sum of the selected values; used as the normalising constant.
total = sum(df.loc["‡πÄ‡∏ô‡∏∑‡πâ‡∏≠", classifiers])
print(f"‡∏ú‡∏•‡∏£‡∏ß‡∏°‡∏Ñ‡πà‡∏≤‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏±‡∏°‡∏û‡∏±‡∏ô‡∏ò‡πå: {total}")

classifier_probs = []
for clf in classifiers:
    # Probability = this classifier's value divided by the total.
    prob = df.loc["‡πÄ‡∏ô‡∏∑‡πâ‡∏≠", clf] / total
    classifier_probs.append((clf, prob))

# Most probable classifier first.
classifier_probs.sort(key=lambda x: x[1], reverse=True)
print("\n‡∏Ñ‡∏ß‡∏≤‡∏°‡∏ô‡πà‡∏≤‡∏à‡∏∞‡πÄ‡∏õ‡πá‡∏ô‡∏Ç‡∏≠‡∏á‡πÅ‡∏ï‡πà‡∏•‡∏∞‡∏•‡∏±‡∏Å‡∏©‡∏ì‡∏ô‡∏≤‡∏°:")
for clf, prob in classifier_probs:
    print(f"{clf}: {prob:.4f} ({df.loc['‡πÄ‡∏ô‡∏∑‡πâ‡∏≠', clf]}/{total})")

## Sentiment Analysis

### Project: Language in Digital Media
**LG468 Language in Digital Media**

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from pythainlp.corpus.common import thai_stopwords
from wordcloud import WordCloud, STOPWORDS

In [2]:
# API Configuration

# The key is read from the environment so the credential is never stored in
# the notebook. Set AIFORTHAI_API_KEY before starting the kernel. Any key that
# was previously committed in this cell should be treated as leaked and revoked.
API_KEY = os.environ.get("AIFORTHAI_API_KEY", "")
if not API_KEY:
    print("Warning: AIFORTHAI_API_KEY is not set; API requests will be sent without a key.")

# Base URL and the two service endpoints used by the cells below.
API_FOR_THAI = "https://api.aiforthai.in.th"
SSSENSE_ENDPOINT = f"{API_FOR_THAI}/ssense"
TEXT_CLEANSING_ENDPOINT = f"{API_FOR_THAI}/textcleansing"

# Header sent with every request.
HEADERS = {"apikey": API_KEY}

In [None]:
# Data Loading and Preprocessing

def load_data(file_path):
    """Read a UTF-8 text file and return its lines without line endings.

    Args:
        file_path: Path of the file to read.

    Returns:
        List of lines, or an empty list (after printing a notice) when the
        file does not exist.
    """
    try:
        handle = open(file_path, 'r', encoding='utf-8')
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return []
    with handle:
        return handle.read().splitlines()

def cleanse_data(data, timeout=30):
    """Send every text to the text-cleansing endpoint and collect the results.

    Args:
        data: Iterable of raw text strings.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        List with the ``'cleansing_text'`` field of each response, in input order.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        requests.Timeout: If a request exceeds ``timeout``.
    """
    cleaned_data = []
    for text in data:
        response = requests.post(
            TEXT_CLEANSING_ENDPOINT,
            data={'text': text},
            headers=HEADERS,
            timeout=timeout,  # without a timeout a stalled connection hangs the notebook
        )
        # Surface HTTP failures (bad key, quota, server error) as such, rather
        # than as a KeyError on the missing field below.
        response.raise_for_status()
        cleaned_data.append(response.json()['cleansing_text'])
    return cleaned_data

# Forward slashes work on Windows, Linux and macOS. The previous backslash
# path only resolved on Windows; elsewhere load_data() reported the file as
# missing and returned an empty list.
data = load_data('datasets/test.csv')

# One API request per input line.
cleaned_data = cleanse_data(data)

['‡∏≠‡πâ‡∏≤‡∏ß ‡∏≠‡∏¢‡∏≤‡∏Å‡∏°‡∏µ‡πÄ‡∏£‡∏∑‡πà‡∏≠‡∏á‡∏´‡∏£‡∏≠‡∏ß‡∏∞ ‡πÑ‡∏≠‡πâ‡∏´‡∏±‡∏ß‡πÄ‡∏Å‡∏£‡∏µ‡∏¢‡∏ô!', '‡∏Ç‡∏≠‡πÇ‡∏ó‡∏©‡∏ó‡∏µ‡πà‡∏£‡∏ö‡∏Å‡∏ß‡∏ô‡∏Ñ‡∏∞', '‡∏£‡∏±‡∏Å‡πÄ‡∏ò‡∏≠‡∏ô‡∏∞ ‡∏Ñ‡∏ô‡∏î‡∏µ', '‡πÄ‡∏´‡∏µ‡πâ‡∏¢ ‡∏™‡∏±‡∏ï‡∏ß‡πå‡∏°‡∏∂‡∏á‡∏≠‡∏¢‡πà‡∏≤‡πÄ‡∏™‡∏∑‡∏≠‡∏Å‡∏î‡∏¥', '‡πÑ‡∏≠‡πâ‡∏Ñ‡∏ô‡∏ó‡∏£‡∏¢‡∏® ‡πÅ‡∏Å‡∏≠‡∏¢‡πà‡∏≤‡∏´‡∏ß‡∏±‡∏á‡∏ß‡πà‡∏≤‡∏à‡∏∞‡∏ï‡∏≤‡∏¢‡∏î‡∏µ', '‡∏î‡∏µ‡πÉ‡∏à‡∏î‡πâ‡∏ß‡∏¢‡∏ô‡∏∞ ‡∏Ñ‡∏∏‡∏ì‡∏´‡∏ç‡∏¥‡∏á‡∏Å‡∏§‡∏ï‡∏¢‡∏≤', '‡πÄ‡∏ò‡∏≠‡πÄ‡∏õ‡πá‡∏ô‡∏Ñ‡∏ß‡∏≤‡∏°‡∏†‡∏π‡∏°‡∏¥‡πÉ‡∏à‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î‡∏Ç‡∏≠‡∏á‡∏â‡∏±‡∏ô', '‡∏ó‡∏õ‡∏≠. ‡∏ß‡πà‡∏≤‡πÅ‡∏ï‡πà‡πÄ‡∏Ç‡∏≤‡∏≠‡∏¥‡πÄ‡∏´‡∏ô‡∏≤‡πÄ‡∏õ‡πá‡∏ô‡πÄ‡∏≠‡∏á ‡∏Ç‡∏µ‡πâ‡∏Ñ‡∏∏‡∏¢‡∏ä‡∏°‡∏±‡∏î', '‡∏ï‡∏¥‡∏î‡∏°‡∏´‡∏¥‡∏î‡∏• ‡∏£‡∏≠‡∏ö‡∏û‡∏≠‡∏£‡πå‡∏ï‡πÅ‡∏•‡πâ‡∏ß ‡πÄ‡∏¢‡πâ‡πÜ‡πÜ', '‡∏°‡∏∑‡∏≠‡∏ñ‡∏∑‡∏≠‡∏Ñ‡πâ‡∏≤‡∏á‡∏≠‡πà‡∏∞ ‡πÄ‡∏ã‡πá‡∏á‡∏à‡∏±‡∏á', '‡πÄ‡∏≠‡∏≤‡∏´‡∏°‡∏π‡∏Ñ‡∏∏‡πÇ‡∏£‡∏ö‡∏∏‡∏ï‡∏∞‡∏°‡∏≤‡∏ó‡∏≥‡∏™‡πÄ‡∏ï‡πä‡∏Å‡∏ô‡∏µ‡πà ‡∏Å‡∏¥‡∏ô‡πÅ‡∏•‡πâ‡∏ß‡∏ü‡∏¥‡∏ô‡∏Ç‡∏±‡πâ‡∏ô‡πÄ‡∏ó‡∏û‡∏û‡∏û‡∏û', '‡πÄ‡∏Å‡∏£‡∏ã‡πÑ‡∏°‡πà‡πÄ‡∏Ñ‡∏¢‡πÄ‡∏õ‡∏¥‡∏î‡πÉ‡∏à‡πÉ‡∏´‡πâ‡πÄ‡∏£‡∏≤ ‡πÑ

In [None]:
# Sentiment Analysis

def analyze_sentiment(data, min_confidence=50.0):
    """Run sentiment analysis on every text and keep the confident results.

    Args:
        data: Iterable of text strings to analyse.
        min_confidence: A result is kept only when its sentiment score is
            strictly greater than this value.

    Returns:
        Tuple ``(text, polarity, confidence, keywords, poswords, negwords)``.
        The first three are parallel lists with one entry per kept result;
        the last three are flat lists accumulated over all kept results.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    text = []
    polarity = []
    confidence = []
    keywords = []
    poswords = []
    negwords = []

    for text_data in data:
        response = requests.post(SSSENSE_ENDPOINT, data={'text': text_data}, headers=HEADERS)
        response.raise_for_status()

        # Parse the body once instead of on every field access.
        result = response.json()
        sentiment = result['sentiment']
        preprocess = result['preprocess']

        # Compare numerically. The previous string comparison against '50'
        # was lexicographic, so '100' was rejected and '6' was accepted.
        score = float(sentiment['score'])
        if score > min_confidence:
            text.append(preprocess['input'])
            polarity.append(sentiment['polarity'])
            confidence.append(score)
            keywords.extend(preprocess['keyword'])
            # 'pos' and 'neg' are only added when non-empty.
            if preprocess['pos']:
                poswords.extend(preprocess['pos'])
            if preprocess['neg']:
                negwords.extend(preprocess['neg'])

    return text, polarity, confidence, keywords, poswords, negwords

# Analyse the cleansed texts; note that this rebinds the module-level name `text`.
text, polarity, confidence, keywords, poswords, negwords = analyze_sentiment(cleaned_data)

In [None]:
# Data Processing and Output

def process_data(text, polarity, confidence):
    """Pair up the parallel result lists.

    Args:
        text: Analysed texts.
        polarity: Sentiment label of each text.
        confidence: Confidence score of each text.

    Returns:
        ``(confidence_lst, predicted_lst)`` where the first holds
        ``(polarity, confidence)`` tuples and the second ``(text, polarity)``
        tuples. Each list stops at the shorter of its two inputs.
    """
    confidence_lst = [(label, score) for label, score in zip(polarity, confidence)]
    predicted_lst = [(sentence, label) for sentence, label in zip(text, polarity)]
    return confidence_lst, predicted_lst

# confidence_lst: (polarity, confidence) pairs; predicted_lst: (text, polarity) pairs.
confidence_lst, predicted_lst = process_data(text, polarity, confidence)

# Both lists are printed in full, one entry per kept text.
print(confidence_lst)
print(predicted_lst)

In [None]:
# One row per kept text: its sentiment label and confidence score.
# Rebinds the module-level name `df`.
df = pd.DataFrame(confidence_lst, columns=['Sentiment', 'Confidence'])

sns.set_theme(style="whitegrid")

# Distribution of confidence scores, one box per sentiment label.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=df, x='Sentiment', y='Confidence', ax=ax)
ax.set_title("Confidence Scores by Sentiment")
ax.set_xlabel("Sentiment")
ax.set_ylabel("Confidence (%)")
plt.show()

In [None]:
# One row per kept text: its sentiment label and confidence score.
df = pd.DataFrame(confidence_lst, columns=['Sentiment', 'Confidence'])

# Ten edges between 50 and 100, i.e. nine equal-width bins.
# NOTE(review): the edges are fractional (about 5.56 wide); use 11 edges if
# 5-point bins were intended.
bins = np.linspace(50, 100, 10)

# include_lowest=True closes the first bin on the left so a score of exactly 50 is binned.
df['Confidence_Range'] = pd.cut(df['Confidence'], bins=bins, include_lowest=True)

# Count of texts per (confidence bin, sentiment) cell. `observed=False` spells
# out the long-standing default for categorical groupers, so the result does
# not change and recent pandas versions do not emit a FutureWarning.
pivot_df = df.pivot_table(values='Confidence', index='Confidence_Range',
                          columns='Sentiment', aggfunc='count', fill_value=0,
                          observed=False)

# Highest confidence range at the top of the heatmap.
pivot_df = pivot_df.sort_index(ascending=False)

plt.figure(figsize=(10, 8))
sns.heatmap(pivot_df, annot=False, cmap='YlOrRd', cbar_kws={'label': 'Count'})
plt.title("Confidence Scores by Sentiment")
plt.xlabel("Sentiment")
plt.ylabel("Confidence Score Ranges")
plt.tight_layout()
plt.show()

In [None]:
# Plotting Word Clouds

# Concatenate all texts of each polarity into one string per cloud.
text_neg = " ".join(text for text, sentiment in predicted_lst if sentiment == 'negative')
text_pos = " ".join(text for text, sentiment in predicted_lst if sentiment == 'positive')

fp = 'THSarabunNew.ttf'  # font file, resolved relative to the working directory
reg = r"[‡∏Å-‡πôa-zA-Z']+"  # token pattern handed to WordCloud
# `thai_stopwords` starts out as the imported function and is replaced by its
# result. The guard makes the cell safe to re-run; calling the already-built
# list again would raise TypeError.
if callable(thai_stopwords):
    thai_stopwords = list(thai_stopwords())

wordcloud_neg = WordCloud(stopwords=thai_stopwords, background_color='white', max_words=2000,
                          height=2000, width=4000, font_path=fp, regexp=reg).generate(text_neg)

wordcloud_pos = WordCloud(stopwords=thai_stopwords, background_color='white', max_words=2000,
                          height=2000, width=4000, font_path=fp, regexp=reg).generate(text_pos)

# Negative cloud on the left, positive cloud on the right.
fig, axs = plt.subplots(1, 2, figsize=(16, 8))

axs[0].imshow(wordcloud_neg, interpolation='bilinear')
axs[0].axis('off')
axs[0].set_title('Negative Sentiment')

axs[1].imshow(wordcloud_pos, interpolation='bilinear')
axs[1].axis('off')
axs[1].set_title('Positive Sentiment')

plt.show()