# Basics

In [None]:
# !pip3 install pythainlp
# !pip3 install https://github.com/PyThaiNLP/thai_sentiment_analysis/archive/master.zip

!pip3 install kenlm==0.2.0
!pip3 install pypdf==3.17.1
!pip3 install pytesseract==0.3.10
!pip3 install PyMuPDF==1.23.6
!pip3 install transformers==4.35.2

In [None]:
from pythainlp import word_tokenize, Tokenizer

# Sample Thai sentence segmented by each engine below.
text = "สมชายเห็นชอบกลบทบาทนี้"

# Each entry pairs an output label with the keyword arguments for
# word_tokenize; the empty dict leaves the engine at the library default.
for label, options in (("newmm  :", {}), ("longest:", {"engine": "longest"})):
    print(label, word_tokenize(text, **options))

# Computational Linguistics

## Reverse Dictionary

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# One (headword, part-of-speech tag, space-separated definition) triple per entry.
entries = [
    ("แล", "ก.", "ดู มอง"),
    ("เบิ่ง", "ก.", "ดู มอง เหลียวดู"),
    ("ผ่อ", "ก.", "ดู ดูแล มอง"),
]

# Transpose the triples into the column-oriented mapping the DataFrame is built from.
words, pos_tags, glosses = (list(column) for column in zip(*entries))
data = {"Word": words, "POS": pos_tags, "Definition": glosses}

df = pd.DataFrame(data)

def calculate_cosine_similarity(definitions):
    """Return the pairwise cosine-similarity matrix of TF-IDF vectors.

    Args:
        definitions: Iterable of definition strings whose words are already
            separated by whitespace.

    Returns:
        The square matrix produced by ``cosine_similarity``: entry ``[i, j]``
        is the similarity between definition ``i`` and definition ``j``.
    """
    # Tokenise on whitespace instead of the default token pattern
    # (r"(?u)\b\w\w+\b"). That pattern discards one-character tokens and,
    # because Thai vowel and tone marks are combining characters that \w does
    # not match, it cuts Thai words apart at every such mark.
    # token_pattern=None states that the pattern is intentionally unused.
    vectorizer = TfidfVectorizer(tokenizer=str.split, token_pattern=None)
    tfidf_matrix = vectorizer.fit_transform(definitions)
    return cosine_similarity(tfidf_matrix)

def pos_similarity(pos_list):
    """Return the fraction of unordered pairs in ``pos_list`` that are equal.

    Args:
        pos_list: Indexable sequence of part-of-speech tags.

    Returns:
        Matching pairs divided by total pairs, or 0 when fewer than two tags
        are given (no pairs to compare).
    """
    size = len(pos_list)
    # Number of unordered pairs (i, j) with i < j.
    pair_total = size * (size - 1) // 2
    if pair_total <= 0:
        return 0

    agreeing = sum(
        1
        for left in range(size)
        for right in range(left + 1, size)
        if pos_list[left] == pos_list[right]
    )
    return agreeing / pair_total

def definition_similarity(definitions):
    """Return the mean pairwise similarity between distinct definitions.

    The similarity matrix comes from ``calculate_cosine_similarity``; only the
    entries above the diagonal (each unordered pair once) are averaged.

    Args:
        definitions: Definition strings passed to ``calculate_cosine_similarity``.

    Returns:
        The average of the upper-triangle entries, or 0 when there is no pair.
    """
    scores = calculate_cosine_similarity(definitions)
    size = scores.shape[0]

    # Collect the strictly-upper-triangle entries in row-major order.
    upper_triangle = [
        scores[row, col]
        for row in range(size)
        for col in range(row + 1, size)
    ]
    if not upper_triangle:
        return 0
    return sum(upper_triangle) / len(upper_triangle)

# Weights of the POS and definition components in the blended score.
alpha = 0.5
beta = 0.5

# Inputs taken from the dictionary DataFrame.
pos = df["POS"].values
definitions = df["Definition"]

# Component scores, then their weighted sum.
pos_sim = pos_similarity(pos)
def_sim = definition_similarity(definitions)
similarity = alpha * pos_sim + beta * def_sim

report = (
    f"POS Similarity: {pos_sim:.2f}",
    f"Definition Similarity: {def_sim:.2f}",
    f"Overall Similarity: {similarity:.2f}",
)
print(*report, sep="\n")

## TF-IDF

In [None]:
import math
from collections import Counter
from typing import List, Dict

class TFIDFCalculator:
    """TF-IDF scorer over a corpus of pre-tokenised documents.

    Term frequency is the term's count in a document divided by that
    document's token count. Inverse document frequency is
    ``log2(N / df)``, where ``N`` is the number of documents and ``df`` the
    number of documents containing the term.
    """

    def __init__(self, documents: List[List[str]]):
        """Index the corpus.

        Args:
            documents: One list of tokens per document.
        """
        self.documents = documents
        self.doc_count = len(documents)
        # Per-document token counts; also used for O(1) membership checks.
        self.term_freq = [Counter(doc) for doc in documents]
        self.doc_lengths = [len(doc) for doc in documents]

    def calculate_tf(self, term: str, doc_idx: int) -> float:
        """Return the relative frequency of ``term`` in document ``doc_idx``.

        Returns 0.0 for an empty document.

        Raises:
            IndexError: If ``doc_idx`` is outside the corpus.
        """
        length = self.doc_lengths[doc_idx]
        if length == 0:
            return 0.0
        return self.term_freq[doc_idx][term] / length

    def calculate_idf(self, term: str) -> float:
        """Return ``log2(N / df)`` for ``term``, or 0.0 if no document has it."""
        # Look the term up in the per-document Counters instead of rescanning
        # every raw token list.
        doc_with_term = sum(1 for counts in self.term_freq if term in counts)
        if doc_with_term == 0:
            return 0.0
        return math.log2(self.doc_count / doc_with_term)

    def calculate_tfidf(self, terms: List[str]) -> Dict[str, List[float]]:
        """Score every term against every document.

        Args:
            terms: Terms to score.

        Returns:
            Mapping from each term to its TF-IDF scores, one per document in
            corpus order, each rounded to 4 decimal places.
        """
        results = {}
        for term in terms:
            # IDF does not depend on the document, so compute it once per term.
            idf = self.calculate_idf(term)
            results[term] = [
                round(self.calculate_tf(term, doc_idx) * idf, 4)
                for doc_idx in range(self.doc_count)
            ]
        return results

def main():
    """Print the non-zero TF-IDF scores of a few target words.

    Builds a ten-document corpus of pre-tokenised Thai texts, scores each
    target word against every document with ``TFIDFCalculator``, and prints
    one ``D<n>: <score>`` line per document (numbered from 1) whose score is
    positive.
    """
    corpus = [
        [
            "นวัตกรรม", "พลังงาน", "สะอาด", "เพื่อ", "โลก",
            "ยั่งยืน", "พลังงาน", "แสงอาทิตย์", "และ", "ลม",
            "กำลัง", "เป็นที่นิยม", "ใน", "ประเทศไทย", "นักวิทยาศาสตร์",
            "คาดว่า", "จะ", "ช่วย", "ลด", "การปล่อย",
            "ก๊าซ", "เรือนกระจก", "ได้", "อย่างมาก",
        ],
        [
            "เศรษฐกิจ", "ไทย", "ฟื้นตัว", "หลัง", "โควิด",
            "การท่องเที่ยว", "และ", "การส่งออก", "เป็น", "ปัจจัย",
            "สำคัญ", "ใน", "การ", "ขับเคลื่อน", "เศรษฐกิจ",
            "รัฐบาล", "เร่ง", "ออก", "มาตรการ", "กระตุ้น",
        ],
        [
            "นวัตกรรม", "ปัญญาประดิษฐ์", "ใน", "วงการ", "แพทย์",
            "AI", "ช่วย", "วินิจฉัย", "โรค", "ได้",
            "แม่นยำ", "ขึ้น", "โรงพยาบาล", "ใน", "ประเทศไทย",
            "เริ่ม", "นำ", "มา", "ใช้",
        ],
        [
            "การเปลี่ยนแปลง", "สภาพ", "ภูมิอากาศ", "กระทบ", "ภาค",
            "เกษตร", "เกษตรกร", "ไทย", "ปรับตัว", "รับมือ",
            "ภัยแล้ง", "และ", "น้ำท่วม", "นักวิทยาศาสตร์", "เร่ง",
            "คิดค้น", "พันธุ์พืช", "ทนทาน",
        ],
        [
            "พลังงาน", "นิวเคลียร์", "ทางเลือก", "หรือ", "ทางตัน",
            "ประเทศไทย", "ยัง", "ลังเล", "ใน", "การพัฒนา",
            "โรงไฟฟ้า", "นิวเคลียร์", "ขณะที่", "หลาย", "ประเทศ",
            "เดินหน้า", "เต็มที",
        ],
        [
            "การพัฒนา", "เมือง", "อัจฉริยะ", "ใน", "ประเทศไทย",
            "กรุงเทพฯ", "และ", "เมือง", "ใหม่", "เร่ง",
            "ปรับตัว", "สู่", "Smart City", "ใช้", "เทคโนโลยี",
            "IoT", "เพื่อ", "ยกระดับ", "คุณภาพ", "ชีวิต",
        ],
        [
            "วิกฤต", "ขยะ", "พลาสติก", "ใน", "ทะเลไทย",
            "นักวิทยาศาสตร์", "เตือน", "ผลกระทบ", "ต่อ", "ระบบนิเวศ",
            "รัฐบาล", "ออก", "มาตรการ", "ลด", "การใช้",
            "พลาสติก",
        ],
        [
            "5G", "เปลี่ยน", "โฉม", "อุตสาหกรรม", "ไทย",
            "ผู้ประกอบการ", "เร่ง", "ปรับตัว", "รับ", "เทคโนโลยี",
            "ใหม่", "คาด", "ช่วย", "เพิ่ม", "ประสิทธิภาพ",
            "การผลิต",
        ],
        [
            "การท่องเที่ยว", "เชิงนิเวศ", "บูม", "ใน", "ไทย",
            "นักท่องเที่ยว", "ต่างชาติ", "สนใจ", "ธรรมชาติ", "และ",
            "วัฒนธรรม", "ท่องถิ่น", "ช่วย", "กระจาย", "รายได้",
            "สู่", "ชุมชน",
        ],
        [
            "พลังงาน", "สะอาด", "กับ", "การพัฒนา", "ที่",
            "ยั่งยืน", "ประเทศไทย", "ตั้งเป้า", "เพิ่ม", "สัดส่วน",
            "พลังงาน", "หมุนเวียน", "นักลงทุน", "สนใจ", "ลงทุน",
            "ใน", "โครงการ", "พลังงาน", "แสงอาทิตย์", "และ",
            "ลม",
        ],
    ]
    target_words = ["พลังงาน", "นวัตกรรม", "เศรษฐกิจ", "ประเทศไทย", "เทคโนโลยี"]

    scores_by_term = TFIDFCalculator(corpus).calculate_tfidf(target_words)

    for term, scores in scores_by_term.items():
        print(f"\nคำว่า '{term}':")
        for doc_idx, score in enumerate(scores, start=1):
            # Only documents with a positive score are reported.
            if not score > 0:
                continue
            print(f"D{doc_idx}: {score:.4f}")


if __name__ == "__main__":
    main()

## Cosine Similarity

In [None]:
import numpy as np
from numpy.linalg import norm

# Two equal-length integer vectors to compare.
A = np.array([2, 1, 2, 3, 2, 9])
B = np.array([3, 4, 2, 4, 5, 5])

print("A:", A)
print("B:", B)

# cos(A, B) = (A . B) / (|A| * |B|)
dot_product = A @ B
magnitude_product = norm(A) * norm(B)
cosine = dot_product / magnitude_product
print(f"Cosine Similarity: {cosine:.4f}")

## Sentiment Analysis

### Project: Language in Digital Media
**LG468 Language in Digital Media**

In [None]:
import requests
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pythainlp.corpus.common import thai_stopwords
from wordcloud import WordCloud, STOPWORDS

In [None]:
# API Configuration

import os

# Credentials must not live in the notebook: the key is read from the
# AIFORTHAI_API_KEY environment variable. A key that was ever committed to
# source control should be treated as leaked and revoked.
API_KEY = os.environ.get("AIFORTHAI_API_KEY", "")
if not API_KEY:
    print("Warning: AIFORTHAI_API_KEY is not set; API requests will be sent without a key.")

API_FOR_THAI = "https://api.aiforthai.in.th"
SSSENSE_ENDPOINT = f"{API_FOR_THAI}/ssense"
TEXT_CLEANSING_ENDPOINT = f"{API_FOR_THAI}/textcleansing"

# Header dict passed with every request to the endpoints above.
HEADERS = {"apikey": API_KEY}

In [None]:
# Data Loading and Preprocessing

def load_data(file_path):
    """Read a UTF-8 text file and return its lines without line endings.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with one string per line, or an empty list (after printing a
        notice) when the file does not exist.
    """
    try:
        source = open(file_path, 'r', encoding='utf-8')
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return []

    with source:
        contents = source.read()
    return contents.splitlines()

def cleanse_data(data, timeout=30):
    """Run each text through the text-cleansing endpoint.

    Args:
        data: Iterable of raw text strings.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list with the ``cleansing_text`` field of each response, in input
        order.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    cleaned_data = []
    for text in data:
        # Without a timeout a stalled connection would block the notebook
        # indefinitely.
        response = requests.post(
            TEXT_CLEANSING_ENDPOINT,
            data={'text': text},
            headers=HEADERS,
            timeout=timeout,
        )
        # Surface HTTP failures directly instead of as a KeyError or JSON
        # decoding error on the error body.
        response.raise_for_status()
        cleaned_data.append(response.json()['cleansing_text'])
    return cleaned_data

# Use forward slashes in the path: a backslash is only treated as a directory
# separator on Windows, while forward slashes work on Windows, macOS and Linux.
data = load_data('datasets/sample50.csv')

cleaned_data = cleanse_data(data)

In [None]:
# Sentiment Analysis

def analyze_sentiment(data, min_confidence=50.0, timeout=30):
    """Score each text with the SSense endpoint and keep confident results.

    Args:
        data: Iterable of text strings to analyse.
        min_confidence: Results are kept only when their score is strictly
            greater than this value.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A 6-tuple of lists ``(text, polarity, confidence, keywords, poswords,
        negwords)``. The first three have one entry per kept result; the last
        three are the flattened ``keyword``/``pos``/``neg`` items of all kept
        results.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    text = []
    polarity = []
    confidence = []
    keywords = []
    poswords = []
    negwords = []

    for text_data in data:
        response = requests.post(
            SSSENSE_ENDPOINT,
            data={'text': text_data},
            headers=HEADERS,
            timeout=timeout,
        )
        response.raise_for_status()
        # Decode the body once rather than on every field access.
        result = response.json()
        sentiment = result['sentiment']

        # Compare numerically: as strings, "100.0" sorts below "50" and "6"
        # sorts above it, so a string comparison filters the wrong records.
        score = float(sentiment['score'])
        if not score > min_confidence:
            continue

        preprocess = result['preprocess']
        text.append(preprocess['input'])
        polarity.append(sentiment['polarity'])
        confidence.append(score)
        keywords.extend(preprocess['keyword'])
        if preprocess['pos']:
            poswords.extend(preprocess['pos'])
        if preprocess['neg']:
            negwords.extend(preprocess['neg'])

    return text, polarity, confidence, keywords, poswords, negwords

# Analyse the cleansed texts and unpack the six result lists into module-level
# names (this rebinds `text`, which an earlier cell used for a sample sentence).
text, polarity, confidence, keywords, poswords, negwords = analyze_sentiment(cleaned_data)

In [None]:
# Data Processing and Output

def process_data(text, polarity, confidence):
    """Pair up the parallel result sequences for reporting.

    Args:
        text: Analysed texts.
        polarity: Sentiment label of each text.
        confidence: Confidence score of each text.

    Returns:
        ``(confidence_lst, predicted_lst)``: a list of
        ``(polarity, confidence)`` tuples and a list of ``(text, polarity)``
        tuples. Each list stops at the shorter of its two inputs.
    """
    confidence_lst = [(label, score) for label, score in zip(polarity, confidence)]
    predicted_lst = [(sentence, label) for sentence, label in zip(text, polarity)]
    return confidence_lst, predicted_lst

confidence_lst, predicted_lst = process_data(text, polarity, confidence)

# Print the (polarity, confidence) pairs, then the (text, polarity) pairs,
# each list on its own line.
print(confidence_lst, predicted_lst, sep="\n")

In [None]:
# One row per kept result: its sentiment label and confidence score.
df = pd.DataFrame(confidence_lst, columns=['Sentiment', 'Confidence'])

sns.set_theme(style="whitegrid")

# Box plot of the confidence scores, one box per sentiment label.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x='Sentiment', y='Confidence', data=df, ax=ax)
ax.set_title("Confidence Scores by Sentiment")
ax.set_xlabel("Sentiment")
ax.set_ylabel("Confidence (%)")
plt.show()

In [None]:
# One row per kept result: its sentiment label and confidence score.
df = pd.DataFrame(confidence_lst, columns=['Sentiment', 'Confidence'])

# Ten evenly spaced edges from 50 to 100, i.e. nine confidence intervals.
bins = np.linspace(50, 100, 10)

df['Confidence_Range'] = pd.cut(df['Confidence'], bins=bins, include_lowest=True)

# Count results per (confidence interval, sentiment) cell.
# observed=False is passed explicitly because the index is categorical:
# pandas has deprecated relying on the implicit default, so pinning it keeps
# the current grouping behaviour and avoids the deprecation warning.
pivot_df = df.pivot_table(values='Confidence', index='Confidence_Range',
                          columns='Sentiment', aggfunc='count', fill_value=0,
                          observed=False)

# Highest confidence interval first, so it is drawn at the top of the heatmap.
pivot_df = pivot_df.sort_index(ascending=False)

plt.figure(figsize=(10, 8))
sns.heatmap(pivot_df, annot=False, cmap='YlOrRd', cbar_kws={'label': 'Count'})
plt.title("Confidence Scores by Sentiment")
plt.xlabel("Sentiment")
plt.ylabel("Confidence Score Ranges")
plt.tight_layout()
plt.show()

In [None]:
# Plotting Word Clouds

# Concatenate the texts of each sentiment class into one string per class.
text_neg = " ".join(sentence for sentence, sentiment in predicted_lst if sentiment == 'negative')
text_pos = " ".join(sentence for sentence, sentiment in predicted_lst if sentiment == 'positive')

fp = 'THSarabunNew.ttf'
reg = r"[ก-๙a-zA-Z']+"
# Store the stopwords under their own name. Rebinding `thai_stopwords` to the
# list would shadow the imported function, and re-running this cell would then
# fail with "TypeError: 'list' object is not callable".
stopword_list = list(thai_stopwords())


def _build_wordcloud(corpus):
    """Return a WordCloud generated from ``corpus`` with the shared settings."""
    return WordCloud(stopwords=stopword_list, background_color='white', max_words=2000,
                     height=2000, width=4000, font_path=fp, regexp=reg).generate(corpus)


wordcloud_neg = _build_wordcloud(text_neg)
wordcloud_pos = _build_wordcloud(text_pos)

# Negative cloud on the left panel, positive cloud on the right.
fig, axs = plt.subplots(1, 2, figsize=(16, 8))
panels = (
    (wordcloud_neg, 'Negative Sentiment'),
    (wordcloud_pos, 'Positive Sentiment'),
)
for ax, (cloud, title) in zip(axs, panels):
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title)

plt.show()