In [None]:
from pythainlp.corpus.common import thai_stopwords
from wordcloud import WordCloud, STOPWORDS
from pythainlp import word_tokenize
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

In [None]:
class ThaiSentimentAnalyzer:
    """Bag-of-words + logistic-regression sentiment classifier for Thai text.

    Typical flow: ``load_data`` -> ``prepare_data`` -> ``train`` ->
    ``evaluate`` / ``predict``.
    """

    # Characters deleted before tokenisation: common punctuation plus the
    # Thai marks "ๆ" and "ฯ".
    _STRIP_TABLE = str.maketrans("", "", '?.;:!"ๆฯ')

    def __init__(self):
        """Load the Thai stopword list; the model itself is built by ``train``."""
        self.thai_stopwords = list(thai_stopwords())
        self.vectorizer = None
        self.model = None

    def _drop_stopwords(self, tokens):
        """Return the tokens whose lowercase form is not in the stopword list."""
        # Build the set per call so that edits made to ``self.thai_stopwords``
        # after construction are honoured, while lookups stay O(1) per token.
        stopwords = set(self.thai_stopwords)
        return [token for token in tokens if token.lower() not in stopwords]

    def _text_process(self, text):
        """Normalise one raw text into a space-separated token string.

        Steps: delete the characters in ``_STRIP_TABLE``, tokenise with
        ``word_tokenize``, discard whitespace-only tokens, remove stopwords.

        Args:
            text: Raw input text. ``None`` is treated as an empty string and
                other non-string values are converted with ``str``.

        Returns:
            The remaining tokens joined by single spaces.
        """
        if not isinstance(text, str):
            text = "" if text is None else str(text)
        cleaned = text.translate(self._STRIP_TABLE)
        # Join-then-split drops whitespace tokens produced by the tokenizer.
        tokens = " ".join(word_tokenize(cleaned)).split()
        return " ".join(self._drop_stopwords(tokens))

    def load_data(self, file_path, separator='\t', names=('text', 'sentiment'),
                  header=None, skiprows=0):
        """Read a delimited file into a DataFrame with missing values as ''.

        Args:
            file_path: Path or file-like object accepted by ``pd.read_csv``.
            separator: Field delimiter.
            names: Column names to assign (any sequence, or ``None``).
            header: Forwarded to ``pd.read_csv``.
            skiprows: Forwarded to ``pd.read_csv``.

        Returns:
            The loaded DataFrame with every NaN replaced by an empty string.
        """
        column_names = list(names) if names is not None else None
        df = pd.read_csv(file_path, sep=separator, names=column_names,
                         header=header, skiprows=skiprows)
        return df.replace(np.nan, '', regex=True)

    def prepare_data(self, df):
        """Tokenise ``df['text']`` and split into train and test sets.

        Adds a ``text_tokens`` column to ``df`` in place.

        Args:
            df: DataFrame with ``text`` and ``sentiment`` columns.

        Returns:
            ``(X_train, X_test, y_train, y_test)`` from a 70/30 split with a
            fixed random seed; the X parts are single-column DataFrames.
        """
        df['text_tokens'] = df['text'].apply(self._text_process)
        x = df[['text_tokens']]
        y = df['sentiment']
        return train_test_split(x, y, test_size=0.3, random_state=101)

    def train(self, X_train, y_train, max_iter=20000):
        """Fit the bag-of-words vectorizer and the logistic-regression model.

        Args:
            X_train: DataFrame containing a ``text_tokens`` column.
            y_train: Labels aligned with ``X_train``.
            max_iter: Iteration cap passed to ``LogisticRegression``.
        """
        # Texts are already tokenised and space-joined by ``_text_process``,
        # so the analyzer only needs to split on spaces.
        self.vectorizer = CountVectorizer(analyzer=lambda x: x.split(' '))
        train_bow = self.vectorizer.fit_transform(X_train['text_tokens'])

        self.model = LogisticRegression(max_iter=max_iter)
        self.model.fit(train_bow, y_train)

    def evaluate(self, X_test, y_test):
        """Return a text classification report for the held-out data.

        Args:
            X_test: DataFrame containing a ``text_tokens`` column.
            y_test: True labels aligned with ``X_test``.

        Returns:
            The string produced by ``classification_report``.
        """
        test_bow = self.vectorizer.transform(X_test['text_tokens'])
        test_predictions = self.model.predict(test_bow)
        # Argument order is (y_true, y_pred); reversing it swaps precision
        # and recall in the report.
        return classification_report(y_test, test_predictions)

    def predict(self, texts):
        """Predict the sentiment of each raw text.

        Args:
            texts: Iterable of raw text strings.

        Returns:
            A list of ``(text, prediction)`` tuples in input order.

        Raises:
            ValueError: If ``train`` has not been called yet.
        """
        if self.vectorizer is None or self.model is None:
            raise ValueError("Model not trained. Please train the model first.")

        texts = list(texts)
        if not texts:
            # Nothing to predict; avoid handing the model an empty matrix.
            return []

        # Vectorise and predict the whole batch in one call.
        processed = [self._text_process(text) for text in texts]
        bow = self.vectorizer.transform(processed)
        predictions = self.model.predict(bow)
        return list(zip(texts, predictions))

if __name__ == "__main__":
    # Demo run: train and score two independent analyzers, then classify a
    # couple of new sentences with the first one.

    # Model 1: file read with load_data's default settings.
    analyzer = ThaiSentimentAnalyzer()
    dataset1 = analyzer.load_data('datasets/sample.csv')
    X_train, X_test, y_train, y_test = analyzer.prepare_data(dataset1)
    analyzer.train(X_train, y_train)
    print("Model 1 Performance:")
    report1 = analyzer.evaluate(X_test, y_test)
    print(report1)

    # Model 2: comma-separated file; the first line is skipped
    # (presumably a header row).
    analyzer2 = ThaiSentimentAnalyzer()
    dataset2 = analyzer2.load_data(
        'datasets/wisesight_2.csv',
        separator=',',
        skiprows=1,
    )
    X_train2, X_test2, y_train2, y_test2 = analyzer2.prepare_data(dataset2)
    analyzer2.train(X_train2, y_train2)
    print("\nModel 2 Performance:")
    report2 = analyzer2.evaluate(X_test2, y_test2)
    print(report2)

    # Classify unseen sentences with the first model.
    new_texts = ["ฉันรักการเรียนรู้ภาษาไทย", "อาหารนี้แย่มาก"]
    predictions = analyzer.predict(new_texts)
    print("\nPredictions from Model 1:")
    for text, sentiment in predictions:
        print(f"Text: {text}\nSentiment: {sentiment}\n")