In [None]:
!pip install pythainlp

Collecting pythainlp
  Downloading pythainlp-5.0.4-py3-none-any.whl (17.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.9/17.9 MB[0m [31m64.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pythainlp
Successfully installed pythainlp-5.0.4


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from pythainlp.tokenize import word_tokenize
from pythainlp.corpus import thai_stopwords
import string

# Function to preprocess Thai text
def preprocess_text(text):
    """Tokenize Thai text and drop punctuation and stopword tokens.

    Args:
        text: Raw text to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Fetch the stopword collection once per call instead of once per token.
    stopwords = thai_stopwords()
    # Tokenize text
    tokens = word_tokenize(text, engine='newmm')
    # Remove punctuation and stopwords. Punctuation is tested character by
    # character: `token in string.punctuation` is a substring test against a
    # str, so it let runs such as "!!" or "..." slip through.
    kept = [
        token for token in tokens
        if token not in stopwords
        and not all(char in string.punctuation for char in token)
    ]
    return ' '.join(kept)

# Load dev.csv
dev_data = pd.read_csv('dev.csv')

# Preprocess text
dev_data['processed_text'] = dev_data['text'].apply(preprocess_text)

# Vectorize text using TF-IDF.
# processed_text is already tokenized and space-joined, so tokens are
# recovered by splitting on whitespace. The default token_pattern is built on
# the regex class \w, which does not match Thai combining vowel/tone marks and
# would therefore cut the tokens apart again. token_pattern=None silences the
# "token_pattern will not be used" warning.
vectorizer = TfidfVectorizer(
    max_features=1000,
    tokenizer=str.split,
    token_pattern=None,
)
X = vectorizer.fit_transform(dev_data['processed_text'])

# Function to extract top keywords
def extract_keywords(text, vectorizer, top_n=5):
    """Suggest keywords for a text by borrowing them from similar dev documents.

    The text is preprocessed, vectorized, and compared by cosine similarity
    against the module-level matrix ``X``. Keywords of the ``top_n`` closest
    rows of the module-level ``dev_data`` frame are collected in rank order.

    Args:
        text: Raw text to find keywords for.
        vectorizer: Fitted vectorizer used to transform the text; it must be
            the one that produced ``X``.
        top_n: Number of neighbour documents to consult and also the maximum
            number of keywords returned.

    Returns:
        Up to ``top_n`` distinct keywords joined with ``|``; an empty string
        when ``top_n`` is not positive or no keywords are found.
    """
    if top_n <= 0:
        return ''
    # Preprocess and vectorize the text
    cleaned = preprocess_text(text)
    query = vectorizer.transform([cleaned])
    # One query row in, so row 0 holds the score against every dev document
    scores = cosine_similarity(query, X)[0]
    # Indices of the top N most similar documents, best first
    ranked = scores.argsort()[::-1][:top_n]
    keywords = []
    for idx in ranked:
        raw = dev_data.iloc[idx]['keywords']
        # An empty CSV cell is read as NaN (a float), which has no .split
        if not isinstance(raw, str):
            continue
        # Drop empty pieces produced by leading, trailing or doubled '|'
        keywords.extend(keyword for keyword in raw.split('|') if keyword)
    # dict.fromkeys removes duplicates while keeping first-seen order
    return '|'.join(list(dict.fromkeys(keywords))[:top_n])

# Read the rows that need predictions from test.csv
test_data = pd.read_csv('test.csv')

# Predict a keyword string for every test text, in row order
test_data['keywords'] = [
    extract_keywords(document, vectorizer) for document in test_data['text']
]

# Write only the id and keywords columns to output.csv, without the index
test_data.to_csv('output.csv', columns=['id', 'keywords'], index=False)
