In [1]:
def extract_features(sentence, index):
    """Extract features for the word at a given position in a sentence.

    Args:
        sentence: Sequence of word strings.
        index: Position of the target word in ``sentence``.

    Returns:
        A dict of features for the word: its lowercased form, position flags
        (``is_first`` / ``is_last``), casing flags, prefixes and suffixes of
        length 1-3, the lowercased neighbouring words ('' at a sentence
        boundary), and the result of ``str.isdigit()``.
    """
    word = sentence[index]
    last_index = len(sentence) - 1

    return {
        'word': word.lower(),
        'is_first': index == 0,
        'is_last': index == last_index,
        'is_capitalized': word.istitle(),
        'is_all_caps': word.isupper(),
        'is_all_lower': word.islower(),
        # Slicing (rather than word[0] / word[-1]) keeps an empty token from
        # raising IndexError; for non-empty words the value is unchanged.
        'prefix-1': word[:1],
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'prev_word': '' if index == 0 else sentence[index - 1].lower(),
        'next_word': '' if index == last_index else sentence[index + 1].lower(),
        'is_numeric': word.isdigit(),
    }

In [2]:
import nltk
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Make sure the treebank dataset is available locally.
# nltk.data.find() reports a missing resource by raising LookupError, so that
# is the exception to catch before falling back to a download.
try:
    nltk.data.find('corpora/treebank/tagged')
except LookupError:
    nltk.download('treebank')

# --- Langkah 1 & 2: Ekstraksi Fitur dan Menyiapkan Dataset ---

def extract_features(sentence, index):
    """Build the feature dict for one token of a tagged sentence.

    Args:
        sentence: Sequence whose items hold the word string at position 0
            (for example ``(word, tag)`` pairs).
        index: Position of the target token in ``sentence``.

    Returns:
        A dict with the lowercased word, two casing flags, the last three
        characters, the lowercased previous word ('' for the first token),
        and the result of ``str.isdigit()``.
    """
    token = sentence[index][0]  # position 0 of each item is the word itself

    features = {}
    features['word'] = token.lower()
    features['is_capitalized'] = token.istitle()
    features['is_all_caps'] = token.isupper()
    features['suffix-3'] = token[-3:]
    if index == 0:
        # No left neighbour at the start of the sentence.
        features['prev_word'] = ''
    else:
        features['prev_word'] = sentence[index - 1][0].lower()
    features['is_numeric'] = token.isdigit()
    return features

# Load the tagged sentences and split them into training (first 80%) and test sets
tagged_sents = nltk.corpus.treebank.tagged_sents()
train_size = int(len(tagged_sents) * 0.8)
train_sents, test_sents = tagged_sents[:train_size], tagged_sents[train_size:]

# Turn the training sentences into per-token features (X) and targets (y)
X_train_features = []
y_train = []
for sent in train_sents:
    X_train_features.extend(extract_features(sent, i) for i in range(len(sent)))
    # Position 1 of each item is the tag, which is the prediction target
    y_train.extend(token[1] for token in sent)

# --- Step 3: Vectorize the features and train the model ---

# DictVectorizer converts each feature dict into a numeric vector
vectorizer = DictVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train_features)

print("Melatih model Logistic Regression...")
# Construct and fit in one expression; fit() returns the estimator itself.
# max_iter=1000 gives the solver enough iterations to learn
model = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train_vectorized, y_train
)
print("Pelatihan selesai!\n")

# --- Step 4: Evaluate and try out the model ---

# Prepare the test data the same way as the training data
X_test_features = []
y_test = []
for sent in test_sents:
    for i, token in enumerate(sent):
        X_test_features.append(extract_features(sent, i))
        y_test.append(token[1])  # position 1 of each item is the gold tag

# Reuse the SAME fitted vectorizer to transform the test data
X_test_vectorized = vectorizer.transform(X_test_features)

# Predict and compute accuracy
y_pred = model.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
print(f"Akurasi model pada data tes: {accuracy * 100:.2f}%\n")

# Try the model on a new sentence
print("--- Mencoba pada kalimat baru ---")
new_sentence = "Learning POS Tagging is challenging"
words_to_tag = new_sentence.split()

# Feature extraction reads the word from position 0 of each item, so pair
# every word with an empty placeholder tag
dummy_tagged_sentence = [(word, '') for word in words_to_tag]

# Extract features for every position, then vectorize them
new_features = []
for position in range(len(dummy_tagged_sentence)):
    new_features.append(extract_features(dummy_tagged_sentence, position))
new_vectorized = vectorizer.transform(new_features)

# Predict the tags
predicted_tags = model.predict(new_vectorized)

# Show each word next to its predicted tag
for word, tag in zip(words_to_tag, predicted_tags):
    print(f"('{word}', '{tag}')")

Melatih model Logistic Regression...
Pelatihan selesai!

Akurasi model pada data tes: 94.50%

--- Mencoba pada kalimat baru ---
('Learning', 'NNP')
('POS', '-NONE-')
('Tagging', 'NNP')
('is', 'VBZ')
('challenging', 'VBG')
