In [28]:
import pickle

import pandas as pd
from google.colab import drive
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC  # Menggunakan LinearSVC untuk klasifikasi teks

In [30]:
# 1. Make Google Drive available under /content/drive (Colab runtime only).
drive.mount('/content/drive')

# 2. Read the Sunda-Indonesia word list CSV stored on Drive into a DataFrame.
file_path = '/content/drive/MyDrive/UAS_NLP/Kamus_Bahasa_Sunda_Indonesia.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
# Preview the first rows to confirm the file loaded with the expected columns.
print(data.head())

  No     Sunda Indonesia
0   1   Angel     Bantal
1   2   Ayeuna  sekarang
2   3   Angger     Tetap
3   4    Anggo     Pakai
4   5  Anggoan   Pakaian


In [32]:
# 3. Preprocessing
# Fill missing cells BEFORE casting to str. Casting first can turn NaN into the
# literal text 'nan', after which fillna('') has nothing left to replace and
# the placeholder text would be used as a real word / label.
data['Sunda'] = data['Sunda'].fillna('').astype(str)
data['Indonesia'] = data['Indonesia'].fillna('').astype(str)

In [33]:
# 4. Hold out 40% of the word pairs for evaluation; the fixed seed keeps the
#    split identical between runs.
sunda_words = data['Sunda']
indonesia_words = data['Indonesia']
X_train, X_test, y_train, y_test = train_test_split(
    sunda_words,
    indonesia_words,
    test_size=0.4,
    random_state=42,
)

In [34]:
# 5. Create pipeline: TF-IDF text features feeding a linear SVM classifier.
model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    # Same seed as the train/test split, so that re-running the notebook
    # trains the same classifier instead of depending on solver randomness.
    ('svm', LinearSVC(random_state=42)),
])


In [35]:
# 6. Train model
# Fits every pipeline step on the training split only: the TF-IDF step learns
# its vocabulary from X_train, then the SVM is trained on those features.
model.fit(X_train, y_train)

In [36]:
# 7. Function to translate and handle unknown words
def translate_sunda_to_indonesia(text):
    """Translate Sundanese text to Indonesian with the trained pipeline.

    The classifier always answers with one of its training labels, even when
    none of the input's tokens were seen during fitting; ``predict`` does not
    raise ``KeyError`` in that situation. Out-of-vocabulary input is therefore
    detected explicitly: when the fitted TF-IDF step produces no non-zero
    feature for ``text``, nothing in it is known and "Unknown" is returned.

    Args:
        text: Sundanese word or phrase to translate.

    Returns:
        The predicted Indonesian label, or the string "Unknown" when the
        input contains no token from the fitted vocabulary.
    """
    features = model.named_steps['tfidf'].transform([text])
    # An all-zero feature row means every token is outside the vocabulary,
    # so any prediction would be an arbitrary guess.
    if features.nnz == 0:
        return "Unknown"
    return model.predict([text])[0]

In [37]:
# 8. Test the translation
sunda_text = "anggo"  # Replace with the Sunda text you want to translate
indonesia_translation = translate_sunda_to_indonesia(sunda_text)
print(f"Sunda: {sunda_text}")
print(f"Indonesia: {indonesia_translation}")

# (Optional) Evaluate the model: share of held-out words whose predicted
# translation matches the reference label exactly.
# accuracy_score is imported in the notebook's top import cell.
accuracy = accuracy_score(y_test, model.predict(X_test))
print(f"Accuracy: {accuracy}")

Sunda: anggo
Indonesia: Pakai
Accuracy: 0.33551769331585846


In [38]:
# Persist the trained pipeline to Drive so that it can be reloaded afterwards.
path = '/content/drive/My Drive/UAS_NLP/TMM_Sunda-Indonesia_model.pkl'
with open(path, mode='wb') as model_file:  # pickle needs a binary-mode handle
    pickle.dump(model, model_file)

In [39]:
# Read the pickled pipeline back to confirm that the saved file is usable.
# NOTE: pickle.load can execute code embedded in the file - only load files you trust.
with open(path, mode='rb') as model_file:
    loaded_model = pickle.load(model_file)

In [40]:
# Translate a sample word with the reloaded pipeline.
sunda_text = "abdi"
predictions = loaded_model.predict([sunda_text])
indonesia_translation = predictions[0]
print(f"Sunda: {sunda_text}")
print(f"Indonesia: {indonesia_translation}")

Sunda: abdi
Indonesia: Saya


In [41]:
# Check whether the TF-IDF step has been fitted.
# `idf_` is declared as a property on the TfidfVectorizer class, so it is listed
# by dir() even for an unfitted vectorizer and the old check could never print
# False. `vocabulary_` is assigned during fitting, so its presence is used instead.
print("apakah tfidf sudah di-fit?")
print(hasattr(model.named_steps['tfidf'], 'vocabulary_'))

apakah tfidf sudah di-fit?
True
