In [22]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import joblib
import re

In [29]:
# Ubah path sesuai dengan lokasi file Anda
file_path = r"C:\Users\ASUS\OneDrive\Documents\jupyter\UAS\tatoebas_data.csv"

# Muat data Tatoeba secara manual
with open(file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Proses setiap baris untuk memisahkan kalimat dan bahasa
data = []
for line in lines:
    match = re.match(r'^(.*?)\s*\((.*?)\)\s*$', line.strip())
    if match:
        sentence, language = match.groups()
        data.append([sentence, language])

# Konversi ke DataFrame
df = pd.DataFrame(data, columns=['Source sentence', 'Target language'])

# Tampilkan beberapa baris pertama untuk verifikasi
print(df.head(50))

       Source sentence                Target language
0        Selamat pagi!                      Indonesia
1        Good morning!                        Inggris
2         Buenos días!                        Spanyol
3             Bonjour!                       Perancis
4        Guten Morgen!                         Jerman
5          Buongiorno!                         Italia
6            おはようございます     Ohayou gozaimasu)! (Jepang
7                안녕하세요        Annyeonghaseyo)! (Korea
8                  早上好       Zǎoshang hǎo)! (Tiongkok
9          Доброе утро           Dobroe utro)! (Rusia
10          صباح الخير         Sabah al-khayr)! (Arab
11          शुभ प्रभात         Shubh Prabhat)! (India
12            Bom dia!                       Portugal
13           Günaydın!                          Turki
14  Habari ya asubuhi!                        Swahili
15     Chào buổi sáng!                        Vietnam
16       สวัสดีตอนเช้า  Sawasdee ton chao)! (Thailand
17            Καλημέρα      

In [32]:
# Persiapkan fitur dan target
X = df['Source sentence']
y = df['Target language']

# Vectorize text
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(X)

# Bagi data menjadi train dan test set
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

# Latih model
model = MultinomialNB()
model.fit(X_train, y_train)

# Evaluasi model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Akurasi: {accuracy * 100:.2f}%')

# Simpan model dan vectorizer
joblib.dump(model, 'language_prediction_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

# Definisikan fungsi untuk memprediksi bahasa dan negara asal kalimat
def predict_language_and_country(sentence):
    sentence_vectorized = vectorizer.transform([sentence])
    prediction = model.predict(sentence_vectorized)[0]
    
    # Mapping bahasa ke negara
    language_to_country = {
        'Indonesia': 'Indonesia',
        'Inggris': 'England',
        'Spanyol': 'Spain',
        'Perancis': 'France',
        'Jerman': 'Germany',
        'Italia': 'Italy',
        'Jepang': 'Japan',
        'Korea': 'Korea',
        'swedia' : 'sweden',
        
        # Tambahkan bahasa dan negara lain sesuai dengan data Anda
    }
    
    country = language_to_country.get(prediction, 'Unknown')
    return prediction, country

# Prediksi bahasa dan negara untuk kalimat baru
new_sentence = "Good morning!   "
language, country = predict_language_and_country(new_sentence)
print(f'Kalimat: "{new_sentence}"')
print(f'Bahasa Prediksi: {language}')
print(f'Negara Asal: {country}')

Akurasi: 0.00%
Kalimat: "Good morning!   "
Bahasa Prediksi: Inggris
Negara Asal: England
