In [None]:
import os
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Location of the Tatoeba CSV export. Set the TATOEBA_DATA_PATH environment
# variable to run the notebook on another machine; the original local path is
# kept as the fallback so existing setups keep working unchanged.
data_path = os.environ.get(
    "TATOEBA_DATA_PATH",
    "C:\\Users\\ASUS\\OneDrive\\Documents\\jupyter\\tatoeba_data.csv",
)
df = pd.read_csv(data_path)

# Preview the first rows to check that the columns were parsed as expected.
df.head()

In [None]:
# Fetch the NLTK stopword corpus that stopwords.words() reads from.
nltk.download('stopwords')

# Keep the English stopword list as a set; it is used for membership tests
# during preprocessing.
stop_words = {*stopwords.words('english')}

# Translation table that deletes every ASCII punctuation character in one pass.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text, remove_stopwords=True):
    """Normalise a sentence: lowercase, strip ASCII punctuation, drop stopwords.

    Args:
        text: The raw sentence. Missing values (``None`` or a float NaN, which
            is how pandas represents empty CSV cells) are treated as an empty
            sentence instead of raising; any other non-string value is
            converted with ``str`` first.
        remove_stopwords: When true (the default, matching the previous
            behaviour), tokens found in the module-level ``stop_words`` set
            are removed. Pass ``False`` to keep every token.

    Returns:
        The cleaned sentence with tokens joined by single spaces, or ``''``
        when the input is missing or nothing is left after cleaning.
    """
    # NaN is the only value that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Only the characters in string.punctuation (ASCII) are removed; accented
    # letters and non-ASCII punctuation pass through untouched.
    text = str(text).lower().translate(_PUNCTUATION_TABLE)
    tokens = text.split()

    if remove_stopwords:
        # NOTE(review): stop_words is built from the English list only, so this
        # strips function words from English sentences but not from other
        # languages. For language identification those words are likely useful
        # features -- consider calling with remove_stopwords=False.
        tokens = [word for word in tokens if word not in stop_words]

    return ' '.join(tokens)

# Store a cleaned copy of every source sentence in a new column, leaving the
# raw 'Source sentence' column untouched.
df['processed_sentence'] = df['Source sentence'].map(preprocess_text)

# Preview the frame with the new column.
df.head()

In [None]:
# Inputs are the cleaned sentences; labels are taken from 'Target language'.
# NOTE(review): the text comes from 'Source sentence' but the label from
# 'Target language' -- confirm that column really holds the language of the
# sentence being classified.
X, y = df['processed_sentence'], df['Target language']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the TF-IDF vocabulary and weights from the training split only, then
# reuse the fitted vectorizer on the test split.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Logistic regression with scikit-learn's default settings.
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

In [None]:
# Predict labels for the held-out split and compare them with the truth.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# accuracy is a fraction in [0, 1]; report it as a percentage.
print(f"Accuracy: {accuracy * 100:.2f}%")

In [None]:
def predict_language(sentence):
    """Return the label the trained model predicts for one raw sentence.

    The sentence is cleaned with ``preprocess_text``, vectorized with the
    already-fitted module-level ``vectorizer``, and classified by ``model``.

    Args:
        sentence: The raw sentence to classify.

    Returns:
        The single predicted label (first element of ``model.predict``).
    """
    cleaned = preprocess_text(sentence)
    # The vectorizer expects an iterable of documents, hence the 1-item list.
    features = vectorizer.transform([cleaned])
    return model.predict(features)[0]

# Hand-picked sample sentences used to spot-check the classifier.
sentences_to_predict = [
    "wie al bist du",
    "Je parle français",
    "Wo ist die nächste Haltestelle?",
    "What time is it?",
    "La météo est agréable aujourd'hui"
]

# Classify each sample, keeping the results in input order.
predictions = list(map(predict_language, sentences_to_predict))

# Print each input next to its predicted label
# (output labels are Indonesian: "Kalimat" = sentence, "Bahasa" = language).
for sentence, lang in zip(sentences_to_predict, predictions):
    print(f"Kalimat: {sentence} -> Bahasa: {lang}")