In [1]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

import warnings
# Suppress every warning for the rest of the notebook to keep cell output
# compact. Note that this also hides deprecation and convergence notices.
warnings.simplefilter("ignore")

# Setup Dataset & Check Dataset

In [2]:
# Read the labelled corpus from a CSV file located next to the notebook.
data = pd.read_csv("language_detection.csv")

In [3]:
# Preview the first five rows (five is the default for DataFrame.head).
data.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [4]:
# Number of samples per language, most frequent first.
data.Language.value_counts()

Language
English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: count, dtype: int64

In [5]:
# Raw inputs and targets. The text column was previously assigned to `y` and
# then immediately overwritten by the label column, so it was never kept.
X = data["Text"]      # raw sentences (replaced by the count matrix after vectorizing)
y = data["Language"]  # language name for each sentence

# Training the Model

In [6]:
from sklearn.preprocessing import LabelEncoder

# Encode the language names as integer class ids.
# Fit from the DataFrame column rather than from `y` itself: `y = le.fit_transform(y)`
# is not idempotent, because re-running the cell would refit the encoder on the
# already-encoded integers and `le.classes_` would no longer hold language names.
le = LabelEncoder()
y = le.fit_transform(data["Language"])

In [7]:
# Punctuation, symbols, newlines and digits are replaced with a space.
SYMBOLS_AND_DIGITS = re.compile(r'[!@#$(),\n"%^*?\:;~`0-9]')
# Either square bracket, e.g. from citation markers such as "[1]".
# The previous pattern r'[[]]' parsed as the class "[[]" followed by a literal
# "]", so it only matched the exact two-character sequence "[]".
SQUARE_BRACKETS = re.compile(r'[\[\]]')

# Cleaned, lower-cased copy of every text, in the same order as `data`.
data_list = []
for text in data.Text:
    text = SYMBOLS_AND_DIGITS.sub(' ', text)
    text = SQUARE_BRACKETS.sub(' ', text)
    data_list.append(text.lower())

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts: one row per text, one column per vocabulary token.
cv = CountVectorizer()
# Keep the matrix sparse. Converting it with .toarray() allocates a dense
# (n_texts x vocabulary) array -- roughly 3 GB for ~10k texts and ~39k tokens --
# while train_test_split and MultinomialNB both accept sparse input directly.
X = cv.fit_transform(data_list)

In [9]:
# (number of texts, number of vocabulary tokens) of the count matrix.
X.shape

(10337, 39404)

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation.
# - random_state pins the shuffle so the reported metrics are reproducible
#   under "Restart Kernel -> Run All".
# - stratify=y keeps the class proportions equal in both parts, which matters
#   for the smallest class (Hindi has only 63 samples).
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

: 

: 

: 

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes classifier trained on the token-count features.
model = MultinomialNB()
model.fit(X=x_train, y=y_train)

In [None]:
# Predicted class ids (LabelEncoder integers) for the held-out samples.
y_pred = model.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

ac = accuracy_score(y_test, y_pred)   # overall fraction of correct predictions
cm = confusion_matrix(y_test, y_pred)  # rows: true class, columns: predicted class
# Report per-class scores under the language names instead of the encoded
# integers. `labels` is passed explicitly so that the names stay aligned with
# the class ids even if a class happens to be absent from the test split.
cr = classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
)

In [None]:
# Overall accuracy on the held-out test split.
print("Accuracy is : ", ac)

In [None]:
# Per-class precision, recall, F1 and support (printed so the text table keeps its layout).
print(cr)

In [None]:
# Visualise the confusion matrix with language names on both axes.
# confusion_matrix (called without `labels`) orders its rows and columns by
# the sorted class ids that occur in y_test or y_pred; select the matching
# names, falling back to the full class list when the matrix covers every class.
if cm.shape[0] == len(le.classes_):
    language_names = le.classes_
else:
    language_names = le.classes_[np.union1d(y_test, y_pred)]

fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",  # whole counts; the default ".2g" renders e.g. 277 as 2.8e+02
    xticklabels=language_names,
    yticklabels=language_names,
    ax=ax,
)
ax.set(
    xlabel="Predicted language",
    ylabel="True language",
    title="Confusion matrix",
)
plt.show()

# Model Saving

In [None]:
# Bundle everything inference needs into one picklable dictionary.
# Note: the 'label_encoder' entry holds the array of class names
# (le.classes_), not the LabelEncoder object itself.
model_and_vectorizer = dict(
    model=model,
    vectorizer=cv,
    label_encoder=le.classes_,
)

print(le.classes_)

# Serialise the bundle to disk.
with open("model_and_label.pkl", "wb") as f:
    pickle.dump(model_and_vectorizer, file=f)

# Using the Saved Model

In [None]:
# Load the bundle written by the saving step. The file name must match the
# one used there ("model_and_label.pkl"); the previous "model.pkl" is never
# created by this notebook.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open("model_and_label.pkl", "rb") as f:
    loaded_data = pickle.load(f)

# Extract the individual pieces from the dictionary.
model = loaded_data['model']  # trained classifier
vectorizer = loaded_data['vectorizer']  # fitted CountVectorizer
label_encoder_classes = loaded_data['label_encoder']  # array of class names

# Make sure predict is called on the model, not on the dictionary.
# NOTE(review): the training texts had symbols and digits stripped before
# vectorizing; inputs containing such characters should get the same cleanup.
input_text = ["hello"]
input_vector = vectorizer.transform(input_text)  # vectorize the input
predicted_class = model.predict(input_vector)  # predicted class id(s)

# Decode the predicted class id back to the original label.
predicted_label = label_encoder_classes[predicted_class[0]]
print(predicted_label)