In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud,STOPWORDS
import numpy as np
%matplotlib inline

In [None]:
# Load the language-identification CSV into `df`. The path is an absolute
# Kaggle input path, so this cell presumably only runs as-is inside that environment.
df=pd.read_csv("/kaggle/input/language-identification-datasst/dataset.csv")

In [None]:
# Preview the first rows to confirm the file loaded and see the column layout.
df.head()

In [None]:
# (rows, columns) of the frame as loaded, before duplicates are dropped.
df.shape

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Rows per language label, to check how balanced the classes are.
df["language"].value_counts()

In [None]:
# Horizontal bar chart: number of samples per language in the frame as loaded.
fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(data=df, y="language", palette="tab10", ax=ax)
plt.show()

In [None]:
# Keep the first row for each distinct Text value, then renumber the index from 0.
df = (
    df
    .drop_duplicates(subset='Text')
    .reset_index(drop=True)
)

In [None]:
# Same per-language bar chart, now on the de-duplicated frame.
fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(data=df, y="language", palette="tab10", ax=ax)
plt.show()

In [None]:
# Word cloud of the rows labelled 'English'.
english_text_df = df.loc[df['language'] == 'English']

# Stop-word set shipped with the wordcloud package; later word-cloud cells reuse it.
stopwords = set(STOPWORDS)

# All English texts glued into one string, separated by two spaces.
text2 = "  ".join(english_text_df['Text'])

wordcloud = WordCloud(
    stopwords=stopwords,
    background_color="skyblue",
    max_words=100,
    max_font_size=50,
).generate(text2)

fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
plt.show()

In [None]:
# Word cloud of the rows labelled 'Swedish'. The text lives in the 'Text' column.
# `stopwords` is the set built in the English word-cloud cell, so it is not
# Swedish-specific.
swedish_text_df = df.loc[df['language'] == 'Swedish']
text2 = "  ".join(swedish_text_df['Text'])

wordcloud = WordCloud(
    stopwords=stopwords,
    background_color="orange",
    max_words=100,
    max_font_size=50,
).generate(text2)

fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
plt.show()


In [None]:
# Word cloud of the rows labelled 'Dutch', again filtered with the `stopwords`
# set created in the English word-cloud cell.
dutch_text_df = df.loc[df['language'] == 'Dutch']
text2 = "  ".join(dutch_text_df['Text'])

wordcloud = WordCloud(
    stopwords=stopwords,
    background_color="black",
    max_words=100,
    max_font_size=50,
).generate(text2)

fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
plt.show()



In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the language names with integer codes. The column is overwritten in
# place, so re-running this cell would fit the encoder on the integer codes
# instead of the names; run it once per fresh kernel.
le = LabelEncoder()
le.fit(df["language"])
df["language"] = le.transform(df["language"])

In [None]:
# Distinct integer codes now stored in the language column.
df["language"].unique()

In [None]:
# decoded_languages = le.inverse_transform(df["language"])
# df["decoded_language"] = decoded_languages

# print(df[["decoded_language", "language"]])


In [None]:
# Map the integer codes back to language names and keep them in their own column.
decoded_languages = le.inverse_transform(df["language"])
df["decoded_language"] = decoded_languages

# Sorted distinct names and sorted distinct codes; later cells use the names
# as axis/target labels.
unique_decoded_languages = sorted(df["decoded_language"].unique())
unique_languages = sorted(df["language"].unique())

separator = "======================================================================================================================"
summary_lines = (
    ("Unique Decoded Languages (Ascending Order):", unique_decoded_languages),
    ("Unique Languages (Ascending Order):", unique_languages),
)
for heading, values in summary_lines:
    print(separator)
    print(heading, values)
print(separator)

In [None]:
# Inspect the frame after encoding: `language` holds the integer code and
# `decoded_language` the original name.
df.head()

In [None]:
# Mean number of characters per text, measured before any cleaning.
text_lengths = [len(text) for text in df["Text"]]
total_length = sum(text_lengths)
num_texts = len(text_lengths)
average_length = total_length / num_texts
print("Average text length:", average_length)

# Data Cleaning

In [None]:
import functools
import re
import unicodedata

from bs4 import BeautifulSoup
# Imported under an alias so this cell no longer rebinds the notebook-level
# `stopwords` variable (the set the word-cloud cells pass to WordCloud).
from nltk.corpus import stopwords as nltk_stopwords


@functools.lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, loaded only once."""
    return frozenset(nltk_stopwords.words('english'))


def clean_text(text, ascii_only=True):
    """Normalise one raw text sample into space-separated word tokens.

    Steps, in order:
      1. Strip HTML markup (only attempted when the text contains "<").
      2. Remove http(s):// and www. URLs.
      3. Reduce the text to letters:
         - ascii_only=True (default, original behaviour): fold accents with
           NFKD, drop every non-ASCII character, and replace anything that is
           not a-z / A-Z with a space.
         - ascii_only=False: keep Unicode letters and combining marks of any
           script; replace everything else with a space.
      4. Drop English stop words (case-insensitive) and join the remaining
         words with single spaces.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. a NaN cell) yields "".
    ascii_only : bool, default True
        Whether to restrict the output to ASCII letters.

    Returns
    -------
    str
        The cleaned text; may be empty.

    NOTE(review): with the default ascii_only=True, text written in a
    non-Latin script is reduced to an empty string. If the dataset contains
    such languages, the caller probably wants ascii_only=False -- confirm
    before changing, since it alters every downstream score.
    """
    # Missing cells arrive as float NaN / None; treat them as empty text
    # instead of failing on the `"<" in text` check below.
    if not isinstance(text, str):
        return ""

    # Remove HTML tags if present
    if "<" in text:
        text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URL addresses
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    if ascii_only:
        # Fold accented characters to their ASCII base letter, dropping the rest
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        # One pass removes punctuation, digits and any other non-letter
        text = re.sub(r'[^a-zA-Z]', ' ', text)
    else:
        # Keep letters (L*) and combining marks (M*) so scripts that build
        # words from base letter + mark are not split apart.
        text = ''.join(
            ch if unicodedata.category(ch)[0] in 'LM' else ' '
            for ch in text
        )

    # split() already collapses runs of whitespace and trims both ends
    stop_words = _english_stopwords()
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)

In [None]:
# Clean every sample; the Text column is overwritten with the cleaned version.
df["Text"] = df["Text"].map(clean_text)

In [None]:
# Check the effect of clean_text on the first rows.
df.head()

In [None]:
# Features are the cleaned texts; targets are the integer language codes.
X, Y = df["Text"], df["language"]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [None]:
# Turn the cleaned texts into a sparse TF-IDF matrix.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so its vocabulary and IDF weights also see the test texts.
tf = TfidfVectorizer()
tf.fit(X)
train_data = tf.transform(X)
print(train_data)

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 train/test split of the TF-IDF matrix, with a fixed seed for repeatability.
splits = train_test_split(train_data, Y, test_size=0.3, random_state=42)
X_train, X_test, Y_train, Y_test = splits

In [None]:
# Support-vector classifier with default hyper-parameters, trained on TF-IDF features.
svm = SVC()
svm.fit(X=X_train, y=Y_train)

In [None]:
# Accuracy of the TF-IDF SVM on the held-out 30 %.
y_pred = svm.predict(X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

![](https://static.packt-cdn.com/products/9781838555078/graphics/C13314_06_05.jpg)

# Confusion matrix

In [None]:
# Confusion matrix for the TF-IDF SVM. Tick labels are the sorted language
# names, which are assumed to line up with the sorted integer codes.
cf = confusion_matrix(Y_test, y_pred)
label_name = unique_decoded_languages

fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(
    cf,
    annot=True,
    fmt="d",
    cmap="gnuplot",
    linewidths=3,
    linecolor='navy',
    xticklabels=label_name,
    yticklabels=label_name,
    ax=ax,
)
ax.set_title("Confusion Matrix", fontsize=20, color="red")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

# Classification Report

In [None]:
# Per-language precision / recall / F1 for the TF-IDF SVM.
report = classification_report(Y_test, y_pred, target_names=label_name)
print(report)

# ROC AUC

In [None]:
from yellowbrick.classifier import ROCAUC

# Create the 20x15 figure first and hand its axes to the visualizer.
# Previously plt.figure() was called after score() had already drawn on the
# current axes, so the requested size was never applied and an extra blank
# figure was shown.
fig, ax = plt.subplots(figsize=(20, 15))

# A fresh, unfitted SVC: the visualizer fits its own copy of the model.
model = SVC()
visualizer = ROCAUC(model, classes=label_name, ax=ax)

visualizer.fit(X_train, Y_train)    # train on the TF-IDF training split
visualizer.score(X_test, Y_test)    # evaluate on the test split and draw the curves
visualizer.show()


# Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Multinomial Naive Bayes with default settings, trained on the TF-IDF features.
nb = MultinomialNB()
nb.fit(X=X_train, y=Y_train)

In [None]:
# Accuracy of the TF-IDF Naive Bayes model on the held-out split.
pred1 = nb.predict(X_test)
accuracy1 = accuracy_score(y_true=Y_test, y_pred=pred1)
print(f"Accuracy: {accuracy1 * 100:.2f}%")

In [None]:
# Confusion matrix for the TF-IDF Naive Bayes predictions.
cf0 = confusion_matrix(Y_test, pred1)
label_name = unique_decoded_languages

fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(
    cf0,
    annot=True,
    fmt="d",
    cmap="rainbow",
    linewidths=3,
    linecolor='orange',
    xticklabels=label_name,
    yticklabels=label_name,
    ax=ax,
)
ax.set_title("Confusion Matrix", fontsize=20, color="red")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# Per-language precision / recall / F1 for the TF-IDF Naive Bayes model.
report = classification_report(Y_test, pred1, target_names=label_name)
print(report)

In [None]:
# Create the 20x15 figure before drawing and pass its axes to the visualizer.
# Opening the figure after score() left the curves on a default-sized figure
# and added an empty second one.
fig, ax = plt.subplots(figsize=(20, 15))

# A fresh, unfitted Naive Bayes model for the visualizer to train.
model = MultinomialNB()
visualizer = ROCAUC(model, classes=label_name, ax=ax)

visualizer.fit(X_train, Y_train)    # train on the TF-IDF training split
visualizer.score(X_test, Y_test)    # evaluate on the test split and draw the curves
visualizer.show()

# CountVectorizer

In [None]:
import numpy as np

# Same cleaned texts and integer labels, under new names for the
# CountVectorizer experiment.
X_data, Y_data = df["Text"], df["language"]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words term counts instead of TF-IDF weights.
# NOTE(review): as with the TF-IDF cell, the vocabulary is learned from all
# rows before the train/test split.
vectorizer = CountVectorizer()
vectorizer.fit(X_data)
new_data = vectorizer.transform(X_data)

In [None]:
# 70/30 split of the count matrix, using the same seed as the TF-IDF split.
count_splits = train_test_split(new_data, Y_data, test_size=0.3, random_state=42)
x_train, x_test, y_train, y_test = count_splits

In [None]:
# Default SVC trained on the CountVectorizer features.
new_svm = SVC()
new_svm.fit(X=x_train, y=y_train)

In [None]:
# Accuracy of the count-feature SVM on its held-out split.
new_pred = new_svm.predict(x_test)
new_accuracy = accuracy_score(y_true=y_test, y_pred=new_pred)
print(f"Accuracy: {new_accuracy * 100:.2f}%")

In [None]:
# Confusion matrix for the count-feature SVM predictions.
cf2 = confusion_matrix(y_test, new_pred)
label_name = unique_decoded_languages

fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(
    cf2,
    annot=True,
    fmt="d",
    cmap="turbo",
    linewidths=3,
    linecolor='black',
    xticklabels=label_name,
    yticklabels=label_name,
    ax=ax,
)
ax.set_title("Confusion Matrix", fontsize=20, color="red")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# Per-language precision / recall / F1 for the count-feature SVM.
report = classification_report(y_test, new_pred, target_names=label_name)
print(report)

In [None]:
# Create the 20x15 figure up front and give its axes to the visualizer.
# Calling plt.figure() after score() only opened an extra blank figure and
# left the curves at the default size.
fig, ax = plt.subplots(figsize=(20, 15))

# A fresh, unfitted SVC for the visualizer to train on the count features.
new_model = SVC()
visualizer = ROCAUC(new_model, classes=label_name, ax=ax)

visualizer.fit(x_train, y_train)    # train on the count-feature training split
visualizer.score(x_test, y_test)    # evaluate on the test split and draw the curves
visualizer.show()

# Naive Bayes + CountVectorizer

In [None]:
# Multinomial Naive Bayes trained on the CountVectorizer features.
new_nb = MultinomialNB()
new_nb.fit(X=x_train, y=y_train)

In [None]:
# Evaluate the Naive Bayes model trained on the count features. The original
# cell called `nb`, the model fitted on TF-IDF features, so this section's
# accuracy, confusion matrix and report described the wrong model.
pred3 = new_nb.predict(x_test)
# `accuracy1` keeps its original name; note that it overwrites the TF-IDF
# Naive Bayes accuracy stored under the same name earlier.
accuracy1 = accuracy_score(y_test, pred3)
print(f"Accuracy: {accuracy1 * 100:.2f}%")

In [None]:
# Confusion matrix for the predictions stored in `pred3`.
cf4 = confusion_matrix(y_test, pred3)
label_name = unique_decoded_languages

fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(
    cf4,
    annot=True,
    fmt="d",
    cmap="gist_ncar",
    linewidths=3,
    linecolor='black',
    xticklabels=label_name,
    yticklabels=label_name,
    ax=ax,
)
ax.set_title("Confusion Matrix", fontsize=20, color="red")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# Per-language precision / recall / F1 for the predictions stored in `pred3`.
report = classification_report(y_test, pred3, target_names=label_name)
print(report)

In [None]:
# Create the 20x15 figure first and pass its axes to the visualizer, so the
# curves are drawn at the requested size. Opening the figure after score()
# produced an extra blank figure instead.
fig, ax = plt.subplots(figsize=(20, 15))

# A fresh, unfitted Naive Bayes model for the visualizer to train on the count features.
new_mo = MultinomialNB()
visualizer = ROCAUC(new_mo, classes=label_name, ax=ax)

visualizer.fit(x_train, y_train)    # train on the count-feature training split
visualizer.score(x_test, y_test)    # evaluate on the test split and draw the curves
visualizer.show()