# **Language Identifier Using Neural Networks**

## **Data Loading and Preprocessing**

In [7]:
# imports
import os
import re
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.regularizers import l2
import json
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix


### Load and merge datasets

In [8]:
# Root folder of the MasakhaNEWS data (one sub-folder per language code).
# Set the MASAKHANEWS_DIR environment variable to run the notebook on another
# machine; the original local path is kept as the fallback.
dataset_path = os.environ.get(
    "MASAKHANEWS_DIR",
    r"C:\Users\user\Language-ID\data\Language_ID\masakhanews",
)
# Merged per-split CSVs live in the "processed" sub-folder of the dataset root.
# Derived from dataset_path so the two locations cannot drift apart.
processed_path = os.path.join(dataset_path, "processed")

# Target languages:
#   'amh' for Amharic, 'ibo' for Igbo
#   'orm' for Oromo,   'pcm' for Nigerian Pidgin
#   'run' for Rundi,   'sna' for chiShona
# The position of each code in this list is used as its class index later on.
target_languages = ["amh", "ibo", "orm", "pcm", "run", "sna"]

def load_and_merge(split):
  """
  Load every target language's CSV for one split and merge them into one frame.

  For each code in `target_languages` the file `<dataset_path>/<lang>/<split>.csv`
  is read and a 'lang' column holding the language code is added.

  Args:
    split: split name used as the CSV file stem ("train", "test", "validation").

  Returns:
    A single DataFrame with all found languages concatenated and a fresh index.

  Raises:
    ValueError: if no CSV for this split exists for any target language.
  """
  import warnings  # local import so this cell does not depend on an edited import cell

  frames = []
  missing = []

  for lang in target_languages:
    file_path = os.path.join(dataset_path, lang, f"{split}.csv")
    if not os.path.exists(file_path):
      missing.append(file_path)
      continue
    frames.append(pd.read_csv(file_path).assign(lang=lang))

  if not frames:
    # pd.concat([]) would raise a ValueError too, but without saying which files were expected.
    raise ValueError(
      f"No '{split}' CSV found for any of {target_languages} under {dataset_path!r}"
    )

  if missing:
    # A silently skipped language would leave a class without samples, so make it visible.
    warnings.warn(f"Missing '{split}' files, languages skipped: {missing}")

  return pd.concat(frames, ignore_index=True)

# load datasets
train_data = load_and_merge("train")
test_data = load_and_merge("test")
val_data = load_and_merge("validation")

# to_csv does not create missing folders, so make sure the target exists first
os.makedirs(processed_path, exist_ok=True)

# save datasets to /processed
train_data.to_csv(os.path.join(processed_path, "train.csv"), index=False)
test_data.to_csv(os.path.join(processed_path, "test.csv"), index=False)
val_data.to_csv(os.path.join(processed_path, "validation.csv"), index=False)

print("Datasets merged and saved!")


Datasets merged and saved!


In [9]:
# Quick look at the merged training split; the location comes from processed_path
# instead of repeating the absolute path a second time.
df = pd.read_csv(os.path.join(processed_path, "train.csv"))
print(df.head())

   label                                          headline  \
0      5  የስፖርት ኮከቦች እና የንግድ ምልክቶቻቸው- ከቦልት እስከ ክርስቲያኖ ሮናልዶ   
1      5         እግር ኳስ፡ ዩናይትድ፣ አርሴናል፣ ቼልሲ . . . ምን አስበዋል?   
2      0              ዓለምን ካስጨነቃት የዋጋ ንረት ተጠቃሚዎቹ እነማን ናቸው?   
3      2         ኮሮናቫይረስ፡ በቫይረሱ የሞቱት የሮማኒያው ከንቲባ በምርጫ አሸነፉ   
4      2       ኮሮናቫይረስ፡ አውሮፕላኖች እንዴት ነው በፀረ- ተህዋሲያን የሚፀዱት?   

                                                text  \
0  የአትሌቲክሱ ዓለም ኮከብ እና ፈጣኑ ሰው ዩሴን ቦልት ከውድድር በፊት እና...   
1  የስፖርት ጋዜጦች ስለ እግር ኳስ ምን እያሉ ነው? በሚቀጥለው ጥር የሚከፈ...   
2  ከኮሮናቫይረስ ወረርሽኝ ተጽእኖ ሳያገግም የዩክሬን እና ሩሲያ ጦርነት የገ...   
3  በኮሮናቫይረስ የሞቱት የሮማኒያው ከንቲባ በቅርቡ የተደረገውን ምርጫ በከፍ...   
4  የኮሮናቫይረስ ወረርሽኝ መከሰቱን ተከትሎ  ቀጥ ብሎ የነበረውን የአለም የ...   

                                       headline_text  \
0  የስፖርት ኮከቦች እና የንግድ ምልክቶቻቸው- ከቦልት እስከ ክርስቲያኖ ሮና...   
1  እግር ኳስ፡ ዩናይትድ፣ አርሴናል፣ ቼልሲ . . . ምን አስበዋል? የስፖር...   
2  ዓለምን ካስጨነቃት የዋጋ ንረት ተጠቃሚዎቹ እነማን ናቸው? ከኮሮናቫይረስ ...   
3  ኮሮናቫይረስ፡ በቫይረሱ የሞቱት የሮማኒያው ከንቲባ በምርጫ አሸነፉ በኮሮና...   
4  ኮሮናቫይረስ

### Data Preprocessing

In [10]:
def clean_text(text):
  """
  Normalise one raw text sample before tokenisation.

  Steps, in order: lowercase, drop URLs (http(s)://... and www....),
  drop HTML-like tags (anything between '<' and the next '>'), strip
  surrounding whitespace.

  Args:
    text: the raw sample. None and float NaN are treated as missing;
      any other non-string value is converted with str().

  Returns:
    The cleaned string ("" for missing input).
  """
  # NaN is the only float that is not equal to itself
  if text is None or (isinstance(text, float) and text != text):
    return ""
  if not isinstance(text, str):
    text = str(text)

  # convert into lowercase
  text = text.lower()

  # remove urls
  text = re.sub(r'https?://\S+|www\.\S+', '', text)

  # remove html tags (the result used to be stored in an unused variable,
  # so tags were never actually removed)
  text = re.sub(r'<.*?>', '', text)
  return text.strip()

def load_and_preprocess(processed_path, target_languages, max_length_cap=500):
  """
  Load the merged train/validation/test CSVs and turn them into model inputs.

  Only the 'text' and 'lang' columns are read. Rows with missing values, or with
  a language that is not in `target_languages`, are dropped (an unknown language
  would otherwise become a NaN class index). Text is cleaned with `clean_text`,
  encoded with a character-level tokenizer fitted on the training texts only,
  and padded/truncated to a common length.

  Args:
    processed_path: folder containing train.csv, validation.csv and test.csv.
    target_languages: language codes; a code's position is its class index.
    max_length_cap: upper bound for the sequence length (default 500).

  Returns:
    (X_train, y_train), (X_val, y_val), (X_test, y_test), tokenizer,
    max_length, num_classes
    where the X arrays are padded integer sequences and the y arrays are
    one-hot encoded labels.

  Raises:
    ValueError: if the training split has no usable rows.
  """
  language_to_index = {lang: i for i, lang in enumerate(target_languages)}
  num_classes = len(target_languages)

  def _load_split(split):
    """Read one split, keep rows with a known language, clean the text column."""
    df = pd.read_csv(os.path.join(processed_path, f"{split}.csv"), usecols=["text", "lang"]).dropna()
    df = df[df["lang"].isin(list(target_languages))]
    return df.assign(text=df["text"].apply(clean_text))

  train_df = _load_split("train")
  val_df = _load_split("validation")
  test_df = _load_split("test")

  if train_df.empty:
    raise ValueError(f"No usable training rows found in {processed_path!r}")

  # fit the tokenizer on training data only so val/test characters stay unseen
  all_train_texts = train_df['text'].tolist()
  tokenizer = Tokenizer(char_level=True, oov_token='<unk>')
  tokenizer.fit_on_texts(all_train_texts)

  # longest training text, capped so very long articles do not blow up the input size
  max_length = min(max(len(text) for text in all_train_texts), max_length_cap)

  def _encode(df):
    """Convert one split into (padded sequences, one-hot labels)."""
    sequences = tokenizer.texts_to_sequences(df['text'].tolist())
    X = pad_sequences(sequences, maxlen=max_length)
    y = to_categorical(df['lang'].map(language_to_index).to_numpy(), num_classes=num_classes)
    return X, y

  return _encode(train_df), _encode(val_df), _encode(test_df), tokenizer, max_length, num_classes



## **Model Architecture & Training**

In [15]:

def build_cnn_model(vocab_size, max_length, num_classes, embedding_dim=64):
    """
    Assemble and compile the character-level CNN language classifier.

    The network is: embedding -> three same-padded width-3 convolutions
    (64, 128, 128 filters, L2-regularised) with max pooling after the first two
    and global max pooling after the third -> dense(64) -> dropout -> softmax.

    Args:
        vocab_size: Number of entries in the character vocabulary
        max_length: Length of the padded input sequences
        num_classes: Number of languages to predict
        embedding_dim: Size of each character embedding vector

    Returns:
        Compiled Keras model
    """
    def conv_block(filters):
        # width-3 convolution shared by all three conv stages
        return Conv1D(filters, 3, activation='relu', padding='same', kernel_regularizer=l2(0.01))

    # Layers are created in network order, front to back.
    stack = [
        # one extra embedding row for the padding index
        Embedding(input_dim=vocab_size + 1,
                  output_dim=embedding_dim,
                  input_length=max_length),
        conv_block(64),
        MaxPooling1D(pool_size=2),
        conv_block(128),
        MaxPooling1D(pool_size=2),
        conv_block(128),
        GlobalMaxPooling1D(),
        Dense(64, activation='relu'),
        Dropout(0.6),
        Dense(num_classes, activation='softmax'),
    ]

    classifier = Sequential(stack)
    classifier.compile(optimizer='adamw',
                       loss='categorical_crossentropy',
                       metrics=['accuracy'])
    return classifier


def train_model(model, X_train, y_train, X_val, y_val, batch_size=64, epochs=10, patience=3):
    """
    Fit the model, stopping early when validation accuracy stops improving.

    Args:
        model: Compiled Keras model
        X_train, y_train: Training inputs and one-hot labels
        X_val, y_val: Validation inputs and one-hot labels
        batch_size: Samples per gradient step
        epochs: Maximum number of epochs
        patience: Epochs without val_accuracy improvement before stopping

    Returns:
        (model, history): the model (best weights restored by the callback)
        and the object returned by model.fit
    """
    stopper = tf.keras.callbacks.EarlyStopping(
        restore_best_weights=True,
        patience=patience,
        monitor='val_accuracy',
    )

    fit_options = dict(
        validation_data=(X_val, y_val),
        batch_size=batch_size,
        epochs=epochs,
        callbacks=[stopper],
        verbose=1,
    )
    training_log = model.fit(X_train, y_train, **fit_options)

    return model, training_log


In [16]:
# Seed the Python, NumPy and TensorFlow random generators before the model is built,
# so weight initialisation, dropout and batch shuffling repeat across
# "Restart & Run All" (as far as the hardware backend allows).
SEED = 42
tf.keras.utils.set_random_seed(SEED)

(X_train, y_train), (X_val, y_val), (X_test, y_test), tokenizer, max_length, num_classes = load_and_preprocess(processed_path, target_languages)

# number of entries in the fitted character vocabulary; build_cnn_model adds
# one more embedding row for the padding index
vocab_size = len(tokenizer.word_index)
model = build_cnn_model(vocab_size, max_length, num_classes)

model, history = train_model(model, X_train, y_train, X_val, y_val, batch_size=64, epochs=10, patience=3)


Epoch 1/10




[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 186ms/step - accuracy: 0.2705 - loss: 3.1345 - val_accuracy: 0.5971 - val_loss: 0.9371
Epoch 2/10
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 176ms/step - accuracy: 0.5817 - loss: 0.9866 - val_accuracy: 0.7471 - val_loss: 0.7007
Epoch 3/10
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 157ms/step - accuracy: 0.6672 - loss: 0.7957 - val_accuracy: 0.7048 - val_loss: 0.6529
Epoch 4/10
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 159ms/step - accuracy: 0.7444 - loss: 0.6671 - val_accuracy: 0.8442 - val_loss: 0.5067
Epoch 5/10
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 162ms/step - accuracy: 0.7930 - loss: 0.6148 - val_accuracy: 0.8913 - val_loss: 0.4491
Epoch 6/10
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 165ms/step - accuracy: 0.8468 - loss: 0.5207 - val_accuracy: 0.8885 - val_loss: 0.4070
Epoch 7/10
[1m114/11

### Model Evaluation

In [17]:
def evaluate_model(model, X_test, y_test, language_codes):
    """
    Evaluate the model and generate overall and per-language metrics.

    Args:
        model: Trained Keras model (only its predict method is used)
        X_test: Test data
        y_test: One-hot encoded test labels
        language_codes: List of language codes in the same order as labels

    Returns:
        (metrics, cm):
            metrics - dict with "accuracy" and, per language code,
                      precision / recall / f1_score / support
            cm      - confusion matrix with one row and one column per entry of
                      language_codes (rows = true class, columns = predicted)
    """
    # Fix the class set explicitly so every metric below covers all languages,
    # even one that never occurs in y_true or y_pred.
    label_ids = list(range(len(language_codes)))

    # Make predictions
    y_pred_proba = model.predict(X_test)
    y_pred = np.argmax(y_pred_proba, axis=1)
    y_true = np.argmax(y_test, axis=1)

    # Calculate accuracy
    accuracy = accuracy_score(y_true, y_pred)

    # Precision, recall and F1 for each language; an undefined score is reported as 0
    precision, recall, f1, support = precision_recall_fscore_support(
        y_true, y_pred, average=None, labels=label_ids, zero_division=0
    )

    # Without labels= the matrix would only cover classes present in the data and
    # would no longer line up with language_codes when it is plotted.
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)

    # Prepare metrics dictionary (plain Python numbers so it can be dumped to JSON)
    metrics = {
        "accuracy": float(accuracy),
        "language_metrics": {
            lang: {
                "precision": float(precision[i]),
                "recall": float(recall[i]),
                "f1_score": float(f1[i]),
                "support": int(support[i]),
            }
            for i, lang in enumerate(language_codes)
        },
    }

    return metrics, cm

def save_metrics(metrics, output_dir="results"):
    """Write the metrics dict to <output_dir>/evaluation_metrics.json (4-space indented)."""
    os.makedirs(output_dir, exist_ok=True)

    target_file = os.path.join(output_dir, "evaluation_metrics.json")
    serialized = json.dumps(metrics, indent=4)
    with open(target_file, "w") as handle:
        handle.write(serialized)

def plot_confusion_matrix(cm, language_codes, output_dir="results"):
    """Draw the confusion matrix as an annotated heatmap and save it as confusion_matrix.png."""
    os.makedirs(output_dir, exist_ok=True)

    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=language_codes,
        yticklabels=language_codes,
        ax=ax,
    )
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    ax.set_title("Confusion Matrix")

    fig.tight_layout()
    fig.savefig(os.path.join(output_dir, "confusion_matrix.png"))
    # release the figure so it is not kept open (or shown inline) after saving
    plt.close(fig)

def plot_training_history(history, output_dir="results"):
    """Plot train/validation accuracy and loss side by side and save training_history.png."""
    os.makedirs(output_dir, exist_ok=True)

    curves = history.history
    # (train key, validation key, panel title, y-axis label) for each panel, left to right
    panels = (
        ('accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
        ('loss', 'val_loss', 'Model Loss', 'Loss'),
    )

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    for ax, (train_key, val_key, title, y_label) in zip(axes, panels):
        ax.plot(curves[train_key])
        ax.plot(curves[val_key])
        ax.set_title(title)
        ax.set_ylabel(y_label)
        ax.set_xlabel('Epoch')
        ax.legend(['Train', 'Validation'], loc='upper left')

    fig.tight_layout()
    fig.savefig(os.path.join(output_dir, "training_history.png"))
    plt.close(fig)

In [18]:
# X_test / y_test, the tokenizer and the label order come from the training cell.
# Reusing them (instead of re-reading the CSVs from a second hard-coded path)
# guarantees the model is evaluated on exactly the preprocessing it was trained with.
language_codes = list(target_languages)

# Evaluate the model
metrics, cm = evaluate_model(model, X_test, y_test, language_codes)

# Save metrics to a JSON file
save_metrics(metrics, output_dir="results")

# Plot and save confusion matrix
plot_confusion_matrix(cm, language_codes, output_dir="results")

# Plot and save training history
plot_training_history(history, output_dir="results")

# Print evaluation results
print(json.dumps(metrics, indent=4))



[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step
{
    "accuracy": 0.9702922855773838,
    "language_metrics": {
        "amh": {
            "precision": 1.0,
            "recall": 0.9973404255319149,
            "f1_score": 0.9986684420772304,
            "support": 376
        },
        "ibo": {
            "precision": 0.9974293059125964,
            "recall": 0.9948717948717949,
            "f1_score": 0.9961489088575096,
            "support": 390
        },
        "orm": {
            "precision": 0.9587301587301588,
            "recall": 0.9292307692307692,
            "f1_score": 0.94375,
            "support": 325
        },
        "pcm": {
            "precision": 0.9404388714733543,
            "recall": 0.9836065573770492,
            "f1_score": 0.9615384615384616,
            "support": 305
        },
        "run": {
            "precision": 0.9161676646706587,
            "recall": 0.9503105590062112,
            "f1_score": 0.93292682926829

#### more evaluation

In [19]:
# Get misclassified samples
misclassified = X_test[np.argmax(y_test, axis=1) != np.argmax(model.predict(X_test), axis=1)]
misclassified_labels = np.argmax(y_test, axis=1)[np.argmax(y_test, axis=1) != np.argmax(model.predict(X_test), axis=1)]
misclassified_preds = np.argmax(model.predict(X_test), axis=1)[np.argmax(y_test, axis=1) != np.argmax(model.predict(X_test), axis=1)]

# Create a DataFrame for easy analysis
df = pd.DataFrame({
    "Text": tokenizer.sequences_to_texts(misclassified),
    "True Label": [language_codes[i] for i in misclassified_labels],
    "Predicted Label": [language_codes[i] for i in misclassified_preds]
})

df.head(20)  # Show first 20 misclassified examples


[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step


Unnamed: 0,Text,True Label,Predicted Label
0,በ ብ ቃ ት የ ማ ይ መ ረ ቱ ም ር ቶ ች ላ ይ ከ ው ...,amh,ibo
1,<unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk...,ibo,pcm
2,u . n ' o t u a a t l e t i c o m a d ...,ibo,run
3,r a 1 9 7 0 k a n d h a l a t e d m x ...,orm,run
4,h a n n a a n d u b a r t i i s a n a w ...,orm,run
5,’ t b a c k d o w n ’ h a y y a m a ...,orm,run
6,o d a a a n e - a k k a d a n s a a d ...,orm,run
7,r o o w a l i i n m e e t i r a 2 a k ...,orm,run
8,s a g a l e e n h a t a m e e r a j e c h ...,orm,run
9,"a n i i f "" j e d h a n . s h o r o r k e ...",orm,run


#### Save model

In [20]:

def save_model(model, tokenizer, max_length, language_codes, output_dir="models"):
    """
    Persist everything needed to reuse the classifier later.

    Three files are written into output_dir (created if missing):
    language_identification_model.h5, tokenizer.json and metadata.json.

    Args:
        model: Trained Keras model (its save method is called with the .h5 path)
        tokenizer: Fitted tokenizer (its to_json output is stored verbatim)
        max_length: Maximum sequence length
        language_codes: List of language codes
        output_dir: Directory to save model
    """
    os.makedirs(output_dir, exist_ok=True)

    def in_output(filename):
        # path of a file inside the output directory
        return os.path.join(output_dir, filename)

    # model weights + architecture
    model_path = in_output("language_identification_model.h5")
    model.save(model_path)

    # tokenizer vocabulary
    with open(in_output("tokenizer.json"), "w") as tokenizer_file:
        tokenizer_file.write(tokenizer.to_json())

    # remaining settings needed to rebuild inputs and decode predictions
    with open(in_output("metadata.json"), "w") as metadata_file:
        json.dump(
            {"max_length": max_length, "language_codes": language_codes},
            metadata_file,
            indent=4,
        )

    print(f"Model saved to {model_path}")
    print(f"Tokenizer and metadata saved to {output_dir}")

In [21]:
# Persist the trained model together with its tokenizer and metadata.
save_model(
    model=model,
    tokenizer=tokenizer,
    max_length=max_length,
    language_codes=language_codes,
    output_dir="models",
)



Model saved to models\language_identification_model.h5
Tokenizer and metadata saved to models
