<a href="https://colab.research.google.com/github/its3alih/Thesis/blob/main/GRU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## FIRST

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.utils import to_categorical

# 1. Load Excel Data
def load_excel_data(file_path, sentence_end=('.', '؟')):
    """Load a token/tag table and group its rows into sentences.

    The file is read with ``pd.read_csv`` when its name ends in ``.csv``
    (compared case-insensitively) and with ``pd.read_excel`` otherwise. Only
    the columns ``'Word i'`` and ``'Word i entity tag'`` are used, and rows
    with a missing value in either column are dropped.

    Args:
        file_path: Path of the table; a ``str`` or any path-like object.
        sentence_end: Tokens that close the current sentence. The default is
            the full stop and the Arabic question mark. Terminator tokens are
            not included in the returned sentences.

    Returns:
        A ``(sentences, labels)`` pair of parallel lists. Each sentence is a
        list of stripped word strings and each label list holds the stripped
        tag strings for the same positions.
    """
    # str() lets path-like objects through; lower() accepts ".CSV" as well.
    if str(file_path).lower().endswith('.csv'):
        df = pd.read_csv(file_path)
    else:
        df = pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    terminators = frozenset(sentence_end)
    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        token = str(word).strip()
        if token in terminators:
            # Consecutive terminators must not produce empty sentences.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(token)
            label.append(str(tag).strip())

    # The table may end without a terminator; keep the trailing sentence.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Build vocab
def build_vocab(sentences, tags):
    """Build the word and tag index tables for a tagged corpus.

    Args:
        sentences: Iterable of sentences, each an iterable of word strings.
        tags: Iterable of tag sequences, each an iterable of tag strings.

    Returns:
        ``(word2idx, tag2idx, idx2tag)``. ``word2idx`` reserves id 0 for
        ``'<PAD>'`` and id 1 for ``'<UNK>'``; other words get ids in order of
        first appearance. ``tag2idx`` numbers tags in order of first
        appearance and always contains ``'O'``. ``idx2tag`` is its inverse.
    """
    word2idx = {'<PAD>': 0, '<UNK>': 1}
    tag2idx = {}
    for sent in sentences:
        for word in sent:
            if word not in word2idx:
                word2idx[word] = len(word2idx)
    for tag_seq in tags:
        for tag in tag_seq:
            if tag not in tag2idx:
                tag2idx[tag] = len(tag2idx)
    # encode() labels padded positions with tag2idx['O'], so the entry must
    # exist even when no token in the data carries that tag.
    if 'O' not in tag2idx:
        tag2idx['O'] = len(tag2idx)
    idx2tag = {i: t for t, i in tag2idx.items()}
    return word2idx, tag2idx, idx2tag

# 3. Encode and pad
def encode(sentences, tags, word2idx, tag2idx, max_len=50):
    """Turn sentences and tag sequences into fixed-width id arrays.

    Words missing from ``word2idx`` map to the ``'<UNK>'`` id. Rows shorter
    than ``max_len`` are right-padded with the ``'<PAD>'`` word id and the
    ``'O'`` tag id; longer rows are cut to ``max_len``.

    Returns:
        ``(X, y)`` NumPy arrays built from the word-id rows and tag-id rows.
    """
    word_rows, tag_rows = [], []
    for tokens, token_tags in zip(sentences, tags):
        row_words = [word2idx.get(tok, word2idx['<UNK>']) for tok in tokens]
        row_tags = [tag2idx[name] for name in token_tags]
        shortfall = max_len - len(row_words)
        if shortfall > 0:
            # Padded positions share the id of the real 'O' tag.
            row_words.extend([word2idx['<PAD>']] * shortfall)
            row_tags.extend([tag2idx['O']] * shortfall)
        word_rows.append(row_words[:max_len])
        tag_rows.append(row_tags[:max_len])
    return np.array(word_rows), np.array(tag_rows)

# 4. Build GRU model
def build_model(vocab_size, tag_size, max_len):
    """Assemble and compile the bidirectional GRU tagger.

    The network is: integer input of length ``max_len`` -> 64-dim embedding
    -> bidirectional GRU (64 units per direction, full sequence output) ->
    per-timestep softmax over ``tag_size`` classes. It is compiled with Adam,
    categorical cross-entropy and accuracy.

    The embedding is created without ``mask_zero``, so padded timesteps are
    not masked.
    """
    layers = tf.keras.layers
    token_ids = tf.keras.Input(shape=(max_len,))
    embedded = layers.Embedding(input_dim=vocab_size, output_dim=64)(token_ids)
    recurrent = layers.GRU(64, return_sequences=True)
    contextual = layers.Bidirectional(recurrent)(embedded)
    classifier = layers.Dense(tag_size, activation='softmax')
    tag_probs = layers.TimeDistributed(classifier)(contextual)
    model = tf.keras.Model(token_ids, tag_probs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model



# 5. Load the corpus, index it, and hold out 20% of the sentences for testing
file_path = "/content/IO.xlsx"  # Update this path as needed
sentences, tags = load_excel_data(file_path)
word2idx, tag2idx, idx2tag = build_vocab(sentences, tags)
X, y = encode(sentences, tags, word2idx, tag2idx)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 6. Build the tagger and fit it on one-hot targets
n_tags = len(tag2idx)
model = build_model(len(word2idx), n_tags, max_len=50)
y_train_cat = to_categorical(y_train, num_classes=n_tags)
y_test_cat = to_categorical(y_test, num_classes=n_tags)
model.fit(X_train, y_train_cat, batch_size=32, epochs=5, validation_split=0.1)

# 7. Most probable tag id per position on the held-out sentences
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=-1)
y_true_labels = y_test

# 8. Keep non-padding positions only (row-major order) and map ids to tag strings
is_token = X_test != word2idx['<PAD>']
y_pred_flat = [idx2tag[tag_id] for tag_id in y_pred_labels[is_token]]
y_true_flat = [idx2tag[tag_id] for tag_id in y_true_labels[is_token]]

# 9. Per-tag report followed by support-weighted summary scores
print("Classification Report:")
print(classification_report(y_true_flat, y_pred_flat))

print("Evaluation Results:")
accuracy = accuracy_score(y_true_flat, y_pred_flat)
print(f"Accuracy: {accuracy:.4f}")
precision = precision_score(y_true_flat, y_pred_flat, average='weighted')
print(f"Precision: {precision:.4f}")
recall = recall_score(y_true_flat, y_pred_flat, average='weighted')
print(f"Recall: {recall:.4f}")
f1 = f1_score(y_true_flat, y_pred_flat, average='weighted')
print(f"F1 Score: {f1:.4f}")


Epoch 1/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 204ms/step - accuracy: 0.9405 - loss: 0.1728 - val_accuracy: 0.9946 - val_loss: 0.0198
Epoch 2/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 98ms/step - accuracy: 0.9954 - loss: 0.0166 - val_accuracy: 0.9975 - val_loss: 0.0106
Epoch 3/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 99ms/step - accuracy: 0.9979 - loss: 0.0084 - val_accuracy: 0.9977 - val_loss: 0.0072
Epoch 4/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 80ms/step - accuracy: 0.9987 - loss: 0.0049 - val_accuracy: 0.9985 - val_loss: 0.0058
Epoch 5/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 81ms/step - accuracy: 0.9989 - loss: 0.0036 - val_accuracy: 0.9984 - val_loss: 0.0044
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 51ms/step
Classification Report:
              precision    recall  f1-score   support

           I       0.96      0.95      0.96

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.utils import to_categorical

# 1. Load Excel Data
def load_excel_data(file_path):
    """Read a token/tag table and split it into sentences.

    Paths ending in ``.csv`` are read with ``pd.read_csv``; anything else is
    read with ``pd.read_excel``. Only the ``'Word i'`` and
    ``'Word i entity tag'`` columns are used, and rows with a missing value in
    either are dropped. A ``'.'`` or ``'؟'`` token closes the current sentence
    and is not kept.

    Returns:
        ``(sentences, labels)``: parallel lists of stripped word lists and
        stripped tag lists.
    """
    reader = pd.read_csv if file_path.endswith('.csv') else pd.read_excel
    table = reader(file_path)
    table = table[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    current_words, current_tags = [], []
    for raw_word, raw_tag in zip(table['Word i'], table['Word i entity tag']):
        token = str(raw_word).strip()
        if token not in ('.', '؟'):
            current_words.append(token)
            current_tags.append(str(raw_tag).strip())
            continue
        # Terminator: flush the sentence unless nothing has been collected.
        if current_words:
            sentences.append(current_words)
            labels.append(current_tags)
            current_words, current_tags = [], []

    # Keep a trailing sentence that has no closing terminator.
    if current_words:
        sentences.append(current_words)
        labels.append(current_tags)

    return sentences, labels

# 2. Build vocab
def build_vocab(sentences, tags):
    """Create the word and tag lookup tables.

    Returns:
        ``(word2idx, tag2idx, idx2tag)``. Word ids start after the reserved
        ``'<PAD>'`` (0) and ``'<UNK>'`` (1) entries. Tags are numbered in
        order of first appearance, ``'O'`` is appended if the data never used
        it, and ``idx2tag`` inverts ``tag2idx``.
    """
    word2idx = {'<PAD>': 0, '<UNK>': 1}
    for tokens in sentences:
        for token in tokens:
            # setdefault only inserts when the token is new.
            word2idx.setdefault(token, len(word2idx))

    tag2idx = {}
    for sequence in tags:
        for name in sequence:
            tag2idx.setdefault(name, len(tag2idx))
    # 'O' is the tag used for padded positions, so it must always have an id.
    tag2idx.setdefault('O', len(tag2idx))

    idx2tag = {index: name for name, index in tag2idx.items()}
    return word2idx, tag2idx, idx2tag

# 3. Encode and pad
def encode(sentences, tags, word2idx, tag2idx, max_len=50):
    """Turn sentences and tag sequences into fixed-width id arrays.

    Words missing from ``word2idx`` map to the ``'<UNK>'`` id. Rows shorter
    than ``max_len`` are right-padded with the ``'<PAD>'`` word id and the
    ``'O'`` tag id; longer rows are cut to ``max_len``.

    Returns:
        ``(X, y)`` NumPy arrays built from the word-id rows and tag-id rows.
    """
    word_rows, tag_rows = [], []
    for tokens, token_tags in zip(sentences, tags):
        row_words = [word2idx.get(tok, word2idx['<UNK>']) for tok in tokens]
        row_tags = [tag2idx[name] for name in token_tags]
        shortfall = max_len - len(row_words)
        if shortfall > 0:
            # Padded positions share the id of the real 'O' tag.
            row_words.extend([word2idx['<PAD>']] * shortfall)
            row_tags.extend([tag2idx['O']] * shortfall)
        word_rows.append(row_words[:max_len])
        tag_rows.append(row_tags[:max_len])
    return np.array(word_rows), np.array(tag_rows)

# 4. Build GRU model
def build_model(vocab_size, tag_size, max_len):
    """Assemble and compile the bidirectional GRU tagger.

    The network is: integer input of length ``max_len`` -> 64-dim embedding
    -> bidirectional GRU (64 units per direction, full sequence output) ->
    per-timestep softmax over ``tag_size`` classes. It is compiled with Adam,
    categorical cross-entropy and accuracy.

    The embedding is created without ``mask_zero``, so padded timesteps are
    not masked.
    """
    layers = tf.keras.layers
    token_ids = tf.keras.Input(shape=(max_len,))
    embedded = layers.Embedding(input_dim=vocab_size, output_dim=64)(token_ids)
    recurrent = layers.GRU(64, return_sequences=True)
    contextual = layers.Bidirectional(recurrent)(embedded)
    classifier = layers.Dense(tag_size, activation='softmax')
    tag_probs = layers.TimeDistributed(classifier)(contextual)
    model = tf.keras.Model(token_ids, tag_probs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# 5. Load the corpus, index it, and hold out 20% of the sentences for testing
file_path = "/content/BI.xlsx"  # Update this path as needed
sentences, tags = load_excel_data(file_path)
word2idx, tag2idx, idx2tag = build_vocab(sentences, tags)
X, y = encode(sentences, tags, word2idx, tag2idx)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 6. Build the tagger and fit it on one-hot targets
n_tags = len(tag2idx)
model = build_model(len(word2idx), n_tags, max_len=50)
y_train_cat = to_categorical(y_train, num_classes=n_tags)
y_test_cat = to_categorical(y_test, num_classes=n_tags)
model.fit(X_train, y_train_cat, batch_size=32, epochs=5, validation_split=0.1)

# 7. Most probable tag id per position on the held-out sentences
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=-1)
y_true_labels = y_test

# 8. Keep non-padding positions only (row-major order) and map ids to tag strings
is_token = X_test != word2idx['<PAD>']
y_pred_flat = [idx2tag[tag_id] for tag_id in y_pred_labels[is_token]]
y_true_flat = [idx2tag[tag_id] for tag_id in y_true_labels[is_token]]

# 9. Per-tag report followed by support-weighted summary scores
print("Classification Report:")
print(classification_report(y_true_flat, y_pred_flat))

print("Evaluation Results:")
accuracy = accuracy_score(y_true_flat, y_pred_flat)
print(f"Accuracy: {accuracy:.4f}")
precision = precision_score(y_true_flat, y_pred_flat, average='weighted')
print(f"Precision: {precision:.4f}")
recall = recall_score(y_true_flat, y_pred_flat, average='weighted')
print(f"Recall: {recall:.4f}")
f1 = f1_score(y_true_flat, y_pred_flat, average='weighted')
print(f"F1 Score: {f1:.4f}")


Epoch 1/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 97ms/step - accuracy: 0.8374 - loss: 0.5172 - val_accuracy: 0.9809 - val_loss: 0.0653
Epoch 2/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 96ms/step - accuracy: 0.9863 - loss: 0.0526 - val_accuracy: 0.9943 - val_loss: 0.0253
Epoch 3/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 101ms/step - accuracy: 0.9955 - loss: 0.0193 - val_accuracy: 0.9965 - val_loss: 0.0151
Epoch 4/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 86ms/step - accuracy: 0.9971 - loss: 0.0122 - val_accuracy: 0.9973 - val_loss: 0.0101
Epoch 5/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 83ms/step - accuracy: 0.9976 - loss: 0.0093 - val_accuracy: 0.9982 - val_loss: 0.0075
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 68ms/step
Classification Report:
              precision    recall  f1-score   support

           B       0.96      0.92      0.94 

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.utils import to_categorical

# 1. Load Excel Data
def load_excel_data(file_path):
    """Read a token/tag table and split it into sentences.

    Paths ending in ``.csv`` are read with ``pd.read_csv``; anything else is
    read with ``pd.read_excel``. Only the ``'Word i'`` and
    ``'Word i entity tag'`` columns are used, and rows with a missing value in
    either are dropped. A ``'.'`` or ``'؟'`` token closes the current sentence
    and is not kept.

    Returns:
        ``(sentences, labels)``: parallel lists of stripped word lists and
        stripped tag lists.
    """
    reader = pd.read_csv if file_path.endswith('.csv') else pd.read_excel
    table = reader(file_path)
    table = table[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    current_words, current_tags = [], []
    for raw_word, raw_tag in zip(table['Word i'], table['Word i entity tag']):
        token = str(raw_word).strip()
        if token not in ('.', '؟'):
            current_words.append(token)
            current_tags.append(str(raw_tag).strip())
            continue
        # Terminator: flush the sentence unless nothing has been collected.
        if current_words:
            sentences.append(current_words)
            labels.append(current_tags)
            current_words, current_tags = [], []

    # Keep a trailing sentence that has no closing terminator.
    if current_words:
        sentences.append(current_words)
        labels.append(current_tags)

    return sentences, labels
# 2. Build vocab
def build_vocab(sentences, tags):
    """Create the word and tag lookup tables.

    Returns:
        ``(word2idx, tag2idx, idx2tag)``. Word ids start after the reserved
        ``'<PAD>'`` (0) and ``'<UNK>'`` (1) entries. Tags are numbered in
        order of first appearance, ``'O'`` is appended if the data never used
        it, and ``idx2tag`` inverts ``tag2idx``.
    """
    word2idx = {'<PAD>': 0, '<UNK>': 1}
    for tokens in sentences:
        for token in tokens:
            # setdefault only inserts when the token is new.
            word2idx.setdefault(token, len(word2idx))

    tag2idx = {}
    for sequence in tags:
        for name in sequence:
            tag2idx.setdefault(name, len(tag2idx))
    # 'O' is the tag used for padded positions, so it must always have an id.
    tag2idx.setdefault('O', len(tag2idx))

    idx2tag = {index: name for name, index in tag2idx.items()}
    return word2idx, tag2idx, idx2tag

# 3. Encode and pad
def encode(sentences, tags, word2idx, tag2idx, max_len=50):
    """Turn sentences and tag sequences into fixed-width id arrays.

    Words missing from ``word2idx`` map to the ``'<UNK>'`` id. Rows shorter
    than ``max_len`` are right-padded with the ``'<PAD>'`` word id and the
    ``'O'`` tag id; longer rows are cut to ``max_len``.

    Returns:
        ``(X, y)`` NumPy arrays built from the word-id rows and tag-id rows.
    """
    word_rows, tag_rows = [], []
    for tokens, token_tags in zip(sentences, tags):
        row_words = [word2idx.get(tok, word2idx['<UNK>']) for tok in tokens]
        row_tags = [tag2idx[name] for name in token_tags]
        shortfall = max_len - len(row_words)
        if shortfall > 0:
            # Padded positions share the id of the real 'O' tag.
            row_words.extend([word2idx['<PAD>']] * shortfall)
            row_tags.extend([tag2idx['O']] * shortfall)
        word_rows.append(row_words[:max_len])
        tag_rows.append(row_tags[:max_len])
    return np.array(word_rows), np.array(tag_rows)

# 4. Build GRU model
def build_model(vocab_size, tag_size, max_len):
    """Assemble and compile the bidirectional GRU tagger.

    The network is: integer input of length ``max_len`` -> 64-dim embedding
    -> bidirectional GRU (64 units per direction, full sequence output) ->
    per-timestep softmax over ``tag_size`` classes. It is compiled with Adam,
    categorical cross-entropy and accuracy.

    The embedding is created without ``mask_zero``, so padded timesteps are
    not masked.
    """
    layers = tf.keras.layers
    token_ids = tf.keras.Input(shape=(max_len,))
    embedded = layers.Embedding(input_dim=vocab_size, output_dim=64)(token_ids)
    recurrent = layers.GRU(64, return_sequences=True)
    contextual = layers.Bidirectional(recurrent)(embedded)
    classifier = layers.Dense(tag_size, activation='softmax')
    tag_probs = layers.TimeDistributed(classifier)(contextual)
    model = tf.keras.Model(token_ids, tag_probs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# 5. Load and preprocess data
file_path = "/content/BIES.xlsx"  # Update this path to your BIES-tagged file
# One constant for the padded length, so encode() and build_model() cannot drift apart.
MAX_LEN = 50

sentences, tags = load_excel_data(file_path)
# NOTE(review): the vocabulary is built from all sentences before the split,
# so every test word already has an id and '<UNK>' is never seen in training.
word2idx, tag2idx, idx2tag = build_vocab(sentences, tags)
X, y = encode(sentences, tags, word2idx, tag2idx, max_len=MAX_LEN)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Train model
model = build_model(len(word2idx), len(tag2idx), max_len=MAX_LEN)
y_train_cat = to_categorical(y_train, num_classes=len(tag2idx))
y_test_cat = to_categorical(y_test, num_classes=len(tag2idx))  # not used below; kept so the name stays defined
# NOTE(review): the accuracy Keras prints during fit() includes padded
# positions; the report below excludes them.
model.fit(X_train, y_train_cat, batch_size=32, epochs=5, validation_split=0.1)

# 7. Predict and evaluate
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=-1)
y_true_labels = y_test

# 8. Flatten and convert to tag strings
# Boolean-mask indexing walks the arrays in row-major order, i.e. sentence by
# sentence and position by position, skipping '<PAD>' slots.
non_pad = X_test != word2idx['<PAD>']
y_pred_flat = [idx2tag[tag_id] for tag_id in y_pred_labels[non_pad]]
y_true_flat = [idx2tag[tag_id] for tag_id in y_true_labels[non_pad]]

# 9. Report
# zero_division=0 scores an undefined precision/recall as 0, which is what the
# default "warn" setting does too, but without one warning per metric call.
print("Classification Report:")
print(classification_report(y_true_flat, y_pred_flat, zero_division=0))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_true_flat, y_pred_flat):.4f}")
print(f"Precision: {precision_score(y_true_flat, y_pred_flat, average='weighted', zero_division=0):.4f}")
print(f"Recall: {recall_score(y_true_flat, y_pred_flat, average='weighted', zero_division=0):.4f}")
print(f"F1 Score: {f1_score(y_true_flat, y_pred_flat, average='weighted', zero_division=0):.4f}")


Epoch 1/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 120ms/step - accuracy: 0.8021 - loss: 0.7768 - val_accuracy: 0.9751 - val_loss: 0.0973
Epoch 2/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 89ms/step - accuracy: 0.9808 - loss: 0.0782 - val_accuracy: 0.9894 - val_loss: 0.0472
Epoch 3/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 90ms/step - accuracy: 0.9900 - loss: 0.0430 - val_accuracy: 0.9936 - val_loss: 0.0290
Epoch 4/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 103ms/step - accuracy: 0.9938 - loss: 0.0259 - val_accuracy: 0.9951 - val_loss: 0.0203
Epoch 5/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 103ms/step - accuracy: 0.9957 - loss: 0.0164 - val_accuracy: 0.9965 - val_loss: 0.0151
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 51ms/step
Classification Report:
              precision    recall  f1-score   support

           B       0.95      0.95      0.9

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.utils import to_categorical

# 1. Load Excel Data
def load_excel_data(file_path):
    """Read a token/tag table and split it into sentences.

    Paths ending in ``.csv`` are read with ``pd.read_csv``; anything else is
    read with ``pd.read_excel``. Only the ``'Word i'`` and
    ``'Word i entity tag'`` columns are used, and rows with a missing value in
    either are dropped. A ``'.'`` or ``'؟'`` token closes the current sentence
    and is not kept.

    Returns:
        ``(sentences, labels)``: parallel lists of stripped word lists and
        stripped tag lists.
    """
    reader = pd.read_csv if file_path.endswith('.csv') else pd.read_excel
    table = reader(file_path)
    table = table[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    current_words, current_tags = [], []
    for raw_word, raw_tag in zip(table['Word i'], table['Word i entity tag']):
        token = str(raw_word).strip()
        if token not in ('.', '؟'):
            current_words.append(token)
            current_tags.append(str(raw_tag).strip())
            continue
        # Terminator: flush the sentence unless nothing has been collected.
        if current_words:
            sentences.append(current_words)
            labels.append(current_tags)
            current_words, current_tags = [], []

    # Keep a trailing sentence that has no closing terminator.
    if current_words:
        sentences.append(current_words)
        labels.append(current_tags)

    return sentences, labels

# 2. Build vocab
def build_vocab(sentences, tags):
    """Create the word and tag lookup tables.

    Returns:
        ``(word2idx, tag2idx, idx2tag)``. Word ids start after the reserved
        ``'<PAD>'`` (0) and ``'<UNK>'`` (1) entries. Tags are numbered in
        order of first appearance, ``'O'`` is appended if the data never used
        it, and ``idx2tag`` inverts ``tag2idx``.
    """
    word2idx = {'<PAD>': 0, '<UNK>': 1}
    for tokens in sentences:
        for token in tokens:
            # setdefault only inserts when the token is new.
            word2idx.setdefault(token, len(word2idx))

    tag2idx = {}
    for sequence in tags:
        for name in sequence:
            tag2idx.setdefault(name, len(tag2idx))
    # 'O' is the tag used for padded positions, so it must always have an id.
    tag2idx.setdefault('O', len(tag2idx))

    idx2tag = {index: name for name, index in tag2idx.items()}
    return word2idx, tag2idx, idx2tag

# 3. Encode and pad
def encode(sentences, tags, word2idx, tag2idx, max_len=50):
    """Turn sentences and tag sequences into fixed-width id arrays.

    Words missing from ``word2idx`` map to the ``'<UNK>'`` id. Rows shorter
    than ``max_len`` are right-padded with the ``'<PAD>'`` word id and the
    ``'O'`` tag id; longer rows are cut to ``max_len``.

    Returns:
        ``(X, y)`` NumPy arrays built from the word-id rows and tag-id rows.
    """
    word_rows, tag_rows = [], []
    for tokens, token_tags in zip(sentences, tags):
        row_words = [word2idx.get(tok, word2idx['<UNK>']) for tok in tokens]
        row_tags = [tag2idx[name] for name in token_tags]
        shortfall = max_len - len(row_words)
        if shortfall > 0:
            # Padded positions share the id of the real 'O' tag.
            row_words.extend([word2idx['<PAD>']] * shortfall)
            row_tags.extend([tag2idx['O']] * shortfall)
        word_rows.append(row_words[:max_len])
        tag_rows.append(row_tags[:max_len])
    return np.array(word_rows), np.array(tag_rows)

# 4. Build GRU model
def build_model(vocab_size, tag_size, max_len):
    """Assemble and compile the bidirectional GRU tagger.

    The network is: integer input of length ``max_len`` -> 64-dim embedding
    -> bidirectional GRU (64 units per direction, full sequence output) ->
    per-timestep softmax over ``tag_size`` classes. It is compiled with Adam,
    categorical cross-entropy and accuracy.

    The embedding is created without ``mask_zero``, so padded timesteps are
    not masked.
    """
    layers = tf.keras.layers
    token_ids = tf.keras.Input(shape=(max_len,))
    embedded = layers.Embedding(input_dim=vocab_size, output_dim=64)(token_ids)
    recurrent = layers.GRU(64, return_sequences=True)
    contextual = layers.Bidirectional(recurrent)(embedded)
    classifier = layers.Dense(tag_size, activation='softmax')
    tag_probs = layers.TimeDistributed(classifier)(contextual)
    model = tf.keras.Model(token_ids, tag_probs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# 5. Load the corpus, index it, and hold out 20% of the sentences for testing
file_path = "/content/IE.xlsx"  # Update this path to your IE-tagged file
sentences, tags = load_excel_data(file_path)
word2idx, tag2idx, idx2tag = build_vocab(sentences, tags)
X, y = encode(sentences, tags, word2idx, tag2idx)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 6. Build the tagger and fit it on one-hot targets
n_tags = len(tag2idx)
model = build_model(len(word2idx), n_tags, max_len=50)
y_train_cat = to_categorical(y_train, num_classes=n_tags)
y_test_cat = to_categorical(y_test, num_classes=n_tags)
model.fit(X_train, y_train_cat, batch_size=32, epochs=5, validation_split=0.1)

# 7. Most probable tag id per position on the held-out sentences
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=-1)
y_true_labels = y_test

# 8. Keep non-padding positions only (row-major order) and map ids to tag strings
is_token = X_test != word2idx['<PAD>']
y_pred_flat = [idx2tag[tag_id] for tag_id in y_pred_labels[is_token]]
y_true_flat = [idx2tag[tag_id] for tag_id in y_true_labels[is_token]]

# 9. Per-tag report followed by support-weighted summary scores
print("Classification Report:")
print(classification_report(y_true_flat, y_pred_flat))

print("Evaluation Results:")
accuracy = accuracy_score(y_true_flat, y_pred_flat)
print(f"Accuracy: {accuracy:.4f}")
precision = precision_score(y_true_flat, y_pred_flat, average='weighted')
print(f"Precision: {precision:.4f}")
recall = recall_score(y_true_flat, y_pred_flat, average='weighted')
print(f"Recall: {recall:.4f}")
f1 = f1_score(y_true_flat, y_pred_flat, average='weighted')
print(f"F1 Score: {f1:.4f}")


Epoch 1/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 103ms/step - accuracy: 0.8519 - loss: 0.5325 - val_accuracy: 0.9829 - val_loss: 0.0658
Epoch 2/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 89ms/step - accuracy: 0.9853 - loss: 0.0539 - val_accuracy: 0.9927 - val_loss: 0.0300
Epoch 3/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 99ms/step - accuracy: 0.9939 - loss: 0.0253 - val_accuracy: 0.9952 - val_loss: 0.0184
Epoch 4/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 99ms/step - accuracy: 0.9961 - loss: 0.0144 - val_accuracy: 0.9969 - val_loss: 0.0122
Epoch 5/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 81ms/step - accuracy: 0.9980 - loss: 0.0082 - val_accuracy: 0.9978 - val_loss: 0.0081
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 51ms/step
Classification Report:
              precision    recall  f1-score   support

           E       0.92      0.93      0.93 

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.utils import to_categorical

# 1. Load Excel Data
def load_excel_data(file_path):
    """Read a token/tag table and split it into sentences.

    Paths ending in ``.csv`` are read with ``pd.read_csv``; anything else is
    read with ``pd.read_excel``. Only the ``'Word i'`` and
    ``'Word i entity tag'`` columns are used, and rows with a missing value in
    either are dropped. A ``'.'`` or ``'؟'`` token closes the current sentence
    and is not kept.

    Returns:
        ``(sentences, labels)``: parallel lists of stripped word lists and
        stripped tag lists.
    """
    reader = pd.read_csv if file_path.endswith('.csv') else pd.read_excel
    table = reader(file_path)
    table = table[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    current_words, current_tags = [], []
    for raw_word, raw_tag in zip(table['Word i'], table['Word i entity tag']):
        token = str(raw_word).strip()
        if token not in ('.', '؟'):
            current_words.append(token)
            current_tags.append(str(raw_tag).strip())
            continue
        # Terminator: flush the sentence unless nothing has been collected.
        if current_words:
            sentences.append(current_words)
            labels.append(current_tags)
            current_words, current_tags = [], []

    # Keep a trailing sentence that has no closing terminator.
    if current_words:
        sentences.append(current_words)
        labels.append(current_tags)

    return sentences, labels

# 2. Build vocab
def build_vocab(sentences, tags):
    """Create the word and tag lookup tables.

    Returns:
        ``(word2idx, tag2idx, idx2tag)``. Word ids start after the reserved
        ``'<PAD>'`` (0) and ``'<UNK>'`` (1) entries. Tags are numbered in
        order of first appearance, ``'O'`` is appended if the data never used
        it, and ``idx2tag`` inverts ``tag2idx``.
    """
    word2idx = {'<PAD>': 0, '<UNK>': 1}
    for tokens in sentences:
        for token in tokens:
            # setdefault only inserts when the token is new.
            word2idx.setdefault(token, len(word2idx))

    tag2idx = {}
    for sequence in tags:
        for name in sequence:
            tag2idx.setdefault(name, len(tag2idx))
    # 'O' is the tag used for padded positions, so it must always have an id.
    tag2idx.setdefault('O', len(tag2idx))

    idx2tag = {index: name for name, index in tag2idx.items()}
    return word2idx, tag2idx, idx2tag

# 3. Encode and pad
def encode(sentences, tags, word2idx, tag2idx, max_len=50):
    """Turn sentences and tag sequences into fixed-width id arrays.

    Words missing from ``word2idx`` map to the ``'<UNK>'`` id. Rows shorter
    than ``max_len`` are right-padded with the ``'<PAD>'`` word id and the
    ``'O'`` tag id; longer rows are cut to ``max_len``.

    Returns:
        ``(X, y)`` NumPy arrays built from the word-id rows and tag-id rows.
    """
    word_rows, tag_rows = [], []
    for tokens, token_tags in zip(sentences, tags):
        row_words = [word2idx.get(tok, word2idx['<UNK>']) for tok in tokens]
        row_tags = [tag2idx[name] for name in token_tags]
        shortfall = max_len - len(row_words)
        if shortfall > 0:
            # Padded positions share the id of the real 'O' tag.
            row_words.extend([word2idx['<PAD>']] * shortfall)
            row_tags.extend([tag2idx['O']] * shortfall)
        word_rows.append(row_words[:max_len])
        tag_rows.append(row_tags[:max_len])
    return np.array(word_rows), np.array(tag_rows)

# 4. Build GRU model
def build_model(vocab_size, tag_size, max_len):
    """Assemble and compile the bidirectional GRU tagger.

    The network is: integer input of length ``max_len`` -> 64-dim embedding
    -> bidirectional GRU (64 units per direction, full sequence output) ->
    per-timestep softmax over ``tag_size`` classes. It is compiled with Adam,
    categorical cross-entropy and accuracy.

    The embedding is created without ``mask_zero``, so padded timesteps are
    not masked.
    """
    layers = tf.keras.layers
    token_ids = tf.keras.Input(shape=(max_len,))
    embedded = layers.Embedding(input_dim=vocab_size, output_dim=64)(token_ids)
    recurrent = layers.GRU(64, return_sequences=True)
    contextual = layers.Bidirectional(recurrent)(embedded)
    classifier = layers.Dense(tag_size, activation='softmax')
    tag_probs = layers.TimeDistributed(classifier)(contextual)
    model = tf.keras.Model(token_ids, tag_probs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# 5. Load the corpus, index it, and hold out 20% of the sentences for testing
file_path = "/content/IOB.xlsx"  # Update this path to your IOB-tagged file
sentences, tags = load_excel_data(file_path)
word2idx, tag2idx, idx2tag = build_vocab(sentences, tags)
X, y = encode(sentences, tags, word2idx, tag2idx)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 6. Build the tagger and fit it on one-hot targets
n_tags = len(tag2idx)
model = build_model(len(word2idx), n_tags, max_len=50)
y_train_cat = to_categorical(y_train, num_classes=n_tags)
y_test_cat = to_categorical(y_test, num_classes=n_tags)
model.fit(X_train, y_train_cat, batch_size=32, epochs=5, validation_split=0.1)

# 7. Most probable tag id per position on the held-out sentences
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=-1)
y_true_labels = y_test

# 8. Keep non-padding positions only (row-major order) and map ids to tag strings
is_token = X_test != word2idx['<PAD>']
y_pred_flat = [idx2tag[tag_id] for tag_id in y_pred_labels[is_token]]
y_true_flat = [idx2tag[tag_id] for tag_id in y_true_labels[is_token]]

# 9. Per-tag report followed by support-weighted summary scores
print("Classification Report:")
print(classification_report(y_true_flat, y_pred_flat))

print("Evaluation Results:")
accuracy = accuracy_score(y_true_flat, y_pred_flat)
print(f"Accuracy: {accuracy:.4f}")
precision = precision_score(y_true_flat, y_pred_flat, average='weighted')
print(f"Precision: {precision:.4f}")
recall = recall_score(y_true_flat, y_pred_flat, average='weighted')
print(f"Recall: {recall:.4f}")
f1 = f1_score(y_true_flat, y_pred_flat, average='weighted')
print(f"F1 Score: {f1:.4f}")


Epoch 1/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 112ms/step - accuracy: 0.9305 - loss: 0.2910 - val_accuracy: 0.9915 - val_loss: 0.0304
Epoch 2/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 137ms/step - accuracy: 0.9945 - loss: 0.0258 - val_accuracy: 0.9963 - val_loss: 0.0143
Epoch 3/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 108ms/step - accuracy: 0.9972 - loss: 0.0115 - val_accuracy: 0.9979 - val_loss: 0.0086
Epoch 4/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 109ms/step - accuracy: 0.9983 - loss: 0.0062 - val_accuracy: 0.9985 - val_loss: 0.0068
Epoch 5/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 97ms/step - accuracy: 0.9988 - loss: 0.0043 - val_accuracy: 0.9987 - val_loss: 0.0048
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 57ms/step
Classification Report:
              precision    recall  f1-score   support

           B       0.97      0.96      0

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.utils import to_categorical

# 1. Load Excel Data
def load_excel_data(file_path):
    """Read a token/tag table and split it into sentences.

    Paths ending in ``.csv`` are read with ``pd.read_csv``; anything else is
    read with ``pd.read_excel``. Only the ``'Word i'`` and
    ``'Word i entity tag'`` columns are used, and rows with a missing value in
    either are dropped. A ``'.'`` or ``'؟'`` token closes the current sentence
    and is not kept.

    Returns:
        ``(sentences, labels)``: parallel lists of stripped word lists and
        stripped tag lists.
    """
    reader = pd.read_csv if file_path.endswith('.csv') else pd.read_excel
    table = reader(file_path)
    table = table[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    current_words, current_tags = [], []
    for raw_word, raw_tag in zip(table['Word i'], table['Word i entity tag']):
        token = str(raw_word).strip()
        if token not in ('.', '؟'):
            current_words.append(token)
            current_tags.append(str(raw_tag).strip())
            continue
        # Terminator: flush the sentence unless nothing has been collected.
        if current_words:
            sentences.append(current_words)
            labels.append(current_tags)
            current_words, current_tags = [], []

    # Keep a trailing sentence that has no closing terminator.
    if current_words:
        sentences.append(current_words)
        labels.append(current_tags)

    return sentences, labels

# 2. Build vocab
def build_vocab(sentences, tags):
    """Create the word and tag lookup tables.

    Returns:
        ``(word2idx, tag2idx, idx2tag)``. Word ids start after the reserved
        ``'<PAD>'`` (0) and ``'<UNK>'`` (1) entries. Tags are numbered in
        order of first appearance, ``'O'`` is appended if the data never used
        it, and ``idx2tag`` inverts ``tag2idx``.
    """
    word2idx = {'<PAD>': 0, '<UNK>': 1}
    for tokens in sentences:
        for token in tokens:
            # setdefault only inserts when the token is new.
            word2idx.setdefault(token, len(word2idx))

    tag2idx = {}
    for sequence in tags:
        for name in sequence:
            tag2idx.setdefault(name, len(tag2idx))
    # 'O' is the tag used for padded positions, so it must always have an id.
    tag2idx.setdefault('O', len(tag2idx))

    idx2tag = {index: name for name, index in tag2idx.items()}
    return word2idx, tag2idx, idx2tag

# 3. Encode and pad
def encode(sentences, tags, word2idx, tag2idx, max_len=50):
    """Turn sentences and tag sequences into fixed-width id arrays.

    Words missing from ``word2idx`` map to the ``'<UNK>'`` id. Rows shorter
    than ``max_len`` are right-padded with the ``'<PAD>'`` word id and the
    ``'O'`` tag id; longer rows are cut to ``max_len``.

    Returns:
        ``(X, y)`` NumPy arrays built from the word-id rows and tag-id rows.
    """
    word_rows, tag_rows = [], []
    for tokens, token_tags in zip(sentences, tags):
        row_words = [word2idx.get(tok, word2idx['<UNK>']) for tok in tokens]
        row_tags = [tag2idx[name] for name in token_tags]
        shortfall = max_len - len(row_words)
        if shortfall > 0:
            # Padded positions share the id of the real 'O' tag.
            row_words.extend([word2idx['<PAD>']] * shortfall)
            row_tags.extend([tag2idx['O']] * shortfall)
        word_rows.append(row_words[:max_len])
        tag_rows.append(row_tags[:max_len])
    return np.array(word_rows), np.array(tag_rows)

# 4. Build GRU model
def build_model(vocab_size, tag_size, max_len):
    """Assemble and compile the bidirectional GRU tagger.

    The network is: integer input of length ``max_len`` -> 64-dim embedding
    -> bidirectional GRU (64 units per direction, full sequence output) ->
    per-timestep softmax over ``tag_size`` classes. It is compiled with Adam,
    categorical cross-entropy and accuracy.

    The embedding is created without ``mask_zero``, so padded timesteps are
    not masked.
    """
    layers = tf.keras.layers
    token_ids = tf.keras.Input(shape=(max_len,))
    embedded = layers.Embedding(input_dim=vocab_size, output_dim=64)(token_ids)
    recurrent = layers.GRU(64, return_sequences=True)
    contextual = layers.Bidirectional(recurrent)(embedded)
    classifier = layers.Dense(tag_size, activation='softmax')
    tag_probs = layers.TimeDistributed(classifier)(contextual)
    model = tf.keras.Model(token_ids, tag_probs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# 5. Load and preprocess data
file_path = "/content/IOBES.xlsx"  # Update this path to your IOBES-tagged file
# One constant for the padded length, so encode() and build_model() cannot drift apart.
MAX_LEN = 50

sentences, tags = load_excel_data(file_path)
# NOTE(review): the vocabulary is built from all sentences before the split,
# so every test word already has an id and '<UNK>' is never seen in training.
word2idx, tag2idx, idx2tag = build_vocab(sentences, tags)
X, y = encode(sentences, tags, word2idx, tag2idx, max_len=MAX_LEN)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Train model
model = build_model(len(word2idx), len(tag2idx), max_len=MAX_LEN)
y_train_cat = to_categorical(y_train, num_classes=len(tag2idx))
y_test_cat = to_categorical(y_test, num_classes=len(tag2idx))  # not used below; kept so the name stays defined
# NOTE(review): the accuracy Keras prints during fit() includes padded
# positions; the report below excludes them.
model.fit(X_train, y_train_cat, batch_size=32, epochs=5, validation_split=0.1)

# 7. Predict and evaluate
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=-1)
y_true_labels = y_test

# 8. Flatten and convert to tag strings
# Boolean-mask indexing walks the arrays in row-major order, i.e. sentence by
# sentence and position by position, skipping '<PAD>' slots.
non_pad = X_test != word2idx['<PAD>']
y_pred_flat = [idx2tag[tag_id] for tag_id in y_pred_labels[non_pad]]
y_true_flat = [idx2tag[tag_id] for tag_id in y_true_labels[non_pad]]

# 9. Report
# zero_division=0 scores an undefined precision/recall as 0, which is what the
# default "warn" setting does too, but without one warning per metric call.
print("Classification Report:")
print(classification_report(y_true_flat, y_pred_flat, zero_division=0))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_true_flat, y_pred_flat):.4f}")
print(f"Precision: {precision_score(y_true_flat, y_pred_flat, average='weighted', zero_division=0):.4f}")
print(f"Recall: {recall_score(y_true_flat, y_pred_flat, average='weighted', zero_division=0):.4f}")
print(f"F1 Score: {f1_score(y_true_flat, y_pred_flat, average='weighted', zero_division=0):.4f}")


Epoch 1/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 114ms/step - accuracy: 0.9271 - loss: 0.4654 - val_accuracy: 0.9897 - val_loss: 0.0423
Epoch 2/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 102ms/step - accuracy: 0.9912 - loss: 0.0347 - val_accuracy: 0.9956 - val_loss: 0.0206
Epoch 3/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 102ms/step - accuracy: 0.9961 - loss: 0.0165 - val_accuracy: 0.9966 - val_loss: 0.0130
Epoch 4/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 95ms/step - accuracy: 0.9967 - loss: 0.0113 - val_accuracy: 0.9968 - val_loss: 0.0096
Epoch 5/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 84ms/step - accuracy: 0.9975 - loss: 0.0083 - val_accuracy: 0.9981 - val_loss: 0.0071
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 51ms/step
Classification Report:
              precision    recall  f1-score   support

           B       0.95      0.92      0.9

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.utils import to_categorical

# 1. Load Excel Data
def load_excel_data(file_path):
    """Read a token/tag table and group its rows into sentences.

    The file must provide the columns ``'Word i'`` and
    ``'Word i entity tag'``; rows with a missing value in either column are
    dropped. A row whose word is ``'.'`` or ``'؟'`` closes the current
    sentence and is itself not kept as a token.

    Args:
        file_path: Path of the data file. A ``.csv`` extension (matched
            case-insensitively) is read with ``pd.read_csv``; anything else
            is handed to ``pd.read_excel``.

    Returns:
        A ``(sentences, labels)`` pair of parallel lists: each sentence is a
        list of stripped word strings and each label entry is the matching
        list of stripped tag strings.
    """
    # Compare the extension case-insensitively so "DATA.CSV" is not handed to
    # the Excel reader.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        token = str(word).strip()
        if token in ('.', '؟'):  # End of sentence
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(token)
            label.append(str(tag).strip())

    # Flush a trailing sentence that has no closing terminator.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Build vocab
def build_vocab(sentences, tags):
    """Create index lookups for words and tags.

    Words are numbered in order of first appearance after the reserved
    ``'<PAD>'`` (0) and ``'<UNK>'`` (1) entries. Tags are numbered from 0 in
    order of first appearance, and ``'O'`` is appended when no tag sequence
    contained it.

    Returns:
        ``(word2idx, tag2idx, idx2tag)`` where ``idx2tag`` inverts ``tag2idx``.
    """
    word2idx = {'<PAD>': 0, '<UNK>': 1}
    for tokens in sentences:
        for token in tokens:
            # len() is evaluated before insertion, so a new word receives the
            # next free index and a known word is left untouched.
            word2idx.setdefault(token, len(word2idx))

    tag2idx = {}
    for sequence in tags:
        for name in sequence:
            tag2idx.setdefault(name, len(tag2idx))
    tag2idx.setdefault('O', len(tag2idx))

    idx2tag = dict(zip(tag2idx.values(), tag2idx.keys()))
    return word2idx, tag2idx, idx2tag

# 3. Encode and pad
def encode(sentences, tags, word2idx, tag2idx, max_len=50):
    """Turn sentences and tag sequences into fixed-length id arrays.

    Words missing from ``word2idx`` map to the ``'<UNK>'`` id. Sequences
    shorter than ``max_len`` are right-padded with the ``'<PAD>'`` word id and
    the ``'O'`` tag id; longer ones are cut off at ``max_len``.

    Returns:
        ``(X, y)`` NumPy arrays built from the per-sentence id lists.
    """
    encoded_words, encoded_tags = [], []
    for tokens, tag_seq in zip(sentences, tags):
        token_ids = [word2idx.get(tok, word2idx['<UNK>']) for tok in tokens]
        tag_ids = [tag2idx[name] for name in tag_seq]
        # Both lists grow by the same amount, driven by the word count.
        shortfall = max_len - len(token_ids)
        if shortfall > 0:
            token_ids.extend([word2idx['<PAD>']] * shortfall)
            tag_ids.extend([tag2idx['O']] * shortfall)  # Default tag
        encoded_words.append(token_ids[:max_len])
        encoded_tags.append(tag_ids[:max_len])
    return np.array(encoded_words), np.array(encoded_tags)

# 4. Build GRU model
def build_model(vocab_size, tag_size, max_len):
    """Assemble and compile the bidirectional-GRU sequence tagger.

    Args:
        vocab_size: Number of rows of the embedding table.
        tag_size: Number of output classes per time step.
        max_len: Fixed length of the input id sequences.

    Returns:
        A compiled ``tf.keras.Model`` that emits a softmax over ``tag_size``
        classes at every position.
    """
    layers = tf.keras.layers
    token_ids = tf.keras.Input(shape=(max_len,))
    embedded = layers.Embedding(input_dim=vocab_size, output_dim=64)(token_ids)
    contextual = layers.Bidirectional(layers.GRU(64, return_sequences=True))(embedded)
    tag_probs = layers.TimeDistributed(layers.Dense(tag_size, activation='softmax'))(contextual)

    model = tf.keras.Model(token_ids, tag_probs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# 5. Load and preprocess data
file_path = "/content/IOE.xlsx"  # Update this path to your IOE-tagged file
max_len = 50  # Padded sequence length, shared by encode() and build_model()
sentences, tags = load_excel_data(file_path)
word2idx, tag2idx, idx2tag = build_vocab(sentences, tags)
X, y = encode(sentences, tags, word2idx, tag2idx, max_len=max_len)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Train model
# The model is compiled with categorical cross-entropy, so the integer tag ids
# are one-hot encoded before fitting.
num_tags = len(tag2idx)
model = build_model(len(word2idx), num_tags, max_len=max_len)
y_train_cat = to_categorical(y_train, num_classes=num_tags)
y_test_cat = to_categorical(y_test, num_classes=num_tags)  # Not used below in this cell
model.fit(X_train, y_train_cat, batch_size=32, epochs=5, validation_split=0.1)

# 7. Predict and evaluate
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=-1)
y_true_labels = y_test

# 8. Flatten and convert to tag strings
# Only real tokens are scored: the boolean mask drops every padded position,
# and boolean indexing yields the elements in row-major order.
real_token_mask = X_test != word2idx['<PAD>']
y_pred_flat = [idx2tag[k] for k in y_pred_labels[real_token_mask].tolist()]
y_true_flat = [idx2tag[k] for k in y_true_labels[real_token_mask].tolist()]

# 9. Report
# zero_division=0 reports the same 0.0 that sklearn's default "warn" mode uses
# for an undefined score, without emitting UndefinedMetricWarning.
print("Classification Report:")
print(classification_report(y_true_flat, y_pred_flat, zero_division=0))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_true_flat, y_pred_flat):.4f}")
print(f"Precision: {precision_score(y_true_flat, y_pred_flat, average='weighted', zero_division=0):.4f}")
print(f"Recall: {recall_score(y_true_flat, y_pred_flat, average='weighted', zero_division=0):.4f}")
print(f"F1 Score: {f1_score(y_true_flat, y_pred_flat, average='weighted', zero_division=0):.4f}")


Epoch 1/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 125ms/step - accuracy: 0.9330 - loss: 0.2993 - val_accuracy: 0.9906 - val_loss: 0.0319
Epoch 2/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 110ms/step - accuracy: 0.9927 - loss: 0.0263 - val_accuracy: 0.9957 - val_loss: 0.0164
Epoch 3/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 97ms/step - accuracy: 0.9960 - loss: 0.0136 - val_accuracy: 0.9970 - val_loss: 0.0106
Epoch 4/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 100ms/step - accuracy: 0.9973 - loss: 0.0089 - val_accuracy: 0.9981 - val_loss: 0.0073
Epoch 5/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 112ms/step - accuracy: 0.9981 - loss: 0.0065 - val_accuracy: 0.9986 - val_loss: 0.0053
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 57ms/step
Classification Report:
              precision    recall  f1-score   support

           E       0.95      0.93      0

##SECOND

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.utils import to_categorical

# 1. Load Excel Data
def load_excel_data(file_path):
    """Read a token/tag table and group its rows into sentences.

    The file must provide the columns ``'Word i'`` and
    ``'Word i entity tag'``; rows with a missing value in either column are
    dropped. A row whose word is ``'.'`` or ``'؟'`` closes the current
    sentence and is itself not kept as a token.

    Args:
        file_path: Path of the data file. A ``.csv`` extension (matched
            case-insensitively) is read with ``pd.read_csv``; anything else
            is handed to ``pd.read_excel``.

    Returns:
        A ``(sentences, labels)`` pair of parallel lists: each sentence is a
        list of stripped word strings and each label entry is the matching
        list of stripped tag strings.
    """
    # Compare the extension case-insensitively so "DATA.CSV" is not handed to
    # the Excel reader.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        token = str(word).strip()
        if token in ('.', '؟'):  # End of sentence
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(token)
            label.append(str(tag).strip())

    # Flush a trailing sentence that has no closing terminator.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Build vocab
def build_vocab(sentences, tags):
    """Create index lookups for words and tags.

    Words are numbered in order of first appearance after the reserved
    ``'<PAD>'`` (0) and ``'<UNK>'`` (1) entries. Tags are numbered from 0 in
    order of first appearance.

    ``'O'`` is always present in ``tag2idx``: ``encode`` labels padded
    positions with ``tag2idx['O']`` and would raise ``KeyError`` for a
    dataset that never uses that tag.

    Returns:
        ``(word2idx, tag2idx, idx2tag)`` where ``idx2tag`` inverts ``tag2idx``.
    """
    word2idx = {'<PAD>': 0, '<UNK>': 1}
    tag2idx = {}
    for sent in sentences:
        for word in sent:
            if word not in word2idx:
                word2idx[word] = len(word2idx)
    for tag_seq in tags:
        for tag in tag_seq:
            if tag not in tag2idx:
                tag2idx[tag] = len(tag2idx)
    # Guarantee the padding tag exists even when the data never contains it.
    if 'O' not in tag2idx:
        tag2idx['O'] = len(tag2idx)
    idx2tag = {i: t for t, i in tag2idx.items()}
    return word2idx, tag2idx, idx2tag

# 3. Encode and pad
def encode(sentences, tags, word2idx, tag2idx, max_len=50):
    """Turn sentences and tag sequences into fixed-length id arrays.

    Words missing from ``word2idx`` map to the ``'<UNK>'`` id. Sequences
    shorter than ``max_len`` are right-padded with the ``'<PAD>'`` word id and
    the ``'O'`` tag id; longer ones are cut off at ``max_len``.

    Returns:
        ``(X, y)`` NumPy arrays built from the per-sentence id lists.
    """
    encoded_words, encoded_tags = [], []
    for tokens, tag_seq in zip(sentences, tags):
        token_ids = [word2idx.get(tok, word2idx['<UNK>']) for tok in tokens]
        tag_ids = [tag2idx[name] for name in tag_seq]
        # Both lists grow by the same amount, driven by the word count.
        shortfall = max_len - len(token_ids)
        if shortfall > 0:
            token_ids.extend([word2idx['<PAD>']] * shortfall)
            tag_ids.extend([tag2idx['O']] * shortfall)  # Default tag
        encoded_words.append(token_ids[:max_len])
        encoded_tags.append(tag_ids[:max_len])
    return np.array(encoded_words), np.array(encoded_tags)

# 4. Build GRU model
def build_model(vocab_size, tag_size, max_len):
    """Assemble and compile the bidirectional-GRU sequence tagger.

    Args:
        vocab_size: Number of rows of the embedding table.
        tag_size: Number of output classes per time step.
        max_len: Fixed length of the input id sequences.

    Returns:
        A compiled ``tf.keras.Model`` that emits a softmax over ``tag_size``
        classes at every position.
    """
    layers = tf.keras.layers
    token_ids = tf.keras.Input(shape=(max_len,))
    # mask_zero=True was removed here, so padded positions are not masked.
    embedded = layers.Embedding(input_dim=vocab_size, output_dim=64)(token_ids)
    contextual = layers.Bidirectional(layers.GRU(64, return_sequences=True))(embedded)
    tag_probs = layers.TimeDistributed(layers.Dense(tag_size, activation='softmax'))(contextual)

    model = tf.keras.Model(token_ids, tag_probs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model



# 5. Load and preprocess data
file_path = "/content/IO.xlsx"  # Update this path as needed
max_len = 50  # Padded sequence length, shared by encode() and build_model()
sentences, tags = load_excel_data(file_path)
word2idx, tag2idx, idx2tag = build_vocab(sentences, tags)
X, y = encode(sentences, tags, word2idx, tag2idx, max_len=max_len)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Train model
# The model is compiled with categorical cross-entropy, so the integer tag ids
# are one-hot encoded before fitting.
num_tags = len(tag2idx)
model = build_model(len(word2idx), num_tags, max_len=max_len)
y_train_cat = to_categorical(y_train, num_classes=num_tags)
y_test_cat = to_categorical(y_test, num_classes=num_tags)  # Not used below in this cell
model.fit(X_train, y_train_cat, batch_size=32, epochs=5, validation_split=0.1)

# 7. Predict and evaluate
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=-1)
y_true_labels = y_test

# 8. Flatten and convert to tag strings
# Only real tokens are scored: the boolean mask drops every padded position,
# and boolean indexing yields the elements in row-major order.
real_token_mask = X_test != word2idx['<PAD>']
y_pred_flat = [idx2tag[k] for k in y_pred_labels[real_token_mask].tolist()]
y_true_flat = [idx2tag[k] for k in y_true_labels[real_token_mask].tolist()]

# 9. Report
# zero_division=0 reports the same 0.0 that sklearn's default "warn" mode uses
# for an undefined score, without emitting UndefinedMetricWarning.
print("Classification Report:")
print(classification_report(y_true_flat, y_pred_flat, zero_division=0))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_true_flat, y_pred_flat):.4f}")
print(f"Precision: {precision_score(y_true_flat, y_pred_flat, average='weighted', zero_division=0):.4f}")
print(f"Recall: {recall_score(y_true_flat, y_pred_flat, average='weighted', zero_division=0):.4f}")
print(f"F1 Score: {f1_score(y_true_flat, y_pred_flat, average='weighted', zero_division=0):.4f}")


Epoch 1/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 124ms/step - accuracy: 0.9376 - loss: 0.1799 - val_accuracy: 0.9956 - val_loss: 0.0184
Epoch 2/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 98ms/step - accuracy: 0.9957 - loss: 0.0168 - val_accuracy: 0.9976 - val_loss: 0.0099
Epoch 3/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 123ms/step - accuracy: 0.9977 - loss: 0.0086 - val_accuracy: 0.9980 - val_loss: 0.0068
Epoch 4/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 112ms/step - accuracy: 0.9985 - loss: 0.0052 - val_accuracy: 0.9983 - val_loss: 0.0051
Epoch 5/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 106ms/step - accuracy: 0.9989 - loss: 0.0040 - val_accuracy: 0.9987 - val_loss: 0.0043
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 54ms/step
Classification Report:
              precision    recall  f1-score   support

           I       0.98      0.94      0

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.utils import to_categorical

# 1. Load Excel Data
def load_excel_data(file_path):
    """Read a token/tag table and group its rows into sentences.

    The file must provide the columns ``'Word i'`` and
    ``'Word i entity tag'``; rows with a missing value in either column are
    dropped. A row whose word is ``'.'`` or ``'؟'`` closes the current
    sentence and is itself not kept as a token.

    Args:
        file_path: Path of the data file. A ``.csv`` extension (matched
            case-insensitively) is read with ``pd.read_csv``; anything else
            is handed to ``pd.read_excel``.

    Returns:
        A ``(sentences, labels)`` pair of parallel lists: each sentence is a
        list of stripped word strings and each label entry is the matching
        list of stripped tag strings.
    """
    # Compare the extension case-insensitively so "DATA.CSV" is not handed to
    # the Excel reader.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        token = str(word).strip()
        if token in ('.', '؟'):  # End of sentence
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(token)
            label.append(str(tag).strip())

    # Flush a trailing sentence that has no closing terminator.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Build vocab
def build_vocab(sentences, tags):
    """Create index lookups for words and tags.

    Words are numbered in order of first appearance after the reserved
    ``'<PAD>'`` (0) and ``'<UNK>'`` (1) entries. Tags are numbered from 0 in
    order of first appearance, and ``'O'`` is appended when no tag sequence
    contained it.

    Returns:
        ``(word2idx, tag2idx, idx2tag)`` where ``idx2tag`` inverts ``tag2idx``.
    """
    word2idx = {'<PAD>': 0, '<UNK>': 1}
    for tokens in sentences:
        for token in tokens:
            # len() is evaluated before insertion, so a new word receives the
            # next free index and a known word is left untouched.
            word2idx.setdefault(token, len(word2idx))

    tag2idx = {}
    for sequence in tags:
        for name in sequence:
            tag2idx.setdefault(name, len(tag2idx))
    tag2idx.setdefault('O', len(tag2idx))

    idx2tag = dict(zip(tag2idx.values(), tag2idx.keys()))
    return word2idx, tag2idx, idx2tag

# 3. Encode and pad
def encode(sentences, tags, word2idx, tag2idx, max_len=50):
    """Turn sentences and tag sequences into fixed-length id arrays.

    Words missing from ``word2idx`` map to the ``'<UNK>'`` id. Sequences
    shorter than ``max_len`` are right-padded with the ``'<PAD>'`` word id and
    the ``'O'`` tag id; longer ones are cut off at ``max_len``.

    Returns:
        ``(X, y)`` NumPy arrays built from the per-sentence id lists.
    """
    encoded_words, encoded_tags = [], []
    for tokens, tag_seq in zip(sentences, tags):
        token_ids = [word2idx.get(tok, word2idx['<UNK>']) for tok in tokens]
        tag_ids = [tag2idx[name] for name in tag_seq]
        # Both lists grow by the same amount, driven by the word count.
        shortfall = max_len - len(token_ids)
        if shortfall > 0:
            token_ids.extend([word2idx['<PAD>']] * shortfall)
            tag_ids.extend([tag2idx['O']] * shortfall)  # Default tag
        encoded_words.append(token_ids[:max_len])
        encoded_tags.append(tag_ids[:max_len])
    return np.array(encoded_words), np.array(encoded_tags)

# 4. Build GRU model
def build_model(vocab_size, tag_size, max_len):
    """Assemble and compile the bidirectional-GRU sequence tagger.

    Args:
        vocab_size: Number of rows of the embedding table.
        tag_size: Number of output classes per time step.
        max_len: Fixed length of the input id sequences.

    Returns:
        A compiled ``tf.keras.Model`` that emits a softmax over ``tag_size``
        classes at every position.
    """
    layers = tf.keras.layers
    token_ids = tf.keras.Input(shape=(max_len,))
    # No mask_zero: padded positions are not masked.
    embedded = layers.Embedding(input_dim=vocab_size, output_dim=64)(token_ids)
    contextual = layers.Bidirectional(layers.GRU(64, return_sequences=True))(embedded)
    tag_probs = layers.TimeDistributed(layers.Dense(tag_size, activation='softmax'))(contextual)

    model = tf.keras.Model(token_ids, tag_probs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# 5. Load and preprocess data
file_path = "/content/BI.xlsx"  # Update this path as needed
max_len = 50  # Padded sequence length, shared by encode() and build_model()
sentences, tags = load_excel_data(file_path)
word2idx, tag2idx, idx2tag = build_vocab(sentences, tags)
X, y = encode(sentences, tags, word2idx, tag2idx, max_len=max_len)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Train model
# The model is compiled with categorical cross-entropy, so the integer tag ids
# are one-hot encoded before fitting.
num_tags = len(tag2idx)
model = build_model(len(word2idx), num_tags, max_len=max_len)
y_train_cat = to_categorical(y_train, num_classes=num_tags)
y_test_cat = to_categorical(y_test, num_classes=num_tags)  # Not used below in this cell
model.fit(X_train, y_train_cat, batch_size=32, epochs=5, validation_split=0.1)

# 7. Predict and evaluate
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=-1)
y_true_labels = y_test

# 8. Flatten and convert to tag strings
# Only real tokens are scored: the boolean mask drops every padded position,
# and boolean indexing yields the elements in row-major order.
real_token_mask = X_test != word2idx['<PAD>']
y_pred_flat = [idx2tag[k] for k in y_pred_labels[real_token_mask].tolist()]
y_true_flat = [idx2tag[k] for k in y_true_labels[real_token_mask].tolist()]

# 9. Report
# zero_division=0 reports the same 0.0 that sklearn's default "warn" mode uses
# for an undefined score, without emitting UndefinedMetricWarning.
print("Classification Report:")
print(classification_report(y_true_flat, y_pred_flat, zero_division=0))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_true_flat, y_pred_flat):.4f}")
print(f"Precision: {precision_score(y_true_flat, y_pred_flat, average='weighted', zero_division=0):.4f}")
print(f"Recall: {recall_score(y_true_flat, y_pred_flat, average='weighted', zero_division=0):.4f}")
print(f"F1 Score: {f1_score(y_true_flat, y_pred_flat, average='weighted', zero_division=0):.4f}")


Epoch 1/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 108ms/step - accuracy: 0.8199 - loss: 0.5429 - val_accuracy: 0.9846 - val_loss: 0.0606
Epoch 2/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 101ms/step - accuracy: 0.9862 - loss: 0.0523 - val_accuracy: 0.9948 - val_loss: 0.0217
Epoch 3/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 101ms/step - accuracy: 0.9952 - loss: 0.0199 - val_accuracy: 0.9967 - val_loss: 0.0130
Epoch 4/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 86ms/step - accuracy: 0.9977 - loss: 0.0092 - val_accuracy: 0.9977 - val_loss: 0.0084
Epoch 5/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 93ms/step - accuracy: 0.9983 - loss: 0.0067 - val_accuracy: 0.9983 - val_loss: 0.0063
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 52ms/step
Classification Report:
              precision    recall  f1-score   support

           B       0.98      0.92      0.

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.utils import to_categorical

# 1. Load Excel Data
def load_excel_data(file_path):
    """Read a token/tag table and group its rows into sentences.

    The file must provide the columns ``'Word i'`` and
    ``'Word i entity tag'``; rows with a missing value in either column are
    dropped. A row whose word is ``'.'`` or ``'؟'`` closes the current
    sentence and is itself not kept as a token.

    Args:
        file_path: Path of the data file. A ``.csv`` extension (matched
            case-insensitively) is read with ``pd.read_csv``; anything else
            is handed to ``pd.read_excel``.

    Returns:
        A ``(sentences, labels)`` pair of parallel lists: each sentence is a
        list of stripped word strings and each label entry is the matching
        list of stripped tag strings.
    """
    # Compare the extension case-insensitively so "DATA.CSV" is not handed to
    # the Excel reader.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        token = str(word).strip()
        if token in ('.', '؟'):  # End of sentence
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(token)
            label.append(str(tag).strip())

    # Flush a trailing sentence that has no closing terminator.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels
# 2. Build vocab
def build_vocab(sentences, tags):
    """Create index lookups for words and tags.

    Words are numbered in order of first appearance after the reserved
    ``'<PAD>'`` (0) and ``'<UNK>'`` (1) entries. Tags are numbered from 0 in
    order of first appearance, and ``'O'`` is appended when no tag sequence
    contained it.

    Returns:
        ``(word2idx, tag2idx, idx2tag)`` where ``idx2tag`` inverts ``tag2idx``.
    """
    word2idx = {'<PAD>': 0, '<UNK>': 1}
    for tokens in sentences:
        for token in tokens:
            # len() is evaluated before insertion, so a new word receives the
            # next free index and a known word is left untouched.
            word2idx.setdefault(token, len(word2idx))

    tag2idx = {}
    for sequence in tags:
        for name in sequence:
            tag2idx.setdefault(name, len(tag2idx))
    tag2idx.setdefault('O', len(tag2idx))

    idx2tag = dict(zip(tag2idx.values(), tag2idx.keys()))
    return word2idx, tag2idx, idx2tag

# 3. Encode and pad
def encode(sentences, tags, word2idx, tag2idx, max_len=50):
    """Turn sentences and tag sequences into fixed-length id arrays.

    Words missing from ``word2idx`` map to the ``'<UNK>'`` id. Sequences
    shorter than ``max_len`` are right-padded with the ``'<PAD>'`` word id and
    the ``'O'`` tag id; longer ones are cut off at ``max_len``.

    Returns:
        ``(X, y)`` NumPy arrays built from the per-sentence id lists.
    """
    encoded_words, encoded_tags = [], []
    for tokens, tag_seq in zip(sentences, tags):
        token_ids = [word2idx.get(tok, word2idx['<UNK>']) for tok in tokens]
        tag_ids = [tag2idx[name] for name in tag_seq]
        # Both lists grow by the same amount, driven by the word count.
        shortfall = max_len - len(token_ids)
        if shortfall > 0:
            token_ids.extend([word2idx['<PAD>']] * shortfall)
            tag_ids.extend([tag2idx['O']] * shortfall)  # Default tag
        encoded_words.append(token_ids[:max_len])
        encoded_tags.append(tag_ids[:max_len])
    return np.array(encoded_words), np.array(encoded_tags)

# 4. Build GRU model
def build_model(vocab_size, tag_size, max_len):
    """Assemble and compile the bidirectional-GRU sequence tagger.

    Args:
        vocab_size: Number of rows of the embedding table.
        tag_size: Number of output classes per time step.
        max_len: Fixed length of the input id sequences.

    Returns:
        A compiled ``tf.keras.Model`` that emits a softmax over ``tag_size``
        classes at every position.
    """
    layers = tf.keras.layers
    token_ids = tf.keras.Input(shape=(max_len,))
    embedded = layers.Embedding(input_dim=vocab_size, output_dim=64)(token_ids)
    contextual = layers.Bidirectional(layers.GRU(64, return_sequences=True))(embedded)
    tag_probs = layers.TimeDistributed(layers.Dense(tag_size, activation='softmax'))(contextual)

    model = tf.keras.Model(token_ids, tag_probs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# 5. Load and preprocess data
file_path = "/content/BIES.xlsx"  # Update this path to your BIES-tagged file
max_len = 50  # Padded sequence length, shared by encode() and build_model()
sentences, tags = load_excel_data(file_path)
word2idx, tag2idx, idx2tag = build_vocab(sentences, tags)
X, y = encode(sentences, tags, word2idx, tag2idx, max_len=max_len)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Train model
# The model is compiled with categorical cross-entropy, so the integer tag ids
# are one-hot encoded before fitting.
num_tags = len(tag2idx)
model = build_model(len(word2idx), num_tags, max_len=max_len)
y_train_cat = to_categorical(y_train, num_classes=num_tags)
y_test_cat = to_categorical(y_test, num_classes=num_tags)  # Not used below in this cell
model.fit(X_train, y_train_cat, batch_size=32, epochs=5, validation_split=0.1)

# 7. Predict and evaluate
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=-1)
y_true_labels = y_test

# 8. Flatten and convert to tag strings
# Only real tokens are scored: the boolean mask drops every padded position,
# and boolean indexing yields the elements in row-major order.
real_token_mask = X_test != word2idx['<PAD>']
y_pred_flat = [idx2tag[k] for k in y_pred_labels[real_token_mask].tolist()]
y_true_flat = [idx2tag[k] for k in y_true_labels[real_token_mask].tolist()]

# 9. Report
# zero_division=0 reports the same 0.0 that sklearn's default "warn" mode uses
# for an undefined score, without emitting UndefinedMetricWarning.
print("Classification Report:")
print(classification_report(y_true_flat, y_pred_flat, zero_division=0))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_true_flat, y_pred_flat):.4f}")
print(f"Precision: {precision_score(y_true_flat, y_pred_flat, average='weighted', zero_division=0):.4f}")
print(f"Recall: {recall_score(y_true_flat, y_pred_flat, average='weighted', zero_division=0):.4f}")
print(f"F1 Score: {f1_score(y_true_flat, y_pred_flat, average='weighted', zero_division=0):.4f}")


Epoch 1/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 151ms/step - accuracy: 0.8041 - loss: 0.7826 - val_accuracy: 0.9746 - val_loss: 0.1034
Epoch 2/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 98ms/step - accuracy: 0.9775 - loss: 0.0908 - val_accuracy: 0.9894 - val_loss: 0.0426
Epoch 3/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 119ms/step - accuracy: 0.9910 - loss: 0.0392 - val_accuracy: 0.9944 - val_loss: 0.0243
Epoch 4/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 119ms/step - accuracy: 0.9942 - loss: 0.0219 - val_accuracy: 0.9958 - val_loss: 0.0174
Epoch 5/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 139ms/step - accuracy: 0.9968 - loss: 0.0126 - val_accuracy: 0.9971 - val_loss: 0.0123
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 53ms/step
Classification Report:
              precision    recall  f1-score   support

           B       0.97      0.95      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.utils import to_categorical

# 1. Load Excel Data
def load_excel_data(file_path):
    """Read a token/tag table and group its rows into sentences.

    The file must provide the columns ``'Word i'`` and
    ``'Word i entity tag'``; rows with a missing value in either column are
    dropped. A row whose word is ``'.'`` or ``'؟'`` closes the current
    sentence and is itself not kept as a token.

    Args:
        file_path: Path of the data file. A ``.csv`` extension (matched
            case-insensitively) is read with ``pd.read_csv``; anything else
            is handed to ``pd.read_excel``.

    Returns:
        A ``(sentences, labels)`` pair of parallel lists: each sentence is a
        list of stripped word strings and each label entry is the matching
        list of stripped tag strings.
    """
    # Compare the extension case-insensitively so "DATA.CSV" is not handed to
    # the Excel reader.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        token = str(word).strip()
        if token in ('.', '؟'):  # End of sentence
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(token)
            label.append(str(tag).strip())

    # Flush a trailing sentence that has no closing terminator.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Build vocab
def build_vocab(sentences, tags):
    """Create index lookups for words and tags.

    Words are numbered in order of first appearance after the reserved
    ``'<PAD>'`` (0) and ``'<UNK>'`` (1) entries. Tags are numbered from 0 in
    order of first appearance, and ``'O'`` is appended when no tag sequence
    contained it.

    Returns:
        ``(word2idx, tag2idx, idx2tag)`` where ``idx2tag`` inverts ``tag2idx``.
    """
    word2idx = {'<PAD>': 0, '<UNK>': 1}
    for tokens in sentences:
        for token in tokens:
            # len() is evaluated before insertion, so a new word receives the
            # next free index and a known word is left untouched.
            word2idx.setdefault(token, len(word2idx))

    tag2idx = {}
    for sequence in tags:
        for name in sequence:
            tag2idx.setdefault(name, len(tag2idx))
    tag2idx.setdefault('O', len(tag2idx))

    idx2tag = dict(zip(tag2idx.values(), tag2idx.keys()))
    return word2idx, tag2idx, idx2tag

# 3. Encode and pad
def encode(sentences, tags, word2idx, tag2idx, max_len=50):
    """Turn sentences and tag sequences into fixed-length id arrays.

    Words missing from ``word2idx`` map to the ``'<UNK>'`` id. Sequences
    shorter than ``max_len`` are right-padded with the ``'<PAD>'`` word id and
    the ``'O'`` tag id; longer ones are cut off at ``max_len``.

    Returns:
        ``(X, y)`` NumPy arrays built from the per-sentence id lists.
    """
    encoded_words, encoded_tags = [], []
    for tokens, tag_seq in zip(sentences, tags):
        token_ids = [word2idx.get(tok, word2idx['<UNK>']) for tok in tokens]
        tag_ids = [tag2idx[name] for name in tag_seq]
        # Both lists grow by the same amount, driven by the word count.
        shortfall = max_len - len(token_ids)
        if shortfall > 0:
            token_ids.extend([word2idx['<PAD>']] * shortfall)
            tag_ids.extend([tag2idx['O']] * shortfall)  # Default tag
        encoded_words.append(token_ids[:max_len])
        encoded_tags.append(tag_ids[:max_len])
    return np.array(encoded_words), np.array(encoded_tags)

# 4. Build GRU model
def build_model(vocab_size, tag_size, max_len):
    """Assemble and compile the bidirectional-GRU sequence tagger.

    Args:
        vocab_size: Number of rows of the embedding table.
        tag_size: Number of output classes per time step.
        max_len: Fixed length of the input id sequences.

    Returns:
        A compiled ``tf.keras.Model`` that emits a softmax over ``tag_size``
        classes at every position.
    """
    layers = tf.keras.layers
    token_ids = tf.keras.Input(shape=(max_len,))
    embedded = layers.Embedding(input_dim=vocab_size, output_dim=64)(token_ids)
    contextual = layers.Bidirectional(layers.GRU(64, return_sequences=True))(embedded)
    tag_probs = layers.TimeDistributed(layers.Dense(tag_size, activation='softmax'))(contextual)

    model = tf.keras.Model(token_ids, tag_probs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# 5. Load and preprocess data
file_path = "/content/IE.xlsx"  # Update this path to your IE-tagged file
max_len = 50  # Padded sequence length, shared by encode() and build_model()
sentences, tags = load_excel_data(file_path)
word2idx, tag2idx, idx2tag = build_vocab(sentences, tags)
X, y = encode(sentences, tags, word2idx, tag2idx, max_len=max_len)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Train model
# The model is compiled with categorical cross-entropy, so the integer tag ids
# are one-hot encoded before fitting.
num_tags = len(tag2idx)
model = build_model(len(word2idx), num_tags, max_len=max_len)
y_train_cat = to_categorical(y_train, num_classes=num_tags)
y_test_cat = to_categorical(y_test, num_classes=num_tags)  # Not used below in this cell
model.fit(X_train, y_train_cat, batch_size=32, epochs=5, validation_split=0.1)

# 7. Predict and evaluate
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=-1)
y_true_labels = y_test

# 8. Flatten and convert to tag strings
# Only real tokens are scored: the boolean mask drops every padded position,
# and boolean indexing yields the elements in row-major order.
real_token_mask = X_test != word2idx['<PAD>']
y_pred_flat = [idx2tag[k] for k in y_pred_labels[real_token_mask].tolist()]
y_true_flat = [idx2tag[k] for k in y_true_labels[real_token_mask].tolist()]

# 9. Report
# zero_division=0 reports the same 0.0 that sklearn's default "warn" mode uses
# for an undefined score, without emitting UndefinedMetricWarning.
print("Classification Report:")
print(classification_report(y_true_flat, y_pred_flat, zero_division=0))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_true_flat, y_pred_flat):.4f}")
print(f"Precision: {precision_score(y_true_flat, y_pred_flat, average='weighted', zero_division=0):.4f}")
print(f"Recall: {recall_score(y_true_flat, y_pred_flat, average='weighted', zero_division=0):.4f}")
print(f"F1 Score: {f1_score(y_true_flat, y_pred_flat, average='weighted', zero_division=0):.4f}")


Epoch 1/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 128ms/step - accuracy: 0.8237 - loss: 0.5469 - val_accuracy: 0.9844 - val_loss: 0.0664
Epoch 2/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 117ms/step - accuracy: 0.9837 - loss: 0.0596 - val_accuracy: 0.9934 - val_loss: 0.0277
Epoch 3/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 96ms/step - accuracy: 0.9936 - loss: 0.0261 - val_accuracy: 0.9954 - val_loss: 0.0185
Epoch 4/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 105ms/step - accuracy: 0.9961 - loss: 0.0150 - val_accuracy: 0.9971 - val_loss: 0.0120
Epoch 5/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 113ms/step - accuracy: 0.9980 - loss: 0.0094 - val_accuracy: 0.9979 - val_loss: 0.0085
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 56ms/step
Classification Report:
              precision    recall  f1-score   support

           E       0.95      0.92      0

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.utils import to_categorical

# 1. Load Excel Data
def load_excel_data(file_path):
    """Read a token/tag table and group its rows into sentences.

    The file must provide the columns ``'Word i'`` and
    ``'Word i entity tag'``; rows with a missing value in either column are
    dropped. A row whose word is ``'.'`` or ``'؟'`` closes the current
    sentence and is itself not kept as a token.

    Args:
        file_path: Path of the data file. A ``.csv`` extension (matched
            case-insensitively) is read with ``pd.read_csv``; anything else
            is handed to ``pd.read_excel``.

    Returns:
        A ``(sentences, labels)`` pair of parallel lists: each sentence is a
        list of stripped word strings and each label entry is the matching
        list of stripped tag strings.
    """
    # Compare the extension case-insensitively so "DATA.CSV" is not handed to
    # the Excel reader.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        token = str(word).strip()
        if token in ('.', '؟'):  # End of sentence
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(token)
            label.append(str(tag).strip())

    # Flush a trailing sentence that has no closing terminator.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Build vocab
def build_vocab(sentences, tags):
    """Create index lookups for words and tags.

    Words are numbered in order of first appearance after the reserved
    ``'<PAD>'`` (0) and ``'<UNK>'`` (1) entries. Tags are numbered from 0 in
    order of first appearance, and ``'O'`` is appended when no tag sequence
    contained it.

    Returns:
        ``(word2idx, tag2idx, idx2tag)`` where ``idx2tag`` inverts ``tag2idx``.
    """
    word2idx = {'<PAD>': 0, '<UNK>': 1}
    for tokens in sentences:
        for token in tokens:
            # len() is evaluated before insertion, so a new word receives the
            # next free index and a known word is left untouched.
            word2idx.setdefault(token, len(word2idx))

    tag2idx = {}
    for sequence in tags:
        for name in sequence:
            tag2idx.setdefault(name, len(tag2idx))
    tag2idx.setdefault('O', len(tag2idx))

    idx2tag = dict(zip(tag2idx.values(), tag2idx.keys()))
    return word2idx, tag2idx, idx2tag

# 3. Encode and pad
def encode(sentences, tags, word2idx, tag2idx, max_len=50):
    """Turn sentences and tag sequences into fixed-length id arrays.

    Words missing from ``word2idx`` map to the ``'<UNK>'`` id. Sequences
    shorter than ``max_len`` are right-padded with the ``'<PAD>'`` word id and
    the ``'O'`` tag id; longer ones are cut off at ``max_len``.

    Returns:
        ``(X, y)`` NumPy arrays built from the per-sentence id lists.
    """
    encoded_words, encoded_tags = [], []
    for tokens, tag_seq in zip(sentences, tags):
        token_ids = [word2idx.get(tok, word2idx['<UNK>']) for tok in tokens]
        tag_ids = [tag2idx[name] for name in tag_seq]
        # Both lists grow by the same amount, driven by the word count.
        shortfall = max_len - len(token_ids)
        if shortfall > 0:
            token_ids.extend([word2idx['<PAD>']] * shortfall)
            tag_ids.extend([tag2idx['O']] * shortfall)  # Default tag
        encoded_words.append(token_ids[:max_len])
        encoded_tags.append(tag_ids[:max_len])
    return np.array(encoded_words), np.array(encoded_tags)

# 4. Build GRU model
def build_model(vocab_size, tag_size, max_len):
    """Assemble and compile the bidirectional-GRU sequence tagger.

    Args:
        vocab_size: Number of rows of the embedding table.
        tag_size: Number of output classes per time step.
        max_len: Fixed length of the input id sequences.

    Returns:
        A compiled ``tf.keras.Model`` that emits a softmax over ``tag_size``
        classes at every position.
    """
    layers = tf.keras.layers
    token_ids = tf.keras.Input(shape=(max_len,))
    embedded = layers.Embedding(input_dim=vocab_size, output_dim=64)(token_ids)
    contextual = layers.Bidirectional(layers.GRU(64, return_sequences=True))(embedded)
    tag_probs = layers.TimeDistributed(layers.Dense(tag_size, activation='softmax'))(contextual)

    model = tf.keras.Model(token_ids, tag_probs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# 5. Load and preprocess data
file_path = "/content/IOB.xlsx"  # Update this path to your IOB-tagged file
max_len = 50  # Padded sequence length, shared by encode() and build_model()
sentences, tags = load_excel_data(file_path)
word2idx, tag2idx, idx2tag = build_vocab(sentences, tags)
X, y = encode(sentences, tags, word2idx, tag2idx, max_len=max_len)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Train model
# The model is compiled with categorical cross-entropy, so the integer tag ids
# are one-hot encoded before fitting.
num_tags = len(tag2idx)
model = build_model(len(word2idx), num_tags, max_len=max_len)
y_train_cat = to_categorical(y_train, num_classes=num_tags)
y_test_cat = to_categorical(y_test, num_classes=num_tags)  # Not used below in this cell
model.fit(X_train, y_train_cat, batch_size=32, epochs=5, validation_split=0.1)

# 7. Predict and evaluate
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=-1)
y_true_labels = y_test

# 8. Flatten and convert to tag strings
# Only real tokens are scored: the boolean mask drops every padded position,
# and boolean indexing yields the elements in row-major order.
real_token_mask = X_test != word2idx['<PAD>']
y_pred_flat = [idx2tag[k] for k in y_pred_labels[real_token_mask].tolist()]
y_true_flat = [idx2tag[k] for k in y_true_labels[real_token_mask].tolist()]

# 9. Report
# zero_division=0 reports the same 0.0 that sklearn's default "warn" mode uses
# for an undefined score, without emitting UndefinedMetricWarning.
print("Classification Report:")
print(classification_report(y_true_flat, y_pred_flat, zero_division=0))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_true_flat, y_pred_flat):.4f}")
print(f"Precision: {precision_score(y_true_flat, y_pred_flat, average='weighted', zero_division=0):.4f}")
print(f"Recall: {recall_score(y_true_flat, y_pred_flat, average='weighted', zero_division=0):.4f}")
print(f"F1 Score: {f1_score(y_true_flat, y_pred_flat, average='weighted', zero_division=0):.4f}")


Epoch 1/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 125ms/step - accuracy: 0.9708 - loss: 0.2866 - val_accuracy: 0.9913 - val_loss: 0.0284
Epoch 2/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 113ms/step - accuracy: 0.9937 - loss: 0.0255 - val_accuracy: 0.9975 - val_loss: 0.0118
Epoch 3/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 107ms/step - accuracy: 0.9972 - loss: 0.0108 - val_accuracy: 0.9983 - val_loss: 0.0071
Epoch 4/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 112ms/step - accuracy: 0.9985 - loss: 0.0058 - val_accuracy: 0.9987 - val_loss: 0.0057
Epoch 5/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 124ms/step - accuracy: 0.9990 - loss: 0.0040 - val_accuracy: 0.9989 - val_loss: 0.0038
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 59ms/step
Classification Report:
              precision    recall  f1-score   support

           B       0.98      0.93      

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.utils import to_categorical

# 1. Load Excel Data
def load_excel_data(file_path):
    """Read a token/tag table and group its rows into sentences.

    The file must provide the columns ``'Word i'`` and
    ``'Word i entity tag'``; rows with a missing value in either column are
    dropped. A row whose word is ``'.'`` or ``'؟'`` closes the current
    sentence and is itself not kept as a token.

    Args:
        file_path: Path of the data file. A ``.csv`` extension (matched
            case-insensitively) is read with ``pd.read_csv``; anything else
            is handed to ``pd.read_excel``.

    Returns:
        A ``(sentences, labels)`` pair of parallel lists: each sentence is a
        list of stripped word strings and each label entry is the matching
        list of stripped tag strings.
    """
    # Compare the extension case-insensitively so "DATA.CSV" is not handed to
    # the Excel reader.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        token = str(word).strip()
        if token in ('.', '؟'):  # End of sentence
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(token)
            label.append(str(tag).strip())

    # Flush a trailing sentence that has no closing terminator.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Build vocab
def build_vocab(sentences, tags):
    """Create the word and tag index tables.

    Words receive ids in order of first appearance, after the reserved
    entries ``'<PAD>'`` (0) and ``'<UNK>'`` (1). Tags receive ids in order
    of first appearance starting at 0; ``'O'`` is appended when no sequence
    contains it.

    Returns:
        ``(word2idx, tag2idx, idx2tag)`` where ``idx2tag`` is the inverse
        of ``tag2idx``.
    """
    word2idx = {'<PAD>': 0, '<UNK>': 1}
    for token in (w for sent in sentences for w in sent):
        # setdefault leaves an existing id alone and otherwise assigns the
        # next free one (len() is evaluated before the insertion).
        word2idx.setdefault(token, len(word2idx))

    tag2idx = {}
    for tag_name in (t for seq in tags for t in seq):
        tag2idx.setdefault(tag_name, len(tag2idx))
    # Guarantee an 'O' entry even if the data never uses it.
    tag2idx.setdefault('O', len(tag2idx))

    idx2tag = dict(zip(tag2idx.values(), tag2idx.keys()))
    return word2idx, tag2idx, idx2tag

# 3. Encode and pad
def encode(sentences, tags, word2idx, tag2idx, max_len=50):
    """Turn sentences and tag sequences into fixed-width integer arrays.

    Each word is replaced by its id (``'<UNK>'`` id for unknown words) and
    each tag by its id. Rows shorter than ``max_len`` are right-padded with
    the ``'<PAD>'`` word id and the ``'O'`` tag id; longer rows are cut to
    their first ``max_len`` positions.

    Args:
        sentences: sequence of token lists.
        tags: sequence of tag lists, parallel to ``sentences``.
        word2idx: word -> id mapping containing ``'<PAD>'`` and ``'<UNK>'``.
        tag2idx: tag -> id mapping; needs an ``'O'`` entry only when some
            row actually requires padding.
        max_len: number of columns in the returned arrays.

    Returns:
        ``(X, y)``: integer arrays of shape ``(len(sentences), max_len)``,
        including the empty case, which yields shape ``(0, max_len)``.

    Raises:
        KeyError: if a tag in ``tags`` is not a key of ``tag2idx``.
    """
    unk_id = word2idx['<UNK>']
    X, y = [], []
    for sent, label_seq in zip(sentences, tags):
        word_ids = [word2idx.get(w, unk_id) for w in sent]
        label_ids = [tag2idx[l] for l in label_seq]
        missing = max_len - len(word_ids)
        if missing > 0:
            # Pad in one step instead of appending position by position.
            word_ids.extend([word2idx['<PAD>']] * missing)
            label_ids.extend([tag2idx['O']] * missing)  # Default tag
        X.append(word_ids[:max_len])
        y.append(label_ids[:max_len])
    # Explicit dtype and shape keep the result 2-D and integer even when
    # there are no sentences (np.array([]) alone would be 1-D float).
    n_rows = len(X)
    return (
        np.array(X, dtype=int).reshape(n_rows, max_len),
        np.array(y, dtype=int).reshape(n_rows, max_len),
    )

# 4. Build GRU model
def build_model(vocab_size, tag_size, max_len):
    """Assemble and compile the bidirectional GRU tagger.

    Pipeline: integer token ids of length ``max_len`` -> 64-dimensional
    embedding -> bidirectional GRU (64 units per direction, one output per
    timestep) -> per-timestep softmax over ``tag_size`` classes.

    The model is compiled with the Adam optimizer, categorical
    cross-entropy loss and the accuracy metric.

    Returns:
        The compiled ``tf.keras.Model``.
    """
    layers = tf.keras.layers

    token_ids = tf.keras.Input(shape=(max_len,))
    embedded = layers.Embedding(input_dim=vocab_size, output_dim=64)(token_ids)

    recurrent_cell = layers.GRU(64, return_sequences=True)
    contextual = layers.Bidirectional(recurrent_cell)(embedded)

    tag_classifier = layers.Dense(tag_size, activation='softmax')
    tag_probs = layers.TimeDistributed(tag_classifier)(contextual)

    model = tf.keras.Model(token_ids, tag_probs)
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

# 5. Load and preprocess data
file_path = "/content/IOBES.xlsx"  # Update this path to your IOBES-tagged file
sentences, tags = load_excel_data(file_path)
word2idx, tag2idx, idx2tag = build_vocab(sentences, tags)
X, y = encode(sentences, tags, word2idx, tag2idx)
# Hold out 20% of the sentences for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 6. Train model
num_tags = len(tag2idx)
model = build_model(len(word2idx), num_tags, max_len=50)
# One-hot targets, as required by the categorical cross-entropy loss.
y_train_cat = to_categorical(y_train, num_classes=num_tags)
y_test_cat = to_categorical(y_test, num_classes=num_tags)
model.fit(
    X_train,
    y_train_cat,
    batch_size=32,
    epochs=5,
    validation_split=0.1,
)

# 7. Predict and evaluate
y_pred = model.predict(X_test)
y_pred_labels = y_pred.argmax(axis=-1)  # most probable tag id per position
y_true_labels = y_test

# 8. Flatten and convert to tag strings
# Boolean-mask indexing walks the arrays in row-major order, so only the
# non-padding positions are kept, sentence by sentence.
is_real_token = X_test != word2idx['<PAD>']
y_pred_flat = [idx2tag[tag_id] for tag_id in y_pred_labels[is_real_token]]
y_true_flat = [idx2tag[tag_id] for tag_id in y_true_labels[is_real_token]]

# 9. Report
print("Classification Report:")
print(classification_report(y_true_flat, y_pred_flat))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_true_flat, y_pred_flat):.4f}")
print(f"Precision: {precision_score(y_true_flat, y_pred_flat, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_true_flat, y_pred_flat, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_true_flat, y_pred_flat, average='weighted'):.4f}")


Epoch 1/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 124ms/step - accuracy: 0.9280 - loss: 0.4291 - val_accuracy: 0.9908 - val_loss: 0.0390
Epoch 2/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 109ms/step - accuracy: 0.9908 - loss: 0.0358 - val_accuracy: 0.9960 - val_loss: 0.0173
Epoch 3/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 112ms/step - accuracy: 0.9960 - loss: 0.0160 - val_accuracy: 0.9973 - val_loss: 0.0115
Epoch 4/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 95ms/step - accuracy: 0.9971 - loss: 0.0099 - val_accuracy: 0.9979 - val_loss: 0.0084
Epoch 5/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 104ms/step - accuracy: 0.9983 - loss: 0.0072 - val_accuracy: 0.9986 - val_loss: 0.0062
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 66ms/step
Classification Report:
              precision    recall  f1-score   support

           B       0.97      0.93      0.

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.utils import to_categorical

# 1. Load Excel Data
def load_excel_data(file_path):
    """Read a token/tag table and split it into sentences.

    The file is read with ``pd.read_csv`` when its name ends in ``.csv``
    (checked case-insensitively) and with ``pd.read_excel`` otherwise.
    Only the columns ``'Word i'`` and ``'Word i entity tag'`` are used, and
    rows missing either value are dropped.

    A row whose stripped word is ``'.'`` or ``'؟'`` closes the current
    sentence; that terminator row (word and tag) is not kept.

    Args:
        file_path: path of the CSV/Excel file, as ``str`` or path-like.

    Returns:
        ``(sentences, labels)``: two parallel lists; ``sentences[k]`` is a
        list of stripped word strings and ``labels[k]`` the list of
        stripped tag strings for the same positions.
    """
    # Go through str() and lower() so that 'DATA.CSV' and pathlib.Path
    # arguments are routed to the CSV reader as well.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentence_terminators = {'.', '؟'}
    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        token = str(word).strip()
        if token in sentence_terminators:  # End of sentence
            # Consecutive terminators must not produce empty sentences.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
            continue
        sentence.append(token)
        label.append(str(tag).strip())

    # Flush a trailing sentence that has no closing terminator.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Build vocab
def build_vocab(sentences, tags):
    """Create the word and tag index tables.

    Words receive ids in order of first appearance, after the reserved
    entries ``'<PAD>'`` (0) and ``'<UNK>'`` (1). Tags receive ids in order
    of first appearance starting at 0; ``'O'`` is appended when no sequence
    contains it.

    Returns:
        ``(word2idx, tag2idx, idx2tag)`` where ``idx2tag`` is the inverse
        of ``tag2idx``.
    """
    word2idx = {'<PAD>': 0, '<UNK>': 1}
    for token in (w for sent in sentences for w in sent):
        # setdefault leaves an existing id alone and otherwise assigns the
        # next free one (len() is evaluated before the insertion).
        word2idx.setdefault(token, len(word2idx))

    tag2idx = {}
    for tag_name in (t for seq in tags for t in seq):
        tag2idx.setdefault(tag_name, len(tag2idx))
    # Guarantee an 'O' entry even if the data never uses it.
    tag2idx.setdefault('O', len(tag2idx))

    idx2tag = dict(zip(tag2idx.values(), tag2idx.keys()))
    return word2idx, tag2idx, idx2tag

# 3. Encode and pad
def encode(sentences, tags, word2idx, tag2idx, max_len=50):
    """Turn sentences and tag sequences into fixed-width integer arrays.

    Each word is replaced by its id (``'<UNK>'`` id for unknown words) and
    each tag by its id. Rows shorter than ``max_len`` are right-padded with
    the ``'<PAD>'`` word id and the ``'O'`` tag id; longer rows are cut to
    their first ``max_len`` positions.

    Args:
        sentences: sequence of token lists.
        tags: sequence of tag lists, parallel to ``sentences``.
        word2idx: word -> id mapping containing ``'<PAD>'`` and ``'<UNK>'``.
        tag2idx: tag -> id mapping; needs an ``'O'`` entry only when some
            row actually requires padding.
        max_len: number of columns in the returned arrays.

    Returns:
        ``(X, y)``: integer arrays of shape ``(len(sentences), max_len)``,
        including the empty case, which yields shape ``(0, max_len)``.

    Raises:
        KeyError: if a tag in ``tags`` is not a key of ``tag2idx``.
    """
    unk_id = word2idx['<UNK>']
    X, y = [], []
    for sent, label_seq in zip(sentences, tags):
        word_ids = [word2idx.get(w, unk_id) for w in sent]
        label_ids = [tag2idx[l] for l in label_seq]
        missing = max_len - len(word_ids)
        if missing > 0:
            # Pad in one step instead of appending position by position.
            word_ids.extend([word2idx['<PAD>']] * missing)
            label_ids.extend([tag2idx['O']] * missing)  # Default tag
        X.append(word_ids[:max_len])
        y.append(label_ids[:max_len])
    # Explicit dtype and shape keep the result 2-D and integer even when
    # there are no sentences (np.array([]) alone would be 1-D float).
    n_rows = len(X)
    return (
        np.array(X, dtype=int).reshape(n_rows, max_len),
        np.array(y, dtype=int).reshape(n_rows, max_len),
    )

# 4. Build GRU model
def build_model(vocab_size, tag_size, max_len):
    """Assemble and compile the bidirectional GRU tagger.

    Pipeline: integer token ids of length ``max_len`` -> 64-dimensional
    embedding -> bidirectional GRU (64 units per direction, one output per
    timestep) -> per-timestep softmax over ``tag_size`` classes.

    The model is compiled with the Adam optimizer, categorical
    cross-entropy loss and the accuracy metric.

    Returns:
        The compiled ``tf.keras.Model``.
    """
    layers = tf.keras.layers

    token_ids = tf.keras.Input(shape=(max_len,))
    embedded = layers.Embedding(input_dim=vocab_size, output_dim=64)(token_ids)

    recurrent_cell = layers.GRU(64, return_sequences=True)
    contextual = layers.Bidirectional(recurrent_cell)(embedded)

    tag_classifier = layers.Dense(tag_size, activation='softmax')
    tag_probs = layers.TimeDistributed(tag_classifier)(contextual)

    model = tf.keras.Model(token_ids, tag_probs)
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

# 5. Load and preprocess data
file_path = "/content/IOE.xlsx"  # Update this path to your IOE-tagged file
sentences, tags = load_excel_data(file_path)
word2idx, tag2idx, idx2tag = build_vocab(sentences, tags)
X, y = encode(sentences, tags, word2idx, tag2idx)
# Hold out 20% of the sentences for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 6. Train model
num_tags = len(tag2idx)
model = build_model(len(word2idx), num_tags, max_len=50)
# One-hot targets, as required by the categorical cross-entropy loss.
y_train_cat = to_categorical(y_train, num_classes=num_tags)
y_test_cat = to_categorical(y_test, num_classes=num_tags)
model.fit(
    X_train,
    y_train_cat,
    batch_size=32,
    epochs=5,
    validation_split=0.1,
)

# 7. Predict and evaluate
y_pred = model.predict(X_test)
y_pred_labels = y_pred.argmax(axis=-1)  # most probable tag id per position
y_true_labels = y_test

# 8. Flatten and convert to tag strings
# Boolean-mask indexing walks the arrays in row-major order, so only the
# non-padding positions are kept, sentence by sentence.
is_real_token = X_test != word2idx['<PAD>']
y_pred_flat = [idx2tag[tag_id] for tag_id in y_pred_labels[is_real_token]]
y_true_flat = [idx2tag[tag_id] for tag_id in y_true_labels[is_real_token]]

# 9. Report
print("Classification Report:")
print(classification_report(y_true_flat, y_pred_flat))

print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_true_flat, y_pred_flat):.4f}")
print(f"Precision: {precision_score(y_true_flat, y_pred_flat, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_true_flat, y_pred_flat, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_true_flat, y_pred_flat, average='weighted'):.4f}")


Epoch 1/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 98ms/step - accuracy: 0.9738 - loss: 0.2799 - val_accuracy: 0.9906 - val_loss: 0.0300
Epoch 2/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 95ms/step - accuracy: 0.9916 - loss: 0.0296 - val_accuracy: 0.9961 - val_loss: 0.0130
Epoch 3/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 105ms/step - accuracy: 0.9963 - loss: 0.0121 - val_accuracy: 0.9980 - val_loss: 0.0085
Epoch 4/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 85ms/step - accuracy: 0.9981 - loss: 0.0071 - val_accuracy: 0.9986 - val_loss: 0.0053
Epoch 5/5
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 101ms/step - accuracy: 0.9990 - loss: 0.0039 - val_accuracy: 0.9991 - val_loss: 0.0040
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 51ms/step
Classification Report:
              precision    recall  f1-score   support

           E       0.97      0.93      0.95 