<a href="https://colab.research.google.com/github/leman-cap13/my_projects/blob/main/name_entity_recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Open Colab's file-upload dialog; a later cell copies an uploaded kaggle.json
# from the working directory, so that is the file expected here.
from google.colab import files
files.upload()

In [None]:
# Copy the uploaded kaggle.json into ~/.kaggle (created if missing).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Make the credentials file readable/writable by the owner only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the rohitr4307/ner-dataset archive with the Kaggle CLI.
!kaggle datasets download rohitr4307/ner-dataset

In [None]:
import zipfile

In [None]:
# Unpack the downloaded archive into the current working directory.
with zipfile.ZipFile('/content/ner-dataset.zip', mode='r') as archive:
  archive.extractall()

In [None]:
import pandas as pd
# Load the extracted CSV into a DataFrame and display it.
df=pd.read_csv('/content/NER_Dataset.csv')
df

In [None]:
# List the column names of the loaded frame.
df.columns

In [None]:
# Inspect the 'Word' entry at row position 4.
df['Word'].iloc[4]

In [None]:
# Distinct raw values stored in the 'Tag' column.
df['Tag'].unique()

In [None]:
# Tags use the BIO format: B-gpe, B-tim, B-geo, B-org, I-geo, I-org, I-tim

In [None]:
import numpy as np

In [None]:
# Step 1: tokenization
# The input is already tokenized

In [None]:
# Convert tokens to token ids

In [None]:
import ast

# Each 'Word' cell is parsed with ast.literal_eval (the code below iterates the
# result as a list of tokens). Cells that are no longer strings are left as-is,
# so re-running this cell does not raise on values that were already parsed.
df['Word'] = df['Word'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)


In [None]:
# Preview the first parsed 'Word' entries.
df['Word'].head()

In [None]:
# Flatten the per-row token lists into one long list of tokens.
all_tokens = [token for row in df['Word'] for token in row]
# Show only a short preview; echoing the full list would flood the cell output.
all_tokens[:20]

In [None]:
# Assign ids 1..N to the distinct tokens in sorted order (id 0 is not assigned here).
unique_tokens = sorted(set(all_tokens))
vocab = dict(zip(unique_tokens, range(1, len(unique_tokens) + 1)))

In [None]:
# One slot more than the number of tokens: ids in vocab start at 1, and id 0 is
# used separately (unknown-token fallback in tokens_to_ids, padding value).
vocab_size=len(vocab)+1

In [None]:
# Show the resulting vocabulary size.
vocab_size

In [None]:
def tokens_to_ids(token_list, vocab):
    """Map each token to its id in ``vocab``; tokens missing from ``vocab`` map to 0."""
    ids = []
    for tok in token_list:
        ids.append(vocab.get(tok, 0))
    return ids

# Encode the flattened token list as a quick check of the token -> id mapping.
token_ids = tokens_to_ids(all_tokens, vocab)
# Print a short preview only; the full list holds one id per token in all_tokens.
print(token_ids[:20])


In [None]:
# Encode every row's token list as a list of vocabulary ids.
df['token_ids'] = df['Word'].apply(lambda tokens: tokens_to_ids(tokens, vocab))

In [None]:
# Display the encoded token-id column.
df['token_ids']

In [None]:
# Convert the tags to tag ids

In [None]:
# Check the dtype of the 'Tag' column before parsing it.
df['Tag'].dtype

In [None]:
import ast

# Each 'Tag' cell is parsed with ast.literal_eval (later cells iterate the
# result as a list of tags). Cells that are no longer strings are left as-is,
# so re-running this cell does not raise on values that were already parsed.
df['Tag'] = df['Tag'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)


In [None]:
# Collect every distinct tag across all rows, in sorted order, and display it.
flat_tags = df['Tag'].explode()
uniques_tag = sorted(flat_tags.unique())
uniques_tag

In [None]:
# Forward (tag -> id) and reverse (id -> tag) lookups; ids follow the sorted tag order.
tag2id = dict(zip(uniques_tag, range(len(uniques_tag))))
id2tag = dict(enumerate(uniques_tag))

In [None]:
# Show the tag -> id mapping.
tag2id

In [None]:
# Show the id -> tag mapping.
id2tag

In [None]:
# Encode every row's tag list as a list of tag ids (raises KeyError on a tag missing from tag2id).
df['tag_ids'] = df['Tag'].apply(lambda tags: [tag2id[tag] for tag in tags])

In [None]:
# Display the encoded tag-id column.
df['tag_ids']

In [None]:
# Next, pad the token ids so that inputs and outputs all have the same length

In [None]:
import tensorflow as tf

In [None]:
# Row length = number of token ids in that row.
df['seq_len'] = df['token_ids'].map(len)

longest = df['seq_len'].max()
print("Max Length:", longest)

In [None]:
# Candidate padding length: the 0.95 quantile of the row lengths, truncated to an int.
MAX_LEN = int(df['seq_len'].quantile(0.95))

In [None]:
# Show the quantile-derived length.
MAX_LEN

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# NOTE(review): this hard-coded value overwrites the MAX_LEN derived from the
# 0.95 quantile in an earlier cell — confirm the two are meant to agree.
MAX_LEN = 35

# Bring every row to exactly MAX_LEN entries, padding after the real values
# (padding='post'). Token ids are padded with 0.
df['input_ids'] = list(pad_sequences(df['token_ids'], maxlen=MAX_LEN, padding='post', value=0))
# Tag ids are padded with the id of the 'O' tag.
# NOTE(review): padded positions therefore carry the same label as genuine 'O'
# tokens; unless they are masked downstream they will count towards loss and
# accuracy — TODO confirm.
df['label_ids'] = list(pad_sequences(df['tag_ids'], maxlen=MAX_LEN, padding='post', value=tag2id['O']))


In [None]:
# Display the padded token-id column.
df['input_ids']

In [None]:
# Display the padded tag-id column.
df['label_ids']

In [None]:
# Split into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Stack the per-row padded arrays into int32 matrices (one row per example).
X = np.asarray(df['input_ids'].tolist(), dtype=np.int32)
y = np.asarray(df['label_ids'].tolist(), dtype=np.int32)

# Hold out 20% of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [None]:
# Confirm the integer dtype of the input matrix.
print("X dtype:", X.dtype)

In [None]:
# Convert to TensorFlow datasets

In [None]:
batch_size=32
# Shuffle the training examples: Model.fit does not shuffle a tf.data.Dataset
# by itself, so without this every epoch would see identical batches in the
# same order. The buffer spans the whole training set for a full shuffle.
train_dataset=(
    tf.data.Dataset.from_tensor_slices((X_train,y_train))
    .shuffle(buffer_size=len(X_train), seed=42)
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)
# Validation data deliberately keeps its original order so that predictions
# made from valid_dataset line up row-for-row with X_test / y_test.
valid_dataset=(
    tf.data.Dataset.from_tensor_slices((X_test,y_test))
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# Show the training dataset's element spec.
train_dataset

In [None]:
# Show the validation dataset's element spec.
valid_dataset

In [None]:
# Encoder-style transformer block that outputs one tag distribution per token.

# Input: batches of token-id sequences; the sequence dimension is left unspecified.
encoder_inputs=tf.keras.Input(shape=[None,], name='encoder_inputs')

# Token embedding layer; mask_zero=True makes the layer treat id 0 as padding.
embed_layer=tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=128, mask_zero=True, name='embed_layer')

# Embedded tokens
encoder_embed=embed_layer(encoder_inputs)


# Learned positional embedding: a table with MAX_LEN rows of size embed_size.
# NOTE(review): embed_size must match the token embedding's output_dim (128 above).
embed_size=128
pos_embed_layer=tf.keras.layers.Embedding(MAX_LEN, embed_size)

# Position indices 0 .. seq_len-1, taken from the second dimension of the input.
pos_encoder=tf.keras.layers.Lambda(lambda x: tf.range(start=0,limit=tf.shape(x)[1],
                                                            delta=1))(encoder_inputs)

positional_embedding=pos_embed_layer(pos_encoder)


# Sum the token and positional embeddings (element-wise add, not concatenation).
encoder_embedding=positional_embedding+encoder_embed

# Multi-head self-attention: the embedded sequence is passed as both query and
# value, followed by a residual add and layer normalization.

num_heads=3
encoder_attention=tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_size)(encoder_embedding,encoder_embedding)
encoder_attention=tf.keras.layers.LayerNormalization(epsilon=1e-6)(encoder_attention+encoder_embedding)


# Feed-forward sub-layer: expand to ff_dim with ReLU, project back to
# embed_size, then residual add and layer normalization.
ff_dim=512
encoder_ff=tf.keras.layers.Dense(ff_dim, activation='relu')(encoder_attention)
encoder_ff=tf.keras.layers.Dense(embed_size)(encoder_ff)
encoder_ff=tf.keras.layers.LayerNormalization(epsilon=1e-6)(encoder_ff+encoder_attention)


# Output layer: softmax over the len(tag2id) tag classes at every position.
output_layer=tf.keras.layers.Dense(len(tag2id), activation='softmax')(encoder_ff)

In [None]:
# Build the Keras model from the input tensor to the per-token softmax output.
model=tf.keras.Model(inputs=encoder_inputs, outputs=output_layer)

In [None]:
# Compile the model; the sparse loss is used because the labels are integer
# tag ids rather than one-hot vectors.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 10 epochs, evaluating on the held-out split after each epoch.
model.fit(train_dataset, validation_data=valid_dataset, epochs=10)

In [None]:
# Sanity check: compare the largest token id in X with the embedding's vocab size.
largest_id = X.max()
print("Max input_id:", largest_id)
print("Vocab size:", vocab_size)


In [None]:
# Print the tokens and the entity (non-'O') tags for the first few validation examples.
num_examples = 5
y_pred_probs = model.predict(valid_dataset)

# Most probable tag id at every position.
y_pred = np.argmax(y_pred_probs, axis=-1)

# Reverse vocabulary lookup (id -> token), built once. Iterating over `vocab`
# and testing membership in the row would list tokens in vocabulary order and
# drop repeats instead of reproducing the sentence.
id2token = {idx: token for token, idx in vocab.items()}

for i in range(num_examples):
    # Positions holding real tokens; id 0 is the padding value.
    is_real_token = X_test[i] != 0

    print(f"\nExample {i+1}")
    print("Input Tokens:")
    input_tokens = [id2token[token_id] for token_id in X_test[i][is_real_token]]
    print(input_tokens)

    print("\nTrue Tags:")
    true_tags = [id2tag[tag_id] for tag_id in y_test[i][is_real_token] if tag_id != tag2id['O']]
    print(true_tags)

    # Predictions at padding positions are ignored: they do not belong to any token.
    print("\nPredicted Tags:")
    pred_tags = [id2tag[tag_id] for tag_id in y_pred[i][is_real_token] if tag_id != tag2id['O']]
    print(pred_tags)


In [None]:
# Reverse vocabulary lookup (id -> token), built once so each token is resolved
# with a dict lookup instead of a scan over the whole vocabulary.
id2token = {idx: token for token, idx in vocab.items()}

for i in range(3):  # look at the first 3 examples
    input_ids = X_test[i]
    true_ids = y_test[i]
    pred_ids = y_pred[i]

    print(f"\nExample {i+1}")
    print(f"{'Token':15} {'True Tag':10} {'Predicted Tag'}")
    print("-" * 40)

    for token_id, true_id, pred_id in zip(input_ids, true_ids, pred_ids):
        if token_id == 0:  # skip padding positions
            continue
        token = id2token[token_id]
        true_tag = id2tag[true_id]
        pred_tag = id2tag[pred_id]
        print(f"{token:15} {true_tag:10} {pred_tag}")
