In [37]:
!pip install keras_preprocessing
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Bidirectional
from keras_preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Load the data in CoNLL format
def load_data(file_path):
    data = []
    sentence = []
    labels = []
    with open(file_path, 'r', encoding='utf8') as f:
        for line in f:
            if line.startswith('#') or line == '\n':
                if len(sentence) > 0:
                    data.append((sentence, labels))
                    sentence = []
                    labels = []
            else:
                parts = line.split()
                sentence.append(parts[0])
                if(len(parts[-1])>1):
                  labels.append(str((parts[-1]))[2:])
                else:
                  labels.append(parts[-1])
    if len(sentence) > 0:
        data.append((sentence, labels))
    return data
def predict_ner(sentence, word_to_index, label_to_index, model):
    words = sentence.split()
    indexed_sentence = [word_to_index[word] if word in word_to_index else 0 for word in words]
    padded_sentence = pad_sequences([indexed_sentence], maxlen=max_length, padding='post')
    predictions = model.predict(padded_sentence)
    predicted_labels = np.argmax(predictions, axis=-1)
    predicted_tags = [list(label_to_index.keys())[list(label_to_index.values()).index(pred)] for pred in predicted_labels[0]]
    return list(zip(words, predicted_tags))


train_data = load_data('/HI-Hindi/hi_train.conll')
dev_data = load_data('/HI-Hindi/hi_dev.conll')
test_data = load_data('/HI-Hindi/hi_test.conll')

# Preprocess the data
word_to_index = {}
label_to_index = {}
for sentence, labels in train_data + dev_data + test_data:
    for word in sentence:
        if word not in word_to_index:
            word_to_index[word] = len(word_to_index)
    for label in labels:
        if label not in label_to_index:
            label_to_index[label] = len(label_to_index)

train_sentences = [[word_to_index[word] for word in sentence] for sentence, _ in train_data]
train_labels = [[label_to_index[label] for label in labels] for _, labels in train_data]
dev_sentences = [[word_to_index[word] for word in sentence] for sentence, _ in dev_data]
dev_labels = [[label_to_index[label] for label in labels] for _, labels in dev_data]
test_sentences = [[word_to_index[word] for word in sentence] for sentence, _ in test_data]
test_labels = [[label_to_index[label] for label in labels] for _, labels in test_data]

max_length = max(len(sentence) for sentence in train_sentences + dev_sentences + test_sentences)
num_words = len(word_to_index)
num_labels = len(label_to_index)

train_sentences = pad_sequences(train_sentences, maxlen=max_length, padding='post')
train_labels = pad_sequences(train_labels, maxlen=max_length, padding='post')
train_labels = to_categorical(train_labels, num_classes=num_labels)
dev_sentences = pad_sequences(dev_sentences, maxlen=max_length, padding='post')
dev_labels = pad_sequences(dev_labels, maxlen=max_length, padding='post')
dev_labels = to_categorical(dev_labels, num_classes=num_labels)
test_sentences = pad_sequences(test_sentences, maxlen=max_length, padding='post')
test_labels = pad_sequences(test_labels, maxlen=max_length, padding='post')
test_labels = to_categorical(test_labels, num_classes=num_labels)

# Build the model architecture
model = Sequential()
model.add(Embedding(num_words, 128, input_length=max_length))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dense(num_labels, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(train_sentences, train_labels, validation_data=(dev_sentences, dev_labels), epochs=1, batch_size=32)

# Evaluate the model on the test data
loss, accuracy = model.evaluate(test_sentences, test_labels, batch_size=32)
print('Test loss:', loss)
print('Test accuracy:', accuracy)

# Use the model to make predictions on new data
sentence = input()
output=predict_ner(sentence, word_to_index, label_to_index, model)
print(output)

# Define the fine-grained to coarse-grained tag mapping
fine_to_coarse_map = {
    'Facility': 'LOC',
    'OtherLOC': 'LOC',
    'HumanSettlement': 'LOC',
    'Station': 'LOC',
    'VisualWork': 'CW',
    'MusicalWork': 'CW',
    'WrittenWork': 'CW',
    'ArtWork': 'CW',
    'Software': 'CW',
    'MusicalGRP': 'GRP',
    'PublicCORP': 'GRP',
    'PrivateCORP': 'GRP',
    'AerospaceManufacturer': 'GRP',
    'SportsGRP': 'GRP',
    'CarManufacturer': 'GRP',
    'ORG': 'GRP',
    'Scientist': 'PER',
    'Artist': 'PER',
    'Athlete': 'PER',
    'Politician': 'PER',
    'Cleric': 'PER',
    'SportsManager': 'PER',
    'OtherPER': 'PER',
    'Clothing': 'PROD',
    'Vehicle': 'PROD',
    'Food': 'PROD',
    'Drink': 'PROD',
    'OtherPROD': 'PROD',
    'Medication/Vaccine': 'MED',
    'MedicalProcedure': 'MED',
    'AnatomicalStructure': 'MED',
    'Symptom': 'MED',
    'Disease': 'MED',
    'O': 'O'
}

# Define a function to map the fine-grained tags to coarse-grained tags
def map_tags(predictions):
    return [fine_to_coarse_map[tag] for tag in predictions]

# Make predictions on the test data
predictions = model.predict(test_sentences)
predicted_labels = np.argmax(predictions, axis=-1)

# Convert the predictions to tag labels
predicted_tags_fine = [list(label_to_index.keys())[list(label_to_index.values()).index(pred)] for pred in predicted_labels[0]]
predicted_tags_coarse = map_tags(predicted_tags_fine)

# Convert the true labels to tag labels
true_tags_fine = [list(label_to_index.keys())[list(label_to_index.values()).index(true)] for true in np.argmax(test_labels, axis=-1)[0]]
true_tags_coarse = map_tags(true_tags_fine)

# Compute the accuracy on the test data for both fine-grained and coarse-grained tags
accuracy_fine = sum(np.array(predicted_tags_fine) == np.array(true_tags_fine)) / len(true_tags_fine)
accuracy_coarse = sum(np.array(predicted_tags_coarse) == np.array(true_tags_coarse)) / len(true_tags_coarse)

# Print the results
print('Fine-grained accuracy:', accuracy_fine)
print('Coarse-grained accuracy:', accuracy_coarse)


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Test loss: 0.23447223007678986
Test accuracy: 0.9467990398406982
झियान चीन यूरोपीय में कोर्डोबा है
[('झियान', 'O'), ('चीन', 'O'), ('यूरोपीय', 'O'), ('में', 'O'), ('कोर्डोबा', 'O'), ('है', 'O')]
Fine-grained accuracy: 0.9555555555555556
Coarse-grained accuracy: 0.9555555555555556


Fine-grained accuracy: 0.9264705882352942
Coarse-grained accuracy: 0.9558823529411765
