<a href="https://colab.research.google.com/github/harshitEbPandey/transferLearningNER/blob/mlp_models/MLP_hindi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
from datasets import load_dataset

In [None]:
!pip install fasttext

In [None]:
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.utils import to_categorical
from keras import optimizers
from keras import regularizers
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import keras
import fasttext
from tensorflow import random
from sklearn.metrics import f1_score, classification_report, ConfusionMatrixDisplay

### Tokenizing and saving txt (X_train_hindi, Y_train_hindi)

In [None]:
# Load the Hindi ('hi') configuration of the ai4bharat/naamapadam NER dataset.
from datasets import load_dataset

hi_ner = load_dataset(path='ai4bharat/naamapadam', name='hi')

In [None]:
# Analyse the Hindi dataset and convert every split to pandas.
# `hi_ner.map()` with no function only applies the identity transform to every
# example, so the loaded DatasetDict is used directly instead of re-processing it.
hindi_ds = hi_ner

# (rows, columns) of each split
for split_label, split_key in (('Train', 'train'), ('Test', 'test'), ('Validation', 'validation')):
    print(f"{split_label}: {hindi_ds[split_key].shape}")

# one pandas DataFrame per split
train_hindi_ds = hindi_ds['train'].to_pandas()
test_hindi_ds = hindi_ds['test'].to_pandas()
vali_hindi_ds = hindi_ds['validation'].to_pandas()

In [None]:
# Flatten the training sentences into two parallel text files, one entry per line:
#   X_train_hindi_ds.txt -> every token of every sentence (`tokens`)
#   Y_train_hindi_ds.txt -> the matching tag of every token (`ner_tags`)
# Line k of the tag file therefore labels line k of the token file.
import pandas as pd

train_txt_dir = '/content/drive/MyDrive/Spring 2023/CSE 572 data mining/Project/Final'

# Explicit UTF-8: the tokens are Devanagari text and must not depend on the
# platform's default encoding.
with open(train_txt_dir + '/X_train_hindi_ds.txt', 'w', encoding='utf-8') as f1, \
     open(train_txt_dir + '/Y_train_hindi_ds.txt', 'w', encoding='utf-8') as f2:
    # Iterate the two columns directly; building a Series per row with
    # iterrows() is needlessly slow for a frame of this size.
    for sentence_tokens, sentence_tags in zip(train_hindi_ds['tokens'], train_hindi_ds['ner_tags']):
        f1.write('\n'.join(sentence_tokens) + '\n')
        f2.write('\n'.join(map(str, sentence_tags)) + '\n')

### Loading the tokens and tags from txt

In [None]:
# Google Drive folder that holds every artefact of this notebook
# (token/tag text files, fastText model, vector chunks, saved MLPs).
file_loc = (
    '/content/drive/MyDrive/Spring 2023/'
    'CSE 572 data mining/Project/Final'
)

In [None]:
# All tokens and tags are pre-saved as text files (one entry per line).
# Read the training tokens back; they feed the unique-word dictionary below.
# Explicit UTF-8 so the Devanagari tokens decode the same on every platform.
with open(file_loc + '/X_train_hindi_ds.txt', 'r', encoding='utf-8') as f:
    words = f.read().splitlines()

In [None]:
# Read the training tags (one per line, parallel to `words`).
# Explicit UTF-8 keeps this reader consistent with the token reader.
with open(file_loc + '/Y_train_hindi_ds.txt', 'r', encoding='utf-8') as f:
    tags = f.read().splitlines()

In [None]:
# Sanity check: every token line must have exactly one tag line.
len(words) == len(tags)

True

In [None]:
# Map every distinct token to the list of distinct tags it was seen with,
# keeping the tags in first-seen order.
word_tag_unique = {}
for position in range(len(words)):
    seen_tags = word_tag_unique.setdefault(words[position], [])
    current_tag = tags[position]
    if current_tag not in seen_tags:
        seen_tags.append(current_tag)

In [None]:
# Write every distinct training token on its own line; this file is the
# training corpus handed to fastText below.
# Explicit UTF-8 so the Devanagari tokens are not written in a
# platform-dependent encoding.
with open(file_loc + '/train_hindi_unique_tokens.txt', 'w', encoding='utf-8') as f:
    f.writelines(word + '\n' for word in word_tag_unique)

### Training fastText on the unique words (embedding dimension = 30)

In [None]:
# Train an unsupervised skip-gram fastText model (30-dimensional vectors)
# on the unique-token file.
# NOTE(review): that file holds one token per line, so each "sentence" is a
# single word and the skip-gram window has no neighbouring tokens to learn
# from - confirm this is intended.
model = fasttext.train_unsupervised(
    file_loc + '/train_hindi_unique_tokens.txt',
    model='skipgram',
    lr=0.05,
    dim=30,
    ws=5,
    epoch=5,
)

In [None]:
# Persist the trained fastText model next to the other artefacts.
fasttext_model_path = file_loc + '/fasttext_model_train_hindi_unique_tokens.bin'
model.save_model(fasttext_model_path)

In [None]:
# Reload the saved fastText model so later cells can start from the file on
# disk instead of retraining.
fasttext_model_path = file_loc + '/fasttext_model_train_hindi_unique_tokens.bin'
model = fasttext.load_model(fasttext_model_path)



In [None]:
# Every word should now have an embedding;
# look one up with model[word].

### Loading the training set

In [None]:
# Build X_train_df: one row per training token, indexed by its position in
# `words`, with the token text in the 'tokens' column.
word_dict = dict(enumerate(words))
X_train_df = pd.DataFrame.from_dict(data=word_dict, orient='index', columns=['tokens'])

In [None]:
# Build Y_train_df: one row per training tag, indexed by its position in
# `tags`, with the tag string in the 'tags' column.
tag_dict = dict(enumerate(tags))
Y_train_df = pd.DataFrame.from_dict(data=tag_dict, orient='index', columns=['tags'])

### Saving word vectors in chunks

In [None]:
def process_chunk(start_idx, end_idx, df, embedder=None, out_dir=None):
    """Embed one slice of tokens and save it as a CSV chunk.

    Rows ``start_idx`` through ``end_idx`` (both inclusive, selected by index
    label) of ``df['tokens']`` are looked up in the embedding model and written
    to ``<out_dir>/hindi_lite_vectors_{start_idx}_{end_idx}.csv`` with the
    columns ``tokens`` and ``vectors``. Each vector is stored as the string
    form of the value returned by the model.

    Parameters
    ----------
    start_idx, end_idx : int
        First and last index label of the chunk (inclusive).
    df : pandas.DataFrame
        Frame with a ``tokens`` column.
    embedder : optional
        Object supporting ``embedder[word]``. Defaults to the notebook-level
        fastText ``model``.
    out_dir : str, optional
        Output folder. Defaults to ``file_loc + '/vectors'``. It is created
        when missing.

    Raises
    ------
    ValueError
        If the slice does not contain exactly ``end_idx - start_idx + 1`` rows,
        i.e. the requested range is not fully present in ``df``.
    """
    import os  # local import so this cell does not depend on the import cell

    if embedder is None:
        embedder = model  # notebook-level fastText model
    if out_dir is None:
        out_dir = file_loc + '/vectors'

    # Get the words for the chunk (.loc slicing includes both endpoints)
    tokens = df.loc[start_idx:end_idx, 'tokens'].tolist()

    # Fail before embedding anything if the file name would claim rows that
    # are not actually in the frame.
    expected_rows = max(0, end_idx - start_idx + 1)
    if len(tokens) != expected_rows:
        raise ValueError(
            f'Expected {expected_rows} rows for {start_idx}..{end_idx}, found {len(tokens)}'
        )

    # Get the vectors for the words using the embedding model
    vectors = [embedder[token] for token in tokens]

    # The row index is not written to the CSV, so none is attached here.
    vectors_df = pd.DataFrame({'tokens': tokens, 'vectors': vectors})

    # Save the chunk to a CSV file, creating the folder on first use
    os.makedirs(out_dir, exist_ok=True)
    chunk_path = os.path.join(out_dir, f'hindi_lite_vectors_{start_idx}_{end_idx}.csv')
    vectors_df.to_csv(chunk_path, index=False)

In [None]:
# Show (rows, columns) of the training-token frame; the row count drives the
# chunk loop below.
X_train_df.shape

In [None]:
# Embed the whole training frame in chunks and save each chunk to CSV.
# NOTE(review): end_idx is inclusive and equals the next chunk's start_idx, so
# the row on every chunk boundary is saved in two files. The training loops
# rebuild these exact file names, so the bounds are left unchanged here.
chunk_size = 1000000
last_row = len(X_train_df) - 1
for start_idx in range(0, len(X_train_df), chunk_size):
    end_idx = min(start_idx + chunk_size, last_row)
    process_chunk(start_idx, end_idx, X_train_df)
    # Print progress
    print(f'Saved vectors for words {start_idx} to {end_idx} to CSV file.')

In [None]:
# Encode the tag strings as integer class ids; the fitted encoder is kept so
# the test tags can be transformed with the same mapping later.
label_encoder = LabelEncoder()
Y_train_hindi_enc = label_encoder.fit_transform(Y_train_df.tags.values)

In [None]:
# Weight each class by (total samples / samples in that class), rounded:
# the more records a class has, the smaller its weight (balances the dataset).
unique, counts = np.unique(Y_train_hindi_enc, return_counts=True)
inverse_frequency = np.round(counts.sum() / counts)
class_weights = {label: weight for label, weight in zip(unique, inverse_frequency)}

print(class_weights)

{0: 1.0, 1: 29.0, 2: 31.0, 3: 32.0, 4: 27.0, 5: 30.0, 6: 106.0}


In [None]:
# Number of encoded training labels (one per training token).
Y_train_hindi_enc.shape

(22029408,)

### Training MLP Classifier

In [None]:
# List the distinct encoded class ids present in the training labels.
np.unique(Y_train_hindi_enc)

array([0, 1, 2, 3, 4, 5, 6])

In [None]:
# MLP input width (matches the fastText dim=30 used above) and the number of
# output classes (encoded labels 0-6).
num_features, num_classes = 30, 7
print(num_features)

30


In [None]:
# Set the random seeds (NumPy and TensorFlow; `random` here is the module
# imported via `from tensorflow import random`) before any layer is created,
# so the model is built the same way on every run.
np.random.seed(42)
random.set_seed(42)

# Single-hidden-layer MLP: num_features inputs -> 100 ReLU units -> softmax
# over num_classes. Labels are integer class ids, hence the sparse
# categorical cross-entropy loss.
mlp = Sequential()
mlp.add(Dense(units=100, activation='relu', input_dim=num_features))
mlp.add(Dense(units=num_classes, activation='softmax'))
mlp.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

### Load test data

In [None]:
# Load the pre-trained fastText model
model = fasttext.load_model(file_loc + '/fasttext_model_train_hindi_unique_tokens.bin')

# Read the test tokens (one per line, blank lines dropped).
# Explicit UTF-8 so the Devanagari tokens decode the same on every platform.
with open(file_loc + '/X_test_hindi_ds.txt', 'r', encoding='utf-8') as f:
    words = [line.strip() for line in f if line.strip()]

# Read the test tags the same way.
with open(file_loc + '/Y_test_hindi_ds.txt', 'r', encoding='utf-8') as f:
    test_tags = [line.strip() for line in f if line.strip()]

# Blank lines are filtered independently in the two files, so verify that the
# tokens and tags are still aligned before any vectors are written.
if len(words) != len(test_tags):
    raise ValueError(
        f'Test tokens ({len(words)}) and test tags ({len(test_tags)}) are not aligned'
    )

# Create a dictionary with the words as values and integer indices as keys
word_dict = {i: word for i, word in enumerate(words)}

# Create a dataframe from the dictionary
df_test = pd.DataFrame.from_dict(word_dict, orient='index', columns=['tokens'])

# The test set is small enough to embed and save as a single chunk
start_idx = 0
end_idx = len(df_test) - 1
process_chunk(start_idx, end_idx, df_test)
# Print progress
print(f'Saved test vectors for words {start_idx} to {end_idx} to CSV file.')

# get the labels in another dataframe
tag_dict = {i: tag for i, tag in enumerate(test_tags)}

df_test_tags = pd.DataFrame.from_dict(tag_dict, orient='index', columns=['tags'])

# Read the saved test vectors back; the 'vectors' column holds strings at this
# point and is parsed into arrays in the next cell.
X_test = pd.read_csv(file_loc + f'/vectors/hindi_lite_vectors_{start_idx}_{end_idx}.csv')
X_test.head()

In [None]:
# Prepare the test data for prediction.
# NOTE: X_test is rebound from the CSV DataFrame to a NumPy array here, so this
# cell cannot be re-run without first re-running the cell that reads the CSV.

# display names for the classification report, one per encoded class id
target_names = [f'Class {class_id}' for class_id in range(7)]

# encode the test tags with the encoder fitted on the training tags
y_test_hindi_ds = label_encoder.transform(df_test_tags.tags.values)
Y_test = y_test_hindi_ds
y_true_labels = Y_test

# each saved vector is a bracketed, whitespace-separated string: strip the
# brackets, join wrapped lines and parse the numbers
X_test = np.array([
    np.fromstring(cell[1:-1].replace('\n', ' '), sep=' ')
    for cell in X_test.vectors
])

In [None]:
# Training pipeline (in chunks): for every saved vector chunk, load it, fit the
# MLP on it, evaluate on the test set and checkpoint the model.
import os  # local import: only needed to create the checkpoint folder

# Create the checkpoint folder up front so a missing directory cannot make
# mlp.save() fail after a full training pass.
os.makedirs(file_loc + '/models', exist_ok=True)

# NOTE(review): end_idx is inclusive and equals the next chunk's start_idx, so
# the boundary row of every chunk is trained on twice. The bounds are left
# unchanged because they must match the names of the CSV files already saved.
chunk_size = 1000000
for i in range(0, len(X_train_df), chunk_size):
    start_idx = i
    end_idx = min(i + chunk_size, len(X_train_df)-1)
    X_train = pd.read_csv(file_loc + f'/vectors/hindi_lite_vectors_{start_idx}_{end_idx}.csv')
    # vectors are stored as bracketed strings; parse them back into float arrays
    X_train = np.array([np.fromstring(s[1:-1].replace('\n', ' '), sep=' ') for s in X_train.vectors])
    # labels for the same (inclusive) row range
    Y_train = Y_train_hindi_enc[start_idx: end_idx + 1]

    # Print progress (the chunk is read from CSV here, not written)
    print(f'Loaded vectors for words {start_idx} to {end_idx} from CSV file.')

    # train mlp on this chunk; the model keeps its weights between chunks
    mlp.fit(X_train, Y_train, class_weight=class_weights, epochs = 50, batch_size = 200)

    # predict on test
    Y_test_pred = mlp.predict(X_test)

    # get pred labels
    y_pred_labels = np.argmax(Y_test_pred, axis=1)

    # print classification report
    print(classification_report(y_true_labels, y_pred_labels, target_names=target_names))

    # save model (same path every chunk, so the file always holds the latest state)
    mlp.save(file_loc + '/models/mlp_model_hindi_lite_50_epochs.h5')

    # Print progress
    print(f'Completed training with vectors for words {start_idx} to {end_idx}')

### Baseline report 1 (the model from this run was not saved)

```
     Class 0       0.92      0.70      0.80      6996
     Class 1       0.18      0.42      0.25       263
     Class 2       0.21      0.31      0.25       239
     Class 3       0.16      0.40      0.23       257
     Class 4       0.16      0.41      0.23       253
     Class 5       0.27      0.43      0.33       302
     Class 6       0.11      0.32      0.16        95
    accuracy                           0.65      8405
   macro avg       0.28      0.43      0.32      8405
weighted avg       0.80      0.65      0.70      8405
```
Completed training with vectors for words 7000000 to 8000000 (200 epochs each)

In [None]:
# Retrain on the final chunk only (rows 22,000,000 onward, roughly the last
# 29,000 tokens) for 150 epochs, then evaluate and save under a separate name.
import os  # local import: only needed to create the checkpoint folder

# Create the checkpoint folder up front so a missing directory cannot make
# mlp.save() fail after the training pass.
os.makedirs(file_loc + '/models', exist_ok=True)

chunk_size = 1000000
for i in range(22000000, len(X_train_df), chunk_size):
    start_idx = i
    end_idx = min(i + chunk_size, len(X_train_df)-1)
    X_train = pd.read_csv(file_loc + f'/vectors/hindi_lite_vectors_{start_idx}_{end_idx}.csv')
    # vectors are stored as bracketed strings; parse them back into float arrays
    X_train = np.array([np.fromstring(s[1:-1].replace('\n', ' '), sep=' ') for s in X_train.vectors])
    # labels for the same (inclusive) row range
    Y_train = Y_train_hindi_enc[start_idx: end_idx + 1]

    # Print progress (the chunk is read from CSV here, not written)
    print(f'Loaded vectors for words {start_idx} to {end_idx} from CSV file.')

    # train mlp on this chunk
    mlp.fit(X_train, Y_train, class_weight=class_weights, epochs = 150, batch_size = 200)

    # predict on test
    Y_test_pred = mlp.predict(X_test)

    # get pred labels
    y_pred_labels = np.argmax(Y_test_pred, axis=1)

    # print classification report
    print(classification_report(y_true_labels, y_pred_labels, target_names=target_names))

    # save model
    mlp.save(file_loc + '/models/mlp_model_hindi_lite_50_epochs_retrain.h5')

    # Print progress
    print(f'Completed training with vectors for words {start_idx} to {end_idx}')

### Final good baseline: 50 epochs per chunk, then the final ~29,000 tokens retrained for 150 epochs:
```
              precision    recall  f1-score   support

     Class 0       0.92      0.71      0.80      6996
     Class 1       0.18      0.38      0.25       263
     Class 2       0.19      0.31      0.23       239
     Class 3       0.13      0.47      0.20       257
     Class 4       0.18      0.36      0.24       253
     Class 5       0.26      0.41      0.32       302
     Class 6       0.15      0.29      0.20        95

    accuracy                           0.65      8405
   macro avg       0.29      0.42      0.32      8405
weighted avg       0.80      0.65      0.71      8405
```

In [None]:
# Test evaluation on labels 1 - 6 only: reload the retrained MLP from Drive.
# NOTE(review): this reads from the 'models (1)' folder, while the training
# cells save under file_loc + '/models' - confirm this is the intended copy.
retrained_mlp_path = (
    '/content/drive/MyDrive/Spring 2023/CSE 572 data mining/Project/Final'
    '/models (1)/mlp_model_hindi_lite_50_epochs_retrain.h5'
)
mlp = keras.models.load_model(retrained_mlp_path)

In [None]:
def eval_results_hmm(y_test, y_pred, clf_report=True, conf_matrix=False, exclude_0=True,
                     model_name="HMM", num_classes=7):
    """
        Report evaluation metrics for an NER model.

        Prints the weighted F1 score and, optionally, a classification report
        and a confusion matrix, restricted to the selected class ids.

        y_test      : true integer class ids
        y_pred      : predicted integer class ids
        clf_report  : also print sklearn's classification report
        conf_matrix : also plot the confusion matrix
        exclude_0   : evaluate classes 1..num_classes-1 only (skip class 0)
        model_name  : label printed in the F1 line; defaults to "HMM" so the
                      output of existing calls is unchanged
        num_classes : total number of class ids (labels 0..num_classes-1)
    """
    first_label = 1 if exclude_0 else 0
    labels = np.arange(first_label, num_classes)

    # zero_division=0 matches the classification report below and avoids
    # warnings for classes that are never predicted.
    weighted_f1 = f1_score(y_test, y_pred, average='weighted', labels=labels, zero_division=0)
    print("Weighted F1 of {} = {:.4f}".format(model_name, weighted_f1))

    if clf_report:
        print(classification_report(y_test, y_pred, labels=labels, digits=3, zero_division=0))
    if conf_matrix:
        ConfusionMatrixDisplay.from_predictions(y_test, y_pred, labels=labels)
        plt.show()
    print("================================================================\n")

In [None]:
# Read the test tokens (one per line, blank lines dropped).
# Explicit UTF-8 so the Devanagari tokens decode the same on every platform.
with open(file_loc + '/X_test_hindi_ds.txt', 'r', encoding='utf-8') as f:
    words = [line.strip() for line in f if line.strip()]

# Read the matching test tags the same way.
with open(file_loc + '/Y_test_hindi_ds.txt', 'r', encoding='utf-8') as f:
    test_tags = [line.strip() for line in f if line.strip()]

In [None]:
# Reload the fastText model saved earlier so the evaluation section can run
# from the artefacts on disk.
fasttext_model_path = file_loc + '/fasttext_model_train_hindi_unique_tokens.bin'
model = fasttext.load_model(fasttext_model_path)



In [None]:
# The test vectors were saved as a single chunk covering rows 0..len(words)-1.
# Derive those bounds from the test tokens loaded above instead of reusing
# whatever start_idx/end_idx the last training loop left behind (that would
# point at a training chunk).
start_idx = 0
end_idx = len(words) - 1
X_test = pd.read_csv(file_loc + f'/vectors/hindi_lite_vectors_{start_idx}_{end_idx}.csv')

# Encode the test tags as integer class ids.
# NOTE(review): the encoder is refitted on the test tags, so the ids match the
# training ids only if the test set contains every class - confirm.
label_encoder = LabelEncoder()
label_encoder.fit(test_tags)
test_tags = label_encoder.transform(test_tags)

# preparing data for prediction

# display names for the classification report, one per encoded class id
target_names = ['Class 0', 'Class 1', 'Class 2', 'Class 3', 'Class 4', 'Class 5', 'Class 6']

y_test_hindi_ds = test_tags

# vectors are stored as bracketed strings; parse them back into float arrays
X_test = np.array([np.fromstring(s[1:-1].replace('\n', ' '), sep=' ') for s in X_test.vectors])

Y_test = y_test_hindi_ds

y_true_labels = Y_test

In [None]:
 # predict on test
Y_test_pred = mlp.predict(X_test)

# get pred labels
y_pred_labels = np.argmax(Y_test_pred, axis=1)

# print classification report
print(classification_report(y_true_labels, y_pred_labels, target_names=target_names))


              precision    recall  f1-score   support

     Class 0       0.92      0.71      0.80      6996
     Class 1       0.18      0.38      0.25       263
     Class 2       0.19      0.31      0.23       239
     Class 3       0.13      0.47      0.20       257
     Class 4       0.18      0.36      0.24       253
     Class 5       0.26      0.41      0.32       302
     Class 6       0.15      0.29      0.20        95

    accuracy                           0.65      8405
   macro avg       0.29      0.42      0.32      8405
weighted avg       0.80      0.65      0.71      8405



In [None]:
# Report the metrics again with class 0 excluded (the function's default),
# i.e. over labels 1-6 only.
eval_results_hmm(y_true_labels, y_pred_labels)

Weighted F1 of HMM = 0.2480
              precision    recall  f1-score   support

           1      0.183     0.376     0.246       263
           2      0.187     0.314     0.234       239
           3      0.129     0.471     0.203       257
           4      0.182     0.356     0.241       253
           5      0.262     0.414     0.321       302
           6      0.150     0.295     0.199        95

   micro avg      0.177     0.382     0.242      1409
   macro avg      0.182     0.371     0.241      1409
weighted avg      0.188     0.382     0.248      1409


