In [None]:
# Mount Google Drive (Colab only) so files under /content/drive are readable.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
!pip install transformers

In [None]:
import numpy as np
import pandas as pd
import os, re
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dropout, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from transformers import TFAutoModel, AutoTokenizer

In [None]:
# Load the sentiment corpus from Drive. Column names are supplied explicitly.
# NOTE(review): the file name ("..._en_mr") suggests English/Marathi data
# rather than Kannada -- TODO confirm.
DATASET_CSV = '/content/drive/MyDrive/processed_sentiment_140_en_mr.csv'
DATASET_COLUMNS = ['polarity', 'id', 'text', 'lang']
data = pd.read_csv(DATASET_CSV, names=DATASET_COLUMNS, encoding='utf-8')

In [None]:
# Pull the text, polarity label and language columns out as plain Python lists.
data_x, data_y, data_lang = (
    data[column].tolist() for column in ('text', 'polarity', 'lang')
)


In [None]:
# Preview the first rows to sanity-check the parsed columns.
data.head()

In [None]:
# Processing label of training/testing data.
# Raw polarity values 0 and 4 are re-indexed to class ids 0 and 1.
senti = [0, 4]
mapping = {senti[x]: x for x in range(len(senti))}

# Integer representation. A NEW array is built instead of writing into
# data['polarity'].values: that array can share memory with the DataFrame, so
# in-place writes would silently alter `data` and make this cell raise a
# KeyError when it is run a second time. An unknown polarity still raises
# KeyError, exactly as before.
label_data = np.array([mapping[polarity] for polarity in data['polarity']])

# Converting to one-hot encoding
y_data = to_categorical(label_data)

In [None]:
# Show the one-hot encoded label of the first sample.
y_data[0]

array([0., 1.], dtype=float32)

In [None]:
# Keep only the rows whose text contains at least one Devanagari letter
# (the character ranges अ-औ and क-ह).
devanagari_re = re.compile(r'[अ-औक-ह]')
language_indices = [
    idx for idx, text in enumerate(data_x) if devanagari_re.search(str(text))
]
data_x_language = [data_x[idx] for idx in language_indices]
data_y_language = [data_y[idx] for idx in language_indices]

# Re-index the raw polarity values (0, 4) of the kept rows to class ids (0, 1).
senti = [0, 4]
mapping = {polarity: class_id for class_id, polarity in enumerate(senti)}
label_data = np.array(data_y_language)
label_data = np.array([mapping[label] for label in label_data])

# One-hot encode the class ids.
y_data = to_categorical(label_data)

# Hold out 20% of the filtered rows as the test split (fixed seed).
raw_docs_train, raw_docs_test, y_train, y_test = train_test_split(
    data_x_language,
    y_data,
    test_size=0.2,
    random_state=4,
)


In [None]:
MAX_NB_WORDS = 100000  # upper bound on the vocabulary size used downstream
tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")

# Each document is split into MuRIL sub-word tokens, which are then re-joined
# with single spaces so a whitespace-based tokenizer can consume them later.
print("Pre-processing train data...")
processed_docs_train = [
    " ".join(tokenizer.tokenize(doc)) for doc in raw_docs_train
]

print("Pre-processing test data...")
processed_docs_test = [
    " ".join(tokenizer.tokenize(doc)) for doc in raw_docs_test
]


In [None]:
# Build an integer vocabulary over the space-joined sub-word strings of both
# splits, then convert every document into a sequence of vocabulary indices.
# NOTE(review): the Keras Tokenizer is created with its default `filters`,
# which presumably strips '#' (and so the '##' sub-word markers) -- TODO confirm.
tokenizer = Tokenizer(oov_token="<OOV>", lower=True, num_words=MAX_NB_WORDS)
all_processed_docs = processed_docs_train + processed_docs_test
tokenizer.fit_on_texts(all_processed_docs)
word_seq_train, word_seq_test = (
    tokenizer.texts_to_sequences(docs)
    for docs in (processed_docs_train, processed_docs_test)
)
word_index = tokenizer.word_index

In [None]:
# Pad sequences.
# The padded length has to cover the longest sequence in EITHER split. Looking
# only at the first sample of each split (as this cell did before) makes the
# length depend on two arbitrary documents and truncates every longer one.
max_seq_len = max(
    max((len(seq) for seq in word_seq_train), default=0),
    max((len(seq) for seq in word_seq_test), default=0),
)
word_seq_train = pad_sequences(word_seq_train, maxlen=max_seq_len)
word_seq_test = pad_sequences(word_seq_test, maxlen=max_seq_len)


In [None]:
# Load the pretrained MuRIL checkpoint: the TensorFlow encoder and its tokenizer.
# `tokenizer` is rebound here to the Hugging Face tokenizer.
model_name = "google/muril-base-cased"
model = TFAutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Build the embedding matrix: one row per vocabulary index, filled with a
# vector derived from the transformer output for that vocabulary entry.
embed_dim = model.config.hidden_size
print('Preparing embedding matrix...')
words_not_found = []
nb_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, embed_dim))
for word, row in word_index.items():
    # Indices beyond the vocabulary cap have no row in the matrix.
    if row < nb_words:
        token_ids = tokenizer.encode(word, add_special_tokens=False)
        if not token_ids:
            # Nothing to embed; the row stays all-zero.
            words_not_found.append(word)
            continue
        # First model output, first batch item (presumably the per-token
        # hidden states -- TODO confirm against the model's output spec).
        hidden_states = model(np.array([token_ids]))[0][0].numpy()
        if hidden_states.ndim != 1:
            # Average over the token axis to get a single vector per word.
            hidden_states = np.mean(hidden_states, axis=0)
        embedding_matrix[row] = hidden_states

# Rows whose components sum to exactly zero are reported as null embeddings.
null_rows = np.sum(embedding_matrix, axis=1) == 0
print('Number of null word embeddings:', np.sum(null_rows))


In [None]:
# Architecture hyperparameters: filters per convolution layer and the L2
# regularisation strength applied to the dense layer of the CNN.
num_filters, weight_decay = 12, 1e-4

## LSTM

In [None]:
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Bidirectional, Dropout

In [None]:
import tensorflow as tf

In [None]:
# Bidirectional LSTM classifier on top of the frozen, precomputed embedding matrix.
lstm_out = 128
num_classes = 2
model_LSTM = Sequential()
# Embedding weights come from `embedding_matrix` and are not trained.
model_LSTM.add(Embedding(nb_words, embed_dim,
          weights=[embedding_matrix], input_length=max_seq_len, trainable=False))
model_LSTM.add(Bidirectional(LSTM(lstm_out, dropout=0.2)))
model_LSTM.add(Dense(128, activation = 'relu'))
model_LSTM.add(Dropout(0.5))
model_LSTM.add(Dense(64, activation = 'relu'))
# Softmax head over one-hot targets: exactly one class per sample (single-label).
model_LSTM.add(Dense(num_classes, activation='softmax'))
adam = tf.optimizers.Adam()
# categorical_crossentropy is the loss that matches a softmax head with one-hot
# labels. For two classes it coincides with the previous binary_crossentropy,
# but it stays correct (loss and reported accuracy) if num_classes is raised.
model_LSTM.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
model_LSTM.summary()

In [None]:
# Training hyperparameters for the LSTM model.
batch_size_LSTM, num_epochs_LSTM = 256, 20

In [None]:
# Location of the LSTM weight checkpoint written during training.
checkpoint_path = "training/LSTM/language_detection/trained_cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Callback that saves the model's weights only (not the full model).
checkpoint_options = dict(filepath=checkpoint_path, save_weights_only=True, verbose=1)
cp_callback = tf.keras.callbacks.ModelCheckpoint(**checkpoint_options)

In [None]:
# Train the LSTM; 10% of the training split is held out for validation and the
# checkpoint callback is attached.
hist_LSTM = model_LSTM.fit(
    word_seq_train,
    y_train,
    epochs=num_epochs_LSTM,
    batch_size=batch_size_LSTM,
    validation_split=0.1,
    shuffle=True,
    callbacks=[cp_callback],
    verbose=1,
)

In [None]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

# Ground truth: turn the one-hot test labels back into class ids.
y_test_labels = np.argmax(y_test, axis=1)

# Predictions: highest-probability class for every test sample.
y_pred = np.argmax(model_LSTM.predict(word_seq_test), axis=1)

# Score the LSTM on the test split.
precision, recall, accuracy, f1 = (
    metric(y_test_labels, y_pred)
    for metric in (precision_score, recall_score, accuracy_score, f1_score)
)

for label, value in (
    ("Precision:", precision),
    ("Recall:", recall),
    ("Accuracy:", accuracy),
    ("F1 Score:", f1),
):
    print(label, value)


muril-lstm-LANGUAGE

## CNN


In [None]:
import tensorflow as tf
print("Training CNN...")
# NOTE: `model` is rebound here from the MuRIL encoder to the CNN classifier,
# so the embedding-matrix cell cannot be re-run after this one without first
# reloading the encoder.
model = Sequential()
# Embedding weights come from `embedding_matrix` and are not trained.
model.add(Embedding(nb_words, embed_dim, weights=[embedding_matrix], input_length=max_seq_len, trainable=False))
model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
model.add(MaxPooling1D(2))
model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(weight_decay)))
model.add(Dense(2, activation='softmax'))  # two-class softmax head (one-hot targets)

# Compile the model. categorical_crossentropy is the loss that matches a softmax
# head with one-hot labels; for two classes it coincides with the previous
# binary_crossentropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:

# Training hyperparameters for the CNN model.
batch_size, num_epochs = 256, 20

In [None]:
# Train the CNN; 10% of the training split is held out for validation.
history = model.fit(
    word_seq_train,
    y_train,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_split=0.1,
    shuffle=True,
)


In [None]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

# Ground truth: turn the one-hot test labels back into class ids.
y_test_labels = np.argmax(y_test, axis=1)

# Predictions: highest-probability class for every test sample.
y_pred = np.argmax(model.predict(word_seq_test), axis=1)

# Score the CNN on the test split.
precision, recall, accuracy, f1 = (
    metric(y_test_labels, y_pred)
    for metric in (precision_score, recall_score, accuracy_score, f1_score)
)

for label, value in (
    ("Precision:", precision),
    ("Recall:", recall),
    ("Accuracy:", accuracy),
    ("F1 Score:", f1),
):
    print(label, value)


MURIL-CNN-LANGUAGE

## GRU

In [None]:
from keras.layers import Dense, Embedding,  GRU, SpatialDropout1D, Bidirectional, Dropout

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense, Dropout

# Bidirectional GRU classifier on top of the frozen, precomputed embedding matrix.
gru_out = 128
num_classes = 2
model_GRU = keras.Sequential()
# Embedding weights come from `embedding_matrix` and are not trained.
model_GRU.add(Embedding(nb_words, embed_dim, weights=[embedding_matrix], input_length=max_seq_len, trainable=False))
model_GRU.add(Bidirectional(GRU(gru_out, dropout=0.2)))
model_GRU.add(Dense(128, activation='relu'))
model_GRU.add(Dropout(0.5))
model_GRU.add(Dense(64, activation='relu'))
# Softmax head over one-hot targets: exactly one class per sample (single-label).
model_GRU.add(Dense(num_classes, activation='softmax'))
adam = tf.optimizers.Adam()
# categorical_crossentropy is the loss that matches a softmax head with one-hot
# labels. For two classes it coincides with the previous binary_crossentropy,
# but it stays correct (loss and reported accuracy) if num_classes is raised.
model_GRU.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
model_GRU.summary()

In [None]:
# Training hyperparameters for the GRU model.
batch_size, num_epochs = 256, 20

In [None]:
# Location on Drive of the GRU weight checkpoint written during training.
# NOTE(review): the file name is spelled "trainined"; it is kept unchanged so
# any existing checkpoint still resolves.
checkpoint_path = "/content/drive/MyDrive/Twitter_dataset/trainined_gru_cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Callback that saves the model's weights only (not the full model).
checkpoint_options = dict(filepath=checkpoint_path, save_weights_only=True, verbose=1)
cp_callback = tf.keras.callbacks.ModelCheckpoint(**checkpoint_options)

In [None]:
# Train the GRU; 10% of the training split is held out for validation and the
# checkpoint callback is attached.
hist = model_GRU.fit(
    word_seq_train,
    y_train,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_split=0.1,
    shuffle=True,
    callbacks=[cp_callback],
    verbose=1,
)

In [None]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

# Ground truth: turn the one-hot test labels back into class ids.
y_test_labels = np.argmax(y_test, axis=1)

# Predictions: highest-probability class for every test sample.
y_pred = np.argmax(model_GRU.predict(word_seq_test), axis=1)

# Score the GRU on the test split.
precision, recall, accuracy, f1 = (
    metric(y_test_labels, y_pred)
    for metric in (precision_score, recall_score, accuracy_score, f1_score)
)

for label, value in (
    ("Precision:", precision),
    ("Recall:", recall),
    ("Accuracy:", accuracy),
    ("F1 Score:", f1),
):
    print(label, value)


muril-gru-LANGUAGE