In [None]:
!pip install transformers

In [None]:
pip install tensorflow


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer
import os
import re

In [None]:
# Load the raw dataset. Supplying `names` makes pandas treat the first row
# of the file as data and label the four columns explicitly.
# NOTE(review): "path to data" looks like a placeholder — point it at the
# real CSV before running.
data = pd.read_csv(
    r"path to data",
    names=['polarity', 'id', 'text', 'lang'],
    encoding='latin',
)


In [None]:
# Processing label of training/testing data
# Raw polarity codes, listed in the order of their class index (0 -> 0, 4 -> 1).
senti = [0, 4]
mapping = {polarity: class_index for class_index, polarity in enumerate(senti)}

# Integer representation
# Build a NEW array instead of writing into `data['polarity'].values`.
# Writing into that array changes the DataFrame itself, so re-running the
# cell failed with KeyError on the already-mapped value 1.
mapped_polarity = data['polarity'].map(mapping)
unknown_polarity = data.loc[mapped_polarity.isna(), 'polarity'].unique()
if len(unknown_polarity):
    # Keep the original failure mode (KeyError) for labels outside `senti`.
    raise KeyError(f"Unexpected polarity value(s): {unknown_polarity.tolist()}")
label_data = mapped_polarity.to_numpy(dtype='int64')

# Convert to one-hot encoding
# `num_classes` is pinned so y_data always has one column per entry in
# `senti`, even if a class happens to be absent from the data.
y_data = to_categorical(label_data, num_classes=len(senti))

In [None]:
# Text cleaning function
# Compiled once when the cell runs instead of on every call.
# NOTE: the wide U+24C2-U+1F251 and U+10000-U+10FFFF ranges also remove CJK
# and every other character in those spans, not only emoji.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing .. misc symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # enclosed characters .. enclosed ideographs
    "\U0001f926-\U0001f937"  # gesture emoji
    "\U00010000-\U0010ffff"  # all supplementary planes
    "\u2640-\u2642"          # gender signs
    "\u2600-\u2B55"          # misc symbols
    "\u200d"                 # zero-width joiner
    "\u23cf"                 # eject symbol
    "\u2066"                 # left-to-right isolate
    "\u2069"                 # pop directional isolate
    "\u23e9"                 # fast-forward symbol
    "\u231a"                 # watch
    "\ufe0f"                 # variation selector-16
    "\u3030"                 # wavy dash
    "]+",
    flags=re.UNICODE,
)

# Raw string so the backslash is a literal character to strip, not the start
# of an (invalid) escape sequence.
_PUNCTUATION = r'''؛₹|!‼→¿()-[]{};–৷।:،۔•„'“‘—’”"…\,<>=./?@#$%^&*_~»«'''
_PUNCTUATION_TABLE = str.maketrans('', '', _PUNCTUATION)


def text_cleaner(line):
    """Return ``line`` with URLs, emoji/symbols and punctuation removed.

    Steps, in order:
      1. drop every ``http...`` token up to the next whitespace;
      2. drop characters matched by ``_EMOJI_PATTERN``;
      3. drop anything that cannot be encoded as UTF-8;
      4. drop every character listed in ``_PUNCTUATION``;
      5. trim leading/trailing whitespace and collapse runs of spaces.

    Step 5 now always runs. Previously it was nested inside the punctuation
    check, so text without punctuation kept its stray whitespace.

    Args:
        line: the text to clean (a ``str``).

    Returns:
        The cleaned string; may be empty.
    """
    line = re.sub(r'http\S+', '', line)
    line = _EMOJI_PATTERN.sub('', line)
    line = line.encode('utf-8', 'ignore').decode('utf-8', 'ignore')
    # One C-level pass instead of a str.replace per punctuation character.
    line = line.translate(_PUNCTUATION_TABLE)
    line = re.sub(r"^\s+|\s+$", "", line)
    return re.sub(' +', ' ', line)

In [None]:
# Tokenize and convert the processed documents to sequences
data_x = data['text'].tolist()
raw_docs_train, raw_docs_test, y_train, y_test = train_test_split(
    data_x, y_data, test_size=0.2, random_state=4)

MAX_SEQ_LEN = 128

# The tokenizer was only created in a later cell, so a fresh
# "Restart & Run All" hit a NameError here. Load it before first use.
# NOTE(review): this checkpoint's tokenizer presumably needs the
# `sentencepiece` package, which the notebook installs further down —
# consider moving that install to the top.
tokenizer = AutoTokenizer.from_pretrained('ai4bharat/indic-bert')


def _tokenize_docs(docs):
    """Tokenize each document and re-join its tokens with single spaces.

    NOTE(review): the joined token strings are encoded again by the
    tokenizer in the next cell; confirm that re-tokenizing sub-word pieces
    is intended rather than encoding the raw text directly.
    """
    return [" ".join(tokenizer.tokenize(doc)) for doc in tqdm(docs)]


print("Pre-processing train data...")
processed_docs_train = _tokenize_docs(raw_docs_train)

print("Pre-processing test data...")
processed_docs_test = _tokenize_docs(raw_docs_test)

In [None]:
# Encode the processed docs
def _encode_docs(docs):
    """Encode ``docs`` and return ``(input_ids, attention_mask)`` as NumPy arrays.

    Every row is padded/truncated to exactly MAX_SEQ_LEN tokens
    (``padding="max_length"``), so train and test arrays share one width.
    With ``padding=True`` each call padded only to its own longest document.
    """
    encoded = tokenizer.batch_encode_plus(
        docs,
        padding="max_length",
        truncation=True,
        max_length=MAX_SEQ_LEN,
        return_tensors="tf"
    )
    # Read both fields from the encoding BEFORE any name is rebound. The
    # previous code overwrote the encoding with the id array and then tried
    # to index that array with "attention_mask".
    return np.array(encoded["input_ids"]), np.array(encoded["attention_mask"])


word_seq_train, attention_mask_train = _encode_docs(processed_docs_train)
word_seq_test, attention_mask_test = _encode_docs(processed_docs_test)

In [None]:
pip install sentencepiece


In [None]:
from transformers import AutoModel, AutoTokenizer

# Tokenizer and encoder are both loaded from the same checkpoint name.
indic_bert_checkpoint = 'ai4bharat/indic-bert'
tokenizer = AutoTokenizer.from_pretrained(indic_bert_checkpoint)
model = AutoModel.from_pretrained(indic_bert_checkpoint)


In [None]:
# Retrieve embeddings from the Indic-BERT model
import torch

EMBED_BATCH_SIZE = 32


def _embed_in_batches(encoder, input_ids, attention_mask, batch_size=EMBED_BATCH_SIZE):
    """Run ``encoder`` over the inputs in mini-batches and return a NumPy array.

    ``AutoModel`` loads a PyTorch module, so the NumPy id/mask arrays are
    converted to ``torch.long`` tensors first. Inference runs under
    ``torch.no_grad()`` and one batch at a time, so no autograd graph is kept
    and the whole dataset is never pushed through in a single forward pass.

    Args:
        encoder: the loaded transformer model.
        input_ids: 2-D array of token ids, one row per document.
        attention_mask: array of the same shape marking real tokens.
        batch_size: number of documents per forward pass.

    Returns:
        The stacked ``last_hidden_state`` of every batch as one NumPy array.
        The result holds every token vector in memory at once.
    """
    encoder.eval()  # inference mode (e.g. disables dropout)
    device = next(encoder.parameters()).device
    hidden_chunks = []
    with torch.no_grad():
        for start in range(0, len(input_ids), batch_size):
            stop = start + batch_size
            ids = torch.as_tensor(np.asarray(input_ids[start:stop]), dtype=torch.long, device=device)
            mask = torch.as_tensor(np.asarray(attention_mask[start:stop]), dtype=torch.long, device=device)
            hidden = encoder(input_ids=ids, attention_mask=mask).last_hidden_state
            hidden_chunks.append(hidden.cpu().numpy())
    return np.concatenate(hidden_chunks, axis=0)


train_embeddings = _embed_in_batches(model, word_seq_train, attention_mask_train)
test_embeddings = _embed_in_batches(model, word_seq_test, attention_mask_test)


In [None]:
# Model parameters
num_classes = 2       # width of the softmax output layer
weight_decay = 1e-4   # L2 factor applied to the hidden Dense layer
num_filters = 12      # filters in each Conv1D layer

In [None]:

# CNN architecture
# NOTE(review): this rebinds `model`, which previously held the Indic-BERT
# encoder — the embedding cell cannot be re-run after this one without
# reloading the encoder.
print("Training CNN...")
model = Sequential()
# Sequence length is left as None so the network accepts whatever padded
# length the embeddings actually have, rather than assuming MAX_SEQ_LEN.
# Only the feature dimension is fixed.
model.add(Conv1D(num_filters, 7, activation='relu', padding='same',
                 input_shape=(None, train_embeddings.shape[2])))
model.add(MaxPooling1D(2))
model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(weight_decay)))
model.add(Dense(num_classes, activation='softmax'))

adam = tf.optimizers.Adam()
# Softmax output + one-hot targets is a single-label problem, so
# categorical cross-entropy is the matching loss.
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
model.summary()

In [None]:
# Training params
num_epochs = 20
batch_size = 256

# Weights checkpoint location and its parent directory.
checkpoint_path = "/content/drive/MyDrive/Twitter_dataset/trainined_cnn_cp.ckpt"
checkpoint_dir = os.path.split(checkpoint_path)[0]

In [None]:
# Checkpoint callback: writes weights only (not the full model) to
# `checkpoint_path`, logging each save.
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1,
)

# Fit the CNN on the training embeddings; 10% of the training data is held
# out for validation.
fit_options = dict(
    batch_size=batch_size,
    epochs=num_epochs,
    validation_split=0.1,
    shuffle=True,
    verbose=1,
    callbacks=[cp_callback],
)
hist = model.fit(train_embeddings, y_train, **fit_options)

In [None]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

# Make predictions on the test data
# Evaluate the CNN trained above on the same kind of input it was fitted on
# (the test embeddings). The previous code called `model_GRU`, which does
# not exist yet at this point, and fed it raw token ids.
y_pred = model.predict(test_embeddings)
y_pred = np.argmax(y_pred, axis=1)

# Convert one-hot encoded y_test to categorical labels
y_test_labels = np.argmax(y_test, axis=1)

# Calculate precision
precision = precision_score(y_test_labels, y_pred)

# Calculate recall
recall = recall_score(y_test_labels, y_pred)

# Calculate accuracy
accuracy = accuracy_score(y_test_labels, y_pred)

# Calculate F1 score
f1 = f1_score(y_test_labels, y_pred)

print("Precision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)
print("F1 Score:", f1)


indicbert-cnn-language

==================================================LSTM============================

In [None]:
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Bidirectional, Dropout

In [None]:
import tensorflow as tf

In [None]:
# LSTM classifier: frozen embedding layer -> bidirectional LSTM -> dense head.
# NOTE(review): `nb_words`, `embed_dim`, `embedding_matrix` and `max_seq_len`
# are not defined in any cell shown in this notebook — presumably they come
# from another session or notebook; define them before running this cell.
lstm_out = 128
num_classes = 2
model_LSTM = Sequential()
model_LSTM.add(Embedding(nb_words, embed_dim,
          weights=[embedding_matrix], input_length=max_seq_len, trainable=False))
model_LSTM.add(Bidirectional(LSTM(lstm_out, dropout=0.2)))
model_LSTM.add(Dense(128, activation = 'relu'))
model_LSTM.add(Dropout(0.5))
model_LSTM.add(Dense(64, activation = 'relu'))
# Softmax produces one probability distribution over the classes, i.e. a
# single-label prediction (the targets are one-hot, not k-hot).
model_LSTM.add(Dense(num_classes, activation='softmax'))
adam = tf.optimizers.Adam()
# Categorical cross-entropy is the loss that matches softmax + one-hot targets.
model_LSTM.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
model_LSTM.summary()

In [None]:
# Training hyper-parameters for the LSTM run
num_epochs_LSTM = 20
batch_size_LSTM = 256

In [None]:
# Where the LSTM weights are checkpointed, plus the containing directory.
checkpoint_path = "training/LSTM/language_detection/trained_cp.ckpt"
checkpoint_dir = os.path.split(checkpoint_path)[0]

# Callback that stores the model's weights only and logs each save.
checkpoint_options = dict(filepath=checkpoint_path, save_weights_only=True, verbose=1)
cp_callback = tf.keras.callbacks.ModelCheckpoint(**checkpoint_options)

In [None]:
# Train the LSTM on the token-id sequences; 10% of the training data is
# held out for validation and weights are checkpointed via `cp_callback`.
hist_LSTM = model_LSTM.fit(
    word_seq_train,
    y_train,
    batch_size=batch_size_LSTM,
    epochs=num_epochs_LSTM,
    validation_split=0.1,
    shuffle=True,
    verbose=1,
    callbacks=[cp_callback],
)

In [None]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

# Predicted class for each test row = index of the largest model output.
y_pred = np.argmax(model_LSTM.predict(word_seq_test), axis=1)

# y_test is one-hot encoded; recover integer class labels the same way.
y_test_labels = np.argmax(y_test, axis=1)

# Score the predictions against the true labels.
precision = precision_score(y_test_labels, y_pred)
recall = recall_score(y_test_labels, y_pred)
accuracy = accuracy_score(y_test_labels, y_pred)
f1 = f1_score(y_test_labels, y_pred)

print("Precision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)
print("F1 Score:", f1)


indicbert-lstm-LANGUAGE

=================================================================================GRU-=================

In [None]:
from keras.layers import Dense, Embedding,  GRU, SpatialDropout1D, Bidirectional, Dropout

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense, Dropout
# GRU classifier: frozen embedding layer -> bidirectional GRU -> dense head.
# NOTE(review): `nb_words`, `embed_dim`, `embedding_matrix` and `max_seq_len`
# are not defined in any cell shown in this notebook — presumably they come
# from another session or notebook; define them before running this cell.
gru_out = 128
num_classes = 2
model_GRU = keras.Sequential()
model_GRU.add(Embedding(nb_words, embed_dim, weights=[embedding_matrix], input_length=max_seq_len, trainable=False))
model_GRU.add(Bidirectional(GRU(gru_out, dropout=0.2)))
model_GRU.add(Dense(128, activation='relu'))
model_GRU.add(Dropout(0.5))
model_GRU.add(Dense(64, activation='relu'))
# Softmax produces one probability distribution over the classes, i.e. a
# single-label prediction (the targets are one-hot, not k-hot).
model_GRU.add(Dense(num_classes, activation='softmax'))
adam = tf.optimizers.Adam()
# Categorical cross-entropy is the loss that matches softmax + one-hot targets.
model_GRU.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
model_GRU.summary()

In [None]:
# Training hyper-parameters for the GRU run
num_epochs = 20
batch_size = 256

In [None]:
# Where the GRU weights are checkpointed, plus the containing directory.
checkpoint_path = "/content/drive/MyDrive/Twitter_dataset/trainined_gru_cp.ckpt"
checkpoint_dir = os.path.split(checkpoint_path)[0]

# Callback that stores the model's weights only and logs each save.
checkpoint_options = dict(filepath=checkpoint_path, save_weights_only=True, verbose=1)
cp_callback = tf.keras.callbacks.ModelCheckpoint(**checkpoint_options)

In [None]:
# Train the GRU on the token-id sequences; 10% of the training data is
# held out for validation and weights are checkpointed via `cp_callback`.
hist = model_GRU.fit(
    word_seq_train,
    y_train,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_split=0.1,
    shuffle=True,
    verbose=1,
    callbacks=[cp_callback],
)

In [None]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

# Predicted class for each test row = index of the largest model output.
y_pred = np.argmax(model_GRU.predict(word_seq_test), axis=1)

# y_test is one-hot encoded; recover integer class labels the same way.
y_test_labels = np.argmax(y_test, axis=1)

# Score the predictions against the true labels.
precision = precision_score(y_test_labels, y_pred)
recall = recall_score(y_test_labels, y_pred)
accuracy = accuracy_score(y_test_labels, y_pred)
f1 = f1_score(y_test_labels, y_pred)

print("Precision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)
print("F1 Score:", f1)


**indicbert-gru-LANGUAGE**