In [None]:
!pip install keras



In [None]:
!pip install tensorflow


Collecting keras<2.16,>=2.15.0 (from tensorflow)
  Using cached keras-2.15.0-py3-none-any.whl (1.7 MB)
Installing collected packages: keras
  Attempting uninstall: keras
    Found existing installation: keras 3.2.1
    Uninstalling keras-3.2.1:
      Successfully uninstalled keras-3.2.1
Successfully installed keras-2.15.0


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from keras.models import Sequential
from keras.layers import Dense, Embedding, SimpleRNN, GRU, LSTM, Bidirectional, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
file_path = 'urdu-sentiment-corpus-v1.tsv'
df = pd.read_csv(file_path, delimiter='\t')
print(df.head())

                                               Tweet Class
0  میں نے ایٹم بم بنایا ھے ۔۔۔۔او بھائی ایٹم بمب ...     P
1  چندے سے انقلاب اور عمران خان وزیر اعظم نہیں بن...     N
2                           ٹویٹر کا خیال کیسے آیا ؟     O
3  سرچ انجن گوگل کے نائب صدر نے فضا میں ، 130,000...     P
4    ابھی تک اسکی لہریں کبھی کبھی آ جاتی ہیں یار :أْ     P


In [None]:
df['Tweet'] = df['Tweet'].str.lower()
print(df.head())

                                               Tweet Class
0  میں نے ایٹم بم بنایا ھے ۔۔۔۔او بھائی ایٹم بمب ...     P
1  چندے سے انقلاب اور عمران خان وزیر اعظم نہیں بن...     N
2                           ٹویٹر کا خیال کیسے آیا ؟     O
3  سرچ انجن گوگل کے نائب صدر نے فضا میں ، 130,000...     P
4    ابھی تک اسکی لہریں کبھی کبھی آ جاتی ہیں یار :أْ     P


In [None]:
X = df['Tweet']
y = df['Class']

In [None]:
y_binary = np.where(y == 'P', 1, 0)

In [None]:
if np.any((y_binary != 0) & (y_binary != 1)):
    raise ValueError("Labels contain mixed or unknown targets.")

In [None]:
missing_labels = np.isnan(y_binary)
if np.any(missing_labels):
    raise ValueError("Missing values found in labels.")

In [None]:
y_binary = np.expand_dims(y_binary, axis=-1)

In [None]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
max_sequence_length = 100
X = pad_sequences(X, maxlen=max_sequence_length)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.25, random_state=42)

In [None]:
def create_model(model_type, num_layers, dropout_rate):
    model = Sequential()
    model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=100, input_length=max_sequence_length))
    if model_type == 'RNN':
        for _ in range(num_layers):
            model.add(SimpleRNN(64, return_sequences=True))
    elif model_type == 'GRU':
        for _ in range(num_layers):
            model.add(GRU(64, return_sequences=True))
    elif model_type == 'LSTM':
        for _ in range(num_layers):
            model.add(LSTM(64, return_sequences=True))
    elif model_type == 'BiLSTM':
        for _ in range(num_layers):
            model.add(Bidirectional(LSTM(64, return_sequences=True)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))
    return model

In [None]:
results = []
num_layers_options = [2, 3]
dropout_rate_options = [0.3, 0.7]

In [None]:
for model_type in ['RNN', 'GRU', 'LSTM', 'BiLSTM']:
    for num_layers in num_layers_options:
        for dropout_rate in dropout_rate_options:
            model = create_model(model_type, num_layers, dropout_rate)
            model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
            model.fit(X_train, y_train, epochs=10, batch_size=64, verbose=0)
            y_pred_prob = model.predict(X_test)
            y_pred_mean = np.mean(y_pred_prob, axis=1)
            y_pred = (y_pred_mean > 0.5).astype(int)
            if y_test.shape != y_pred.shape:
                raise ValueError("Shapes of y_test and y_pred do not match.")
            if np.isnan(y_test).any():
                raise ValueError("Missing values found in y_test.")
            if y_test.shape != y_pred.shape:
                raise ValueError("Shapes of y_test and y_pred do not match.")
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            results.append({
                'Model': model_type,
                'Num Layers': num_layers,
                'Dropout Rate': dropout_rate,
                'Accuracy': accuracy,
                'Precision': precision,
                'Recall': recall,
                'F1-score': f1
            })



  _warn_prf(average, modifier, msg_start, len(result))




In [None]:
results_df = pd.DataFrame(results)
print(results_df)

     Model  Num Layers  Dropout Rate  Accuracy  Precision    Recall  F1-score
0      RNN           2           0.3     0.512   1.000000  0.039370  0.075758
1      RNN           2           0.7     0.552   0.532751  0.960630  0.685393
2      RNN           3           0.3     0.492   0.000000  0.000000  0.000000
3      RNN           3           0.7     0.500   1.000000  0.015748  0.031008
4      GRU           2           0.3     0.512   0.510288  0.976378  0.670270
5      GRU           2           0.7     0.616   0.618321  0.637795  0.627907
6      GRU           3           0.3     0.596   0.569149  0.842520  0.679365
7      GRU           3           0.7     0.580   0.677419  0.330709  0.444444
8     LSTM           2           0.3     0.572   0.621951  0.401575  0.488038
9     LSTM           2           0.7     0.564   0.673077  0.275591  0.391061
10    LSTM           3           0.3     0.512   0.608696  0.110236  0.186667
11    LSTM           3           0.7     0.556   0.766667  0.181

**QUESTION-2**

In [None]:
!pip install --upgrade keras



In [None]:
!pip install tensorflow


Collecting keras<2.16,>=2.15.0 (from tensorflow)
  Using cached keras-2.15.0-py3-none-any.whl (1.7 MB)
Installing collected packages: keras
  Attempting uninstall: keras
    Found existing installation: keras 3.2.1
    Uninstalling keras-3.2.1:
      Successfully uninstalled keras-3.2.1
Successfully installed keras-2.15.0


In [None]:
from gensim.models import KeyedVectors
from keras.preprocessing.sequence import pad_sequences

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from gensim.models import KeyedVectors
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [None]:
word2vec_model_path = '/content/drive/MyDrive/GoogleNews-vectors-negative300.bin'
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_model_path, binary=True, limit=20000)

In [None]:
file_path = 'urdu-sentiment-corpus-v1.tsv'
df = pd.read_csv(file_path, delimiter='\t')

In [None]:
X = df['Tweet']
y = df['Class']
y_binary = np.where(y == 'P', 1, 0)

In [None]:
max_sequence_length = 100
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)

In [None]:
X_padded = pad_sequences(X_sequences, maxlen=max_sequence_length)
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_binary, test_size=0.25, random_state=42)

In [None]:
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, 300))
for word, i in tokenizer.word_index.items():
    if word in word2vec_model:
        embedding_matrix[i] = word2vec_model[word]

In [None]:
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=300, weights=[embedding_matrix],
                    input_length=max_sequence_length, trainable=False))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.7))
model.add(LSTM(64))
model.add(Dropout(0.7))
model.add(Dense(1, activation='sigmoid'))

In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=64, verbose=1)
y_pred_prob = model.predict(X_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
threshold = 0.5
y_pred = (y_pred_prob > threshold).astype(int)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
results = []
for threshold in np.arange(0.1, 0.5, 0.1):
    y_pred = (y_pred_prob > threshold).astype(int)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results.append({'Threshold': threshold, 'Precision': precision, 'Recall': recall, 'F1-score': f1})

In [None]:
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
results_df = pd.DataFrame(results)
print(results_df)

Accuracy: 0.492
Precision: 0.508
Recall: 1.0
F1-score: 0.6737400530503979
   Threshold  Precision  Recall  F1-score
0        0.1      0.508     1.0   0.67374
1        0.2      0.508     1.0   0.67374
2        0.3      0.508     1.0   0.67374
3        0.4      0.508     1.0   0.67374


In [None]:
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index


In [None]:
glove_embeddings_path = '/content/drive/MyDrive/glove.6B.300d.txt'
glove_embeddings = load_glove_embeddings(glove_embeddings_path)

In [None]:
file_path = 'urdu-sentiment-corpus-v1.tsv'
df = pd.read_csv(file_path, delimiter='\t')

In [None]:
X = df['Tweet']
y = df['Class']

In [None]:
y_binary = np.where(y == 'P', 1, 0)

In [None]:
max_sequence_length = 100
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)

In [None]:
X_padded = pad_sequences(X_sequences, maxlen=max_sequence_length)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_binary, test_size=0.25, random_state=42)

In [None]:
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, 300))
for word, i in tokenizer.word_index.items():
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=300, weights=[embedding_matrix], input_length=max_sequence_length, trainable=False))
model.add(LSTM(64, return_sequences=True))  # First LSTM layer
model.add(Dropout(0.7))
model.add(LSTM(64))  # Second LSTM layer
model.add(Dropout(0.7))
model.add(Dense(1, activation='sigmoid'))

In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model.fit(X_train, y_train, epochs=10, batch_size=64, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7e1f05fdbd60>

In [None]:
y_pred_prob = model.predict(X_test)



In [None]:
threshold = 0.5
y_pred = (y_pred_prob > threshold).astype(int)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [None]:
results = []
for threshold in np.arange(0.1, 0.6, 0.1):
    y_pred = (y_pred_prob > threshold).astype(int)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results.append({'Threshold': threshold, 'Precision': precision, 'Recall': recall, 'F1-score': f1})

In [None]:
print("Evaluation metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
results_df = pd.DataFrame(results)
print(results_df)


Evaluation metrics:
Accuracy: 0.496
Precision: 0.5454545454545454
Recall: 0.047244094488188976
F1-score: 0.08695652173913043
   Threshold  Precision    Recall  F1-score
0        0.1   0.508000  1.000000  0.673740
1        0.2   0.508000  1.000000  0.673740
2        0.3   0.508000  1.000000  0.673740
3        0.4   0.508696  0.921260  0.655462
4        0.5   0.545455  0.047244  0.086957


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from gensim.models import KeyedVectors
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
fasttext_model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/wiki-news-300d-1M.vec', binary=False, limit=20000)

In [None]:
file_path = 'urdu-sentiment-corpus-v1.tsv'
df = pd.read_csv(file_path, delimiter='\t')

In [None]:
X = df['Tweet']
y = df['Class']
y_binary = np.where(y == 'P', 1, 0)

In [None]:
max_sequence_length = 100
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_sequences, maxlen=max_sequence_length)
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_binary, test_size=0.25, random_state=42)

In [None]:
embedding_dim = 300
num_words = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in fasttext_model:
        embedding_matrix[i] = fasttext_model[word]

In [None]:
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=300, weights=[embedding_matrix], input_length=max_sequence_length, trainable=False))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.7))
model.add(LSTM(64))
model.add(Dropout(0.7))
model.add(Dense(1, activation='sigmoid'))

In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=64, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7e1f05fa8040>

In [None]:
y_pred_prob = model.predict(X_test)



In [None]:
threshold = 0.5
y_pred = (y_pred_prob > threshold).astype(int)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [None]:
print("Evaluation metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Evaluation metrics:
Accuracy: 0.496
Precision: 1.0
Recall: 0.007874015748031496
F1-score: 0.015625


In [None]:
results = []
for threshold in np.arange(0.1, 0.6, 0.1):
    y_pred = (y_pred_prob > threshold).astype(int)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results.append({'Threshold': threshold, 'Precision': precision, 'Recall': recall, 'F1-score': f1})

In [None]:
results_df = pd.DataFrame(results)
print(results_df)


   Threshold  Precision    Recall  F1-score
0        0.1      0.508  1.000000  0.673740
1        0.2      0.508  1.000000  0.673740
2        0.3      0.508  1.000000  0.673740
3        0.4      0.508  1.000000  0.673740
4        0.5      1.000  0.007874  0.015625


In [None]:
#ELMO
!pip install allennlp




In [None]:
!pip show allennlp


Name: allennlp
Version: 2.10.1
Summary: An open-source NLP research library, built on PyTorch.
Home-page: https://github.com/allenai/allennlp
Author: Allen Institute for Artificial Intelligence
Author-email: allennlp@allenai.org
License: Apache
Location: /usr/local/lib/python3.10/dist-packages
Requires: base58, cached-path, dill, fairscale, filelock, h5py, huggingface-hub, jsonnet, lmdb, more-itertools, nltk, numpy, protobuf, pytest, requests, sacremoses, scikit-learn, scipy, sentencepiece, spacy, tensorboardX, termcolor, torch, torchvision, tqdm, traitlets, transformers, typer, wandb
Required-by: 


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from allennlp.modules.elmo import Elmo, batch_to_ids
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten

In [None]:
file_path = 'urdu-sentiment-corpus-v1.tsv'
df = pd.read_csv(file_path, delimiter='\t')

In [None]:
X = df['Tweet']
y = df['Class']
y_binary = np.where(y == 'P', 1, 0)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.25, random_state=42)

In [None]:
options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
elmo = Elmo(options_file, weight_file, num_output_representations=1, dropout=0)

In [None]:
def get_elmo_embeddings(sentences, max_length=50):
    character_ids = batch_to_ids([sentence.split() for sentence in sentences])
    embeddings = elmo(character_ids)['elmo_representations'][0].detach().numpy()
    padded_embeddings = np.zeros((len(sentences), max_length, embeddings.shape[2]))
    for i, embedding in enumerate(embeddings):
        if embedding.shape[0] < max_length:
            padded_embeddings[i, :embedding.shape[0], :] = embedding
        else:
            padded_embeddings[i, :, :] = embedding[:max_length, :]
    return padded_embeddings

In [None]:
X_train_elmo = get_elmo_embeddings(X_train)
X_test_elmo = get_elmo_embeddings(X_test)

In [None]:
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train_elmo.shape[1], X_train_elmo.shape[2])))
model.add(Dropout(0.7))
model.add(Flatten())  # Add Flatten layer to convert 3D output to 2D
model.add(Dense(1, activation='sigmoid'))

In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model.fit(X_train_elmo, y_train, epochs=10, batch_size=64, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7c05f93e89d0>

In [None]:
loss, accuracy = model.evaluate(X_test_elmo, y_test, verbose=0)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.5239999890327454


In [None]:
y_pred_prob = model.predict(X_test_elmo)
y_pred = (y_pred_prob > 0.5).astype(int)



In [None]:
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [None]:
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Precision: 0.532258064516129
Recall: 0.5196850393700787
F1-score: 0.5258964143426295
