In [2]:
import pandas as pd
import re
import torch.nn as nn
import torch
from torch.utils.data import Dataset,DataLoader,random_split
import time

In [3]:
PATH_TO_DATASET = "/content/data.csv"

In [4]:
def normalizeString(s):
    re.sub(r'[^\w\s\n]|[\d]', ' ', s)
    return s.strip()

In [5]:
# creating corpus
df = pd.read_csv(PATH_TO_DATASET)
print(f"file read successfully")
df.drop_duplicates(inplace=True)
df.drop(columns=["1","2","3"],axis=1,inplace=True)

corpus = df["Urdu Text Data"].tolist()
tokenized = []

for item in corpus:
    item = normalizeString(item)

    item = item.split(" ")

    tokenized.append(item)

file read successfully


# Using word2vec

In [109]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

import nltk
nltk.download('punkt')  # Download the Punkt tokenizer models
# Define Word2Vec model parameters
vector_size = 128  # Size of the word vectors
window_size = 5  # Maximum distance between the current and predicted word within a sentence
min_count = 1  # Ignores all words with a total frequency lower than this
workers = 2  # Number of CPU cores to use for training

# Train Word2Vec model
word2vec_model = Word2Vec(
    sentences=tokenized,
    vector_size=vector_size,
    window=window_size,
    min_count=min_count,
    workers=workers
)

# Save the trained model
word2vec_model.save("my_word2vec_model")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [110]:
model = Word2Vec.load("./my_word2vec_model")

In [111]:
embedding_matrix = model.wv.vectors
vocab_size, embedding_dim = embedding_matrix.shape

In [112]:
vocab_list = word2vec_model.wv.index_to_key
# Print the list of words
print(vocab_list)

['ہے', 'میں', 'کے', 'کی', 'سے', 'اور', 'تو', 'کا', 'بھی', 'نہیں', '', 'ہیں', 'کو', 'اس', 'بہت', 'آپ', 'یہ', 'ہی', 'کر', 'نے', 'کہ', 'پر', 'کیا', 'ہو', 'ایک', 'گا', 'تھا', 'ہوں', 'کوئی', 'وہ', 'جو', 'اب', 'ان', 'بھائی', 'کچھ', 'نہ', 'اگر', 'بات', 'مجھے', 'لیے', 'رہا', 'یا', 'کرنے', 'سب', 'لیکن', 'شکریہ', 'پاکستان', 'ساتھ', 'جائے', 'پھر', 'فلم', 'جی', 'اللہ', 'گے', 'تک', 'گیا', 'گی', 'تھی', 'رہے', 'ہم', 'اچھی', 'کام', 'لئے', 'کسی', 'اچھا', 'کریں', 'پہلے', 'زیادہ', 'ہوتا', 'دیا', 'جب', 'صرف', 'دیکھ', 'ویسے', 'بعد', 'میچ', 'دو', 'ابھی', 'مگر', 'طرح', 'اردو', 'رہی', 'آج', 'ہوا', 'میرے', 'اپنی', 'ٹی', 'پی', 'جس', 'یہاں', 'کم', 'استعمال', 'اپنے', 'دے', 'ہر', 'کرنا', 'جا', 'صاحب', 'ہوتی', 'کرتے', 'عمدہ', 'پسند', 'خان', 'بس', 'وقت', 'بنا', 'سی', 'والے', 'لیں', 'ٹیم', 'تھے', 'ھے', 'گئی', 'ہوئی', 'زبردست', 'نام', 'آئی', 'میری', 'سکتا', 'خود', 'کافی', 'میرا', 'اسی', 'دی', 'السلام', 'جاتا', 'ہوئے', 'پاکستانی', 'دیکھی', 'دیں', 'کرتا', 'گئے', 'کبھی', 'لے', 'کیوں', 'اسے', 'سکتے', 'اچھے', 'شروع', 'نا',

**some functions and architectures from**: [Sean Robertson](https://github.com/spro)

**inspired from**: [Sean Robertson](https://github.com/spro)

# Model Architecture

In [161]:

class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, word_embeddings,dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding.from_pretrained(word_embeddings, freeze=True)

        self.gru = nn.GRU(hidden_size, hidden_size, num_layers=1 ,batch_first=True)

        # self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)

        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))

        output, _ = self.gru(embedded)

        # output, _ = self.lstm(output)


        return output

In [162]:
class Classifier(nn.Module):
     def __init__(self,hidden_size,output_units,dropout_p) -> None:
          super(Classifier, self).__init__()
          self.dropout = nn.Dropout(dropout_p)
          self.dense1 = nn.Linear(hidden_size,hidden_size//2)
          self.tanh = nn.Tanh()
          self.dense2 = nn.Linear(hidden_size//2,output_units)
     def forward(self,x):
          output = self.tanh(self.dropout(self.dense1(x)))
          output = self.dense2(output)
          return output

In [163]:
class SentimentAnalyzer(nn.Module):
    def __init__(self, hidden_size,output_units,word_embeddings,dropout_enc,dropout_clsf) -> None:
        super().__init__()
        self.encoder = EncoderRNN(hidden_size,word_embeddings=word_embeddings,dropout_p=dropout_enc)
        self.classifier = Classifier(hidden_size,output_units,dropout_clsf)

    def forward(self,x):
        y = self.encoder(x)
        # obtaining the encoding from last sequence only
        y = y[:,y.shape[1]-1,:]
        y = self.classifier(y)
        return y

# Dataset and Dataloader

In [143]:
class Sentiment_Dataset(Dataset):
    def __init__(self,word2vec_model,max_seq_len=1000) -> None:


        super().__init__()

        self.max_seq_len = max_seq_len


        self.word2vec_model = word2vec_model

        # Create word-to-index mapping dynamically
        self.word2index = {word: idx for idx, word in enumerate(self.word2vec_model.wv.index_to_key)}

        print(f"index created successfully")

        self.df = pd.read_csv(PATH_TO_DATASET)
        print(f"file read successfully")
        self.df.drop_duplicates(inplace=True)
        self.df.drop(columns=["1","2","3"],axis=1,inplace=True)

        assert self.df["Lables"] is not None,"No attribute Labels found"
        labels = self.df["Lables"].unique()

        mappings = {}
        for idx,value in enumerate(labels):
            mappings[value] = idx

        self.df["Labels"] = self.df["Lables"].replace(mappings)

        print("Labels updated successfully")


    def word2index(self,word):
        return self.word2index[word]

    def indexesFromSentence(self,sentence):
        return [self.word2index[word] for word in sentence.split(' ')]


    def normalizeString(self,s):
        re.sub(r'[^\w\s\n]|[\d]', ' ', s)
        return s.strip()


    def filterSentence(self,x):
        return len(x.split(" ")) <= self.max_seq_len


    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, index):
        row  = self.df.iloc[index]
        sentence = row["Urdu Text Data"]

        sentence = self.normalizeString(sentence)

        index_list = self.indexesFromSentence(sentence)

        # padding vector
        input_val = torch.full(size=(self.max_seq_len,1),fill_value=0).view(-1)
        input_val[:len(index_list)] = torch.tensor(index_list)



        # input_val = torch.tensor(index_list,dtype=torch.int32).view(-1)

        target_val  = torch.tensor(row["Labels"],dtype=torch.long)

        return input_val,target_val



In [144]:
dataset = Sentiment_Dataset(word2vec_model)

index created successfully
file read successfully
Labels updated successfully


In [145]:
for i in dataset:
    print(i[1])
    break

tensor(0)


In [146]:
total_samples = len(dataset)

test_samples = int(total_samples * 0.15 )
val_samples = test_samples

train_samples = total_samples - test_samples - val_samples

print(f"Total Samples: {total_samples} Train Samples: {train_samples} Validation Samples: {val_samples} Test Samples: {test_samples}")


Total Samples: 9764 Train Samples: 6836 Validation Samples: 1464 Test Samples: 1464


In [147]:
train_data, val_data , test_data = random_split(dataset=dataset,lengths=[train_samples,val_samples,test_samples])
train_loader = DataLoader(train_data,batch_size=32,shuffle=True)
val_loader = DataLoader(val_data,batch_size=32,shuffle=False)
test_loader = DataLoader(test_data,batch_size=32,shuffle=False)

# Criterions and Metrics

In [148]:
def acc_metrics(prediction_tensor,target_tensor):

    _, predictions = torch.max(prediction_tensor, dim=1)


    # Check if the predictions are correct by comparing with target labels
    correct_predictions = (predictions == target_tensor).float()  # Convert boolean to float

    # Calculate accuracy
    accuracy = correct_predictions.sum() / len(target_tensor)

    return accuracy

In [149]:
criterion = nn.CrossEntropyLoss()

In [59]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

# Training

In [150]:
def train_epoch(dataloader, model, optimizer,criterion,device):


    total_loss = 0
    total_acc = 0

    for data in dataloader:
        input_tensor, target_tensor = data

        input_tensor = input_tensor.to(device)
        target_tensor= target_tensor.to(device)


        optimizer.zero_grad()

        output_tensor = model(input_tensor)



        loss = criterion(output_tensor,target_tensor)
        acc  = acc_metrics(output_tensor,target_tensor)

        # print(f"loss {loss} acc {acc}" )

        loss.backward()

        optimizer.step()

        total_loss += loss.item()
        total_acc += acc.item()

    num_batches = len(dataloader)

    return total_loss / num_batches , total_acc/ num_batches

In [151]:
def evaluate_epoch(dataloader, model,criterion,device):

    total_loss = 0
    total_acc = 0

    for data in dataloader:
        input_tensor, target_tensor = data

        input_tensor = input_tensor.to(device)
        target_tensor= target_tensor.to(device)


        optimizer.zero_grad()

        output_tensor = model(input_tensor)

        loss = criterion(output_tensor,target_tensor)
        acc  = acc_metrics(output_tensor,target_tensor)

        total_loss += loss.item()
        total_acc += acc.item()

    num_batches = len(dataloader)

    return total_loss / num_batches , total_acc/ num_batches


In [152]:
EMBEDDING_DIMS = 128;
DROPOUT = 0.5;
OUT_UNITS=3;
DEVICE="cuda" if torch.cuda.is_available() else "cpu"
print(f"using {DEVICE} for computation")

using cuda for computation


### Optimizer

In [171]:
model = SentimentAnalyzer(hidden_size=EMBEDDING_DIMS,word_embeddings=torch.tensor(embedding_matrix),output_units=OUT_UNITS,dropout_enc=0.5,dropout_clsf=0.7)
model = model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [172]:
# loading pretrained model
chkpoint = torch.load("/content/drive/MyDrive/assignment_3_checkpoint.pth")
model.load_state_dict(chkpoint["model_state_dict"])

<All keys matched successfully>

In [173]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [154]:
def train(train_dataloader, model, n_epochs, optimizer,criterion,
               print_every=5, plot_every=5):

    plot_losses = []
    plot_accs = []

    eval_losses = []
    eval_accs = []


    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    print_acc_total = 0  # Reset every print_every
    plot_acc_total = 0  # Reset every plot_every

    for epoch in range(1, n_epochs + 1):
        loss,acc = train_epoch(train_dataloader,model, optimizer ,criterion,device=DEVICE)

        print_acc_total += acc
        plot_acc_total += acc

        print_loss_total += loss
        plot_loss_total += loss


        if epoch % print_every == 0:
          print_loss_avg = print_loss_total / print_every
          print_acc_avg = print_acc_total/ print_every

          print_loss_total = 0
          print_acc_total = 0

          val_loss, val_acc = evaluate_epoch(val_loader,model,criterion,device=DEVICE)

          eval_losses.append(val_loss)
          eval_accs.append(val_acc)

          print("train " +'%s (%d %d%%) %.4f' % (epoch, epoch / n_epochs * 100, print_loss_avg , print_acc_avg))

          print(f"evaluation, loss: {val_loss} acc: {val_acc}")

        if epoch % plot_every == 0:
          plot_loss_avg = plot_loss_total / plot_every
          plot_acc_avg = plot_acc_total / plot_every

          plot_losses.append(plot_loss_avg)
          plot_accs.append(plot_acc_avg)

          plot_loss_total = 0
          plot_acc_total  = 0

    showPlot(plot_losses)
    return plot_losses, plot_accs , eval_losses, eval_accs


In [155]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [157]:
train(train_loader,model,100,optimizer,criterion)

train 5 (5 1%) 0.3716
evaluation, loss: 1.0795911109965781 acc: 0.3768115944188574


KeyboardInterrupt: 

In [177]:
evaluate_epoch(train_loader,model,criterion,"cuda")

(0.7190847125287368, 0.7205607476078462)

In [35]:
chkpoint = {
    "model_state_dict": model.state_dict()
}


torch.save(chkpoint,"/content/drive/MyDrive/assignment_3_lstm_checkpoint.pth")