<a href="https://colab.research.google.com/github/iamRahulB/Pytorch-practice/blob/main/TEXT/N_Grams_using_Lstm_on_custom_data_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip3 install -q torch==2.2.0 torchtext==0.17.0 --index-url https://download.pytorch.org/whl/cu118

In [3]:
!pip install -q -U portalocker==2.8.2

In [1]:
import torch
from torch import nn
import torch.optim as optim
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data import get_tokenizer
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import requests
import zipfile
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
import numpy as np

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
lemma=WordNetLemmatizer()
stop_words=stopwords.words("english")

In [4]:
df=pd.read_csv('https://github.com/LawrenceDuan/IMDb-Review-Analysis/raw/refs/heads/master/IMDb_Reviews.csv')

In [5]:
# response =requests.get('https://www.kaggle.com/api/v1/datasets/download/moazeldsokyx/bbc-news')

# with open("data.zip",'wb') as f :
#     f.write(response.content)
# with zipfile.ZipFile('data.zip' ,'r') as f:
#     f.extractall("data")

In [9]:
# df=pd.read_csv('/content/data/bbc-text.csv')

In [6]:
df=df.drop_duplicates()

In [7]:
df.head()

Unnamed: 0,review,sentiment
0,My family and I normally do not watch local mo...,1
1,"Believe it or not, this was at one time the wo...",0
2,"After some internet surfing, I found the ""Home...",0
3,One of the most unheralded great works of anim...,1
4,"It was the Sixties, and anyone with long hair ...",0


In [14]:
# sns.countplot(df['sentiment'])
# plt.show()

In [8]:
df[df.duplicated()]

Unnamed: 0,review,sentiment


In [9]:
# so we can see classes are littlebit imbalanced
df['sentiment'].value_counts()


Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
1,24884
0,24698


In [231]:
# df[df['category'] !='entertainment']    # we can drop entertainment class but for now our goal is not to improve accuracy but to practice the pytorch text so will keep it

In [10]:
df=df.sample(12000)
len(df)

12000

In [11]:
def preprocess(text):
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'[^a-z\s]', '', text, flags=re.IGNORECASE)
    text=[ lemma.lemmatize(t) for t in word_tokenize(text) if t not in stop_words]

    return " ".join(text)

In [12]:
df['cleaned']=df['review'].apply(preprocess)

In [13]:
df.drop('review',inplace=True,axis=1)

In [14]:
df.head(5)

Unnamed: 0,sentiment,cleaned
48730,1,WONDERBIRD certainly unbelievably refined cart...
9360,0,This possibly one worst movie I displeasure wa...
23237,0,Jean Luc Godards Marxist polemic close unwatch...
42403,1,Lupin set Morocco looking legendary treasure H...
6283,1,As kid I thought movie great It animal beautif...


In [15]:
# class_map={text:ind for ind,text in enumerate(df['sentiment'].unique().tolist())}
# class_map

In [16]:
# df['category']=df['category'].map(class_map)

In [17]:
train_data,test_data=train_test_split(df,test_size=0.3,random_state=42,)

In [18]:
train_data.head(4)

Unnamed: 0,sentiment,cleaned
44851,1,It first foremost chick flick romantic comedy ...
11619,1,Old People Show Im watching show since I recod...
18568,1,This one Stan Laurels best solo comedy teaming...
21845,1,I got laugh one lot called comedy The big ship...


In [19]:
test_data.head(4)

Unnamed: 0,sentiment,cleaned
29273,1,What Prospero twentieth century life Why man w...
29063,0,Seriously I mean seriously I first started wat...
23705,1,Skip McCoy Richard Widmark pickpocket Candys J...
24159,0,History teacher Mrs Tingle seems student Leigh...


In [20]:
# we have now train data and test data now we can go ahead and do vectorization here.
count=CountVectorizer(ngram_range=(1,2))         #ngram_range=(1,2)
train_vector=count.fit_transform(train_data['cleaned']).toarray()
test_vector=count.transform(test_data['cleaned']).toarray()

In [21]:
count.vocabulary_['cut'], len(count.vocabulary_)

(152121, 782487)

In [22]:
train_vector.shape,test_vector.shape

((8400, 782487), (3600, 782487))

In [23]:
#  now we need to make iterable dataset so we can iterate it and pass it to dataloader

class CustomDataset(Dataset):
    def __init__(self,texts,labels):
        super().__init__()
        self.texts=texts
        self.labels=labels

    def __len__(self):
        return len(self.texts)
    def __getitem__(self,idx):
        text=self.texts[idx]
        label=self.labels[idx]
        return text,label

train_iter=CustomDataset(train_vector,train_data['sentiment'].tolist())
test_iter=CustomDataset(test_vector,test_data['sentiment'].tolist())

In [24]:
train_iter[0], test_iter[0]      # so all good now we are good to go for dataloader, also note we need to make collate function also

((array([0, 0, 0, ..., 0, 0, 0]), 1), (array([0, 0, 0, ..., 0, 0, 0]), 1))

In [36]:
label_pipeline=lambda x: int(x)

In [37]:
# note here we not applied troch.tensor to each text vector in list as they all are in same values so directly need to do it on whole nested thing
def collate_batch(batch):
    all_text = []
    all_label = []
    for text, label in batch:
        all_text.append(text)
        all_label.append(label_pipeline(label))

    all_text_np = np.array(all_text, dtype=np.float32)
    all_label_np = np.array(all_label, dtype=np.int64)

    return torch.tensor(all_text_np), torch.tensor(all_label_np)



In [38]:
train_loader=DataLoader(train_iter,shuffle=True,collate_fn=collate_batch,batch_size=32)
test_loader=DataLoader(test_iter,shuffle=False,collate_fn=collate_batch,batch_size=32)

In [39]:
class MyModel(nn.Module):
    def __init__(self,input_size,num_classes):
        super().__init__()

        self.lstm=nn.LSTM(input_size,64,num_layers=1,batch_first=True)
        # self.relu=nn.ReLU()
        self.drop=nn.Dropout(0.7)
        self.linear=nn.Linear(in_features=64,out_features=num_classes)

    def forward(self,x):
        lstm,(hn,cn)=self.lstm(x)
        # relu=self.relu(lstm[:,-1,:])
        drop=self.drop(lstm[:,-1,:])
        out=self.linear(drop)

        return out

In [40]:
input_size=len(count.vocabulary_)
num_classes=2
device=("cuda" if torch.cuda.is_available() else 'cpu')

model=MyModel(input_size,num_classes).to(device)

In [41]:
# class_map

In [42]:
with torch.inference_mode():
    model_out=model(next(iter(train_loader))[0].unsqueeze(1).to(device))

In [43]:
model_out.softmax(dim=1).argmax(dim=1)     # so our model is working really good now we need to train our model

tensor([1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
        1, 0, 1, 1, 0, 0, 1, 0], device='cuda:0')

In [44]:

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.01)

epochs = 5

for epoch in tqdm(range(epochs), desc="Epochs"):

    train_loss = 0.0
    train_acc = 0.0
    model.train()

    for batch_idx, (X_train, y_train) in tqdm(enumerate(train_loader, 1), desc='Training Mode', leave=False):
        X_train, y_train = X_train.to(device), y_train.to(device)

        train_batch_logits = model(X_train.unsqueeze(1))

        train_batch_preds = torch.softmax(train_batch_logits, dim=1).argmax(dim=1)
        train_batch_loss = loss_fn(train_batch_logits, y_train)
        train_batch_acc = accuracy_score(y_train.cpu(), train_batch_preds.cpu())

        optimizer.zero_grad()
        train_batch_loss.backward()
        optimizer.step()
        train_loss += train_batch_loss.item()
        train_acc += train_batch_acc

    train_loss /= len(train_loader)
    train_acc /= len(train_loader)

    model.eval()
    test_loss = 0.0
    test_acc = 0.0
    with torch.inference_mode():
        for batch_idx, (X_test, y_test) in enumerate(test_loader, 1):
            X_test, y_test = X_test.to(device), y_test.to(device)

            test_batch_logits = model(X_test.unsqueeze(1))

            test_batch_preds = torch.softmax(test_batch_logits, dim=1).argmax(dim=1)
            test_batch_loss = loss_fn(test_batch_logits, y_test)
            test_batch_acc = accuracy_score(y_test.cpu(), test_batch_preds.cpu())

            test_loss += test_batch_loss.item()
            test_acc += test_batch_acc

    test_loss /= len(test_loader)
    test_acc /= len(test_loader)

    print(f"Epoch: {epoch+1} | Train Loss: {train_loss:.4f} | Train ACC: {train_acc*100:.2f} | Test Loss: {test_loss:.4f} | Test ACC: {test_acc*100:.2f}")

print("Training complete.")


Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Training Mode: 0it [00:00, ?it/s]

Epoch: 1 | Train Loss: 0.4078 | Train ACC: 81.29 | Test Loss: 0.2924 | Test ACC: 88.30


Training Mode: 0it [00:00, ?it/s]

Epoch: 2 | Train Loss: 0.1621 | Train ACC: 93.48 | Test Loss: 0.3077 | Test ACC: 87.78


Training Mode: 0it [00:00, ?it/s]

Epoch: 3 | Train Loss: 0.0541 | Train ACC: 98.21 | Test Loss: 0.3053 | Test ACC: 88.38


Training Mode: 0it [00:00, ?it/s]

Epoch: 4 | Train Loss: 0.0203 | Train ACC: 99.33 | Test Loss: 0.3258 | Test ACC: 87.31


Training Mode: 0it [00:00, ?it/s]

Epoch: 5 | Train Loss: 0.0087 | Train ACC: 99.76 | Test Loss: 0.3299 | Test ACC: 87.72
Training complete.


**Always remember to make labels into integer or u will get huge accuracy i.e 100 percent accuracy**

In [46]:
test_texts = {
    "Positive": [
        "This product exceeded my expectations in every way. The quality is top-notch and the customer service was outstanding.",
        "I had an amazing experience at this restaurant. The food was delicious, and the staff were incredibly friendly and attentive."
    ],
    "Negative": [
        "I was very disappointed with this product. It broke after just one use and the customer service was unhelpful.",
        "The service at this restaurant was terrible. The food took forever to arrive and when it did, it was cold and tasteless."
    ]
}

In [50]:
class_map= {
    "Positive": 1,
    "Negative":0
}

In [51]:
for key, val in test_texts.items():
    expected = class_map[key]
    for text in val:
        text_array = count.transform([text]).toarray()
        text_tensor = torch.tensor(text_array, dtype=torch.float32)

        with torch.inference_mode():
            logits = model(text_tensor.unsqueeze(1).to(device))
            prediction = torch.argmax(logits, dim=1).item()

        print(f"Text: {text}")
        print(f"Expected: {expected} ")
        print(f"Predicted: {prediction} \n")

Text: This product exceeded my expectations in every way. The quality is top-notch and the customer service was outstanding.
Expected: 1 
Predicted: 1 

Text: I had an amazing experience at this restaurant. The food was delicious, and the staff were incredibly friendly and attentive.
Expected: 1 
Predicted: 1 

Text: I was very disappointed with this product. It broke after just one use and the customer service was unhelpful.
Expected: 0 
Predicted: 0 

Text: The service at this restaurant was terrible. The food took forever to arrive and when it did, it was cold and tasteless.
Expected: 0 
Predicted: 0 



In [None]:
logits