In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
!pip install langdetect
from langdetect import detect

nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993227 sha256=24def42c2d0dc8ca9bf6df36ced570b01ed3ee1cd0d4b8876f6dfcdc23ac668c
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [2]:
# Colab-only step: mount Google Drive at /content/drive so that files stored
# under that path can be read by the following cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the raw hotel-review CSV from the mounted Drive folder. This frame is
# the untouched input from which the cleaned working frame is derived.
raw_hotel_reviews = pd.read_csv('/content/drive/MyDrive/NLP_assignment/hotel-reviews 1.csv')

In [8]:
def dataset_prepare(dataset):
  """Return a cleaned copy of the raw hotel-review frame.

  Steps:
    * drop the 'User_ID', 'Browser_Used' and 'Device_Used' columns,
    * encode 'Is_Response' as 1 for 'happy' and 0 for 'not happy'
      (any other value is left unchanged),
    * rename 'Is_Response' -> 'label' and 'Description' -> 'review'.

  Args:
    dataset: DataFrame containing the columns named above.

  Returns:
    A new DataFrame; the frame passed in is not modified.

  Raises:
    KeyError: if one of the expected columns is missing.
  """
  label_map = {'happy': 1, 'not happy': 0}

  # drop() returns a new frame, so the caller's data is never touched.
  prepared = dataset.drop(columns=['User_ID', 'Browser_Used', 'Device_Used'])

  # Assign the result back instead of calling replace(..., inplace=True) on the
  # column: the chained in-place form acts on a temporary Series and does not
  # update the frame when pandas Copy-on-Write is active.
  prepared['Is_Response'] = prepared['Is_Response'].replace(label_map)

  return prepared.rename(columns={'Is_Response': 'label', 'Description': 'review'})

# Build the cleaned working frame (columns 'review' and 'label') from the raw
# reviews; dataset_prepare returns a new frame rather than editing its input.
df = dataset_prepare(raw_hotel_reviews)

# Preview the first rows to check the renamed columns and the 0/1 labels.
df.head()

Unnamed: 0,review,label
0,The room was kind of clean but had a VERY stro...,0
1,I stayed at the Crown Plaza April -- - April -...,0
2,I booked this hotel through Hotwire at the low...,0
3,Stayed here with husband and sons on the way t...,1
4,My girlfriends and I stayed here to celebrate ...,0


In [10]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from collections import Counter

class TextDataset(Dataset):
    """Map-style dataset that turns review text into fixed-length token-id tensors.

    Reviews are tokenised by plain whitespace splitting, mapped to integer ids
    through ``vocab`` and truncated / right-padded to ``max_length`` ids.

    Args:
        dataframe: Frame with a 'review' (text) column and a 'label' column.
        max_length: Number of token ids every encoded review is cut or padded to.
        vocab: Optional pre-built ``token -> id`` mapping containing at least
            '<PAD>' and '<UNK>'. Pass the training dataset's vocabulary when
            building validation/test datasets so every split uses the same
            ids. When omitted, the vocabulary is built from ``dataframe``.
        label_encoder: Optional already-fitted encoder exposing ``transform``.
            When omitted, a ``LabelEncoder`` is fitted on ``dataframe['label']``.
    """

    def __init__(self, dataframe, max_length, vocab=None, label_encoder=None):
        self.dataframe = dataframe
        self.max_length = max_length
        self.vocab = self.build_vocab() if vocab is None else vocab
        if label_encoder is None:
            label_encoder = LabelEncoder()
            label_encoder.fit(self.dataframe['label'])
        self.label_encoder = label_encoder

    def __len__(self):
        """Number of rows (reviews) in the underlying frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for the row at position ``idx``.

        ``token_ids`` is a LongTensor with ``max_length`` entries; ``label`` is
        the first element of ``label_encoder.transform`` applied to the row's
        label.
        """
        row = self.dataframe.iloc[idx]
        label = self.label_encoder.transform([row['label']])[0]
        return torch.LongTensor(self.encode_text(row['review'])), label

    def build_vocab(self):
        """Build a ``token -> id`` mapping from the 'review' column.

        Ids 0 and 1 are reserved for '<PAD>' and '<UNK>'. Every token seen at
        least 5 times gets the next free id, in order of first appearance.
        """
        # Count review by review instead of joining the whole corpus into one
        # giant string first; the resulting counts and their order are the same.
        token_counts = Counter()
        for review in self.dataframe['review']:
            token_counts.update(review.split())

        vocab = {'<PAD>': 0, '<UNK>': 1}
        for token, count in token_counts.items():
            # Filter out rare words. Tokens that literally spell a reserved
            # symbol are skipped so the reserved ids can never be overwritten.
            if count >= 5 and token not in vocab:
                vocab[token] = len(vocab)
        return vocab

    def encode_text(self, text):
        """Encode ``text`` as a list of exactly ``max_length`` token ids.

        Unknown tokens map to the '<UNK>' id; text longer than ``max_length``
        tokens is truncated and shorter text is right-padded with '<PAD>'.
        """
        unk_id = self.vocab['<UNK>']
        token_ids = [self.vocab.get(token, unk_id) for token in text.split()[:self.max_length]]
        padding = [self.vocab['<PAD>']] * (self.max_length - len(token_ids))
        return token_ids + padding

In [6]:
# LSTM-based sequence classifier
class LSTMClassifier(nn.Module):
    """Embedding -> single LSTM -> linear head over token-id sequences.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding (LSTM input size).
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of values produced by the final linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Score a batch of token-id sequences.

        ``x`` is an integer tensor of token ids; because the LSTM is created
        with ``batch_first=True`` it is expected as (batch, sequence). The
        final hidden state of the last LSTM layer is fed to the linear head,
        and dimension 1 of the result is squeezed (a no-op unless
        ``output_dim`` is 1).
        """
        token_vectors = self.embedding(x)
        _step_outputs, (final_hidden, _final_cell) = self.lstm(token_vectors)
        scores = self.fc(final_hidden[-1])
        return scores.squeeze(1)

In [None]:
# Prepare data
max_length = 100  # Maximum sequence length
embedding_dim = 100  # Dimension of word embeddings
hidden_dim = 128  # Dimension of hidden layer in LSTM
output_dim = len(df['label'].unique())  # Number of output classes

# Seed PyTorch so weight initialisation and batch shuffling repeat across runs
torch.manual_seed(42)

# Use the GPU when the runtime provides one, otherwise stay on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Split data into train and validation sets
train_data, val_data = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = TextDataset(train_data, max_length)
val_dataset = TextDataset(val_data, max_length)

# The token ids the model learns are defined by the training vocabulary, so the
# validation split must be encoded with that same mapping (and the same label
# encoding). Left alone, the validation dataset would build its own vocabulary
# and feed the model ids that mean different words.
val_dataset.vocab = train_dataset.vocab
val_dataset.label_encoder = train_dataset.label_encoder

# Size the embedding table from the training vocabulary only, so no statistics
# from the validation text leak into the model definition.
vocab_size = len(train_dataset.vocab)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)

# Initialize the model, loss function, and optimizer
model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}', leave=False):
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean loss per batch for this epoch
    print(f'Training Loss: {total_loss / len(train_loader)}')

# Evaluate the model
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in tqdm(val_loader, desc='Validation', leave=False):
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        predicted = torch.argmax(outputs, dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Validation Accuracy: {correct / total}')



Training Loss: 0.5923094532205828




Training Loss: 0.4650576812285907




Training Loss: 0.3635785241574967




Training Loss: 0.30211385319609907


Epoch 5/10:  57%|█████▋    | 279/487 [01:12<00:49,  4.19it/s]