In [6]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np


In [2]:
# Install required libraries
# !pip install transformers torch pandas scikit-learn

# Load the dataset
df = pd.read_csv('updated_final.csv')
df



In [226]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')


def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove special characters, keeping apostrophes for contractions
    text = re.sub(r'[^a-zA-Z\'\s]', ' ', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Custom list of stopwords, excluding negations and other sentiment-bearing words
    custom_stop_words = set([
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'
    ])

    # List of negations and sentiment-bearing words to keep
    important_words = set([
        'not', 'no', 'never', 'none', 'nobody', 'nowhere', 'neither', 'nor',
        'doesn\'t', 'isn\'t', 'wasn\'t', 'shouldn\'t', 'wouldn\'t', 'couldn\'t', 'won\'t',
        'can\'t', 'don\'t'
    ])
    stop_words = set(stopwords.words('english'))

    # Remove custom stopwords but keep important words
    tokens = [token for token in tokens if token not in stop_words or token in important_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Join tokens back into a single string
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\heyfa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\heyfa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\heyfa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Apply preprocessing to the 'text' column
df['text']=df['text'].apply(preprocess_text)
df

In [7]:
# Encode labels and aspects
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])
aspect_encoder = LabelEncoder()
df['span_encoded'] = aspect_encoder.fit_transform(df['span'])

# Split the data
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)


In [6]:
# Define the dataset class
class AspectSentimentDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.iloc[index]['text']
        aspect = self.data.iloc[index]['span']
        inputs = self.tokenizer.encode_plus(
            aspect,
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs['token_type_ids'].flatten(),
            'labels': torch.tensor(self.data.iloc[index]['label_encoded'], dtype=torch.long)
        }


In [8]:

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)



In [8]:
# Prepare datasets and dataloaders
train_dataset = AspectSentimentDataset(train_df, tokenizer, max_len=128)
val_dataset = AspectSentimentDataset(val_df, tokenizer, max_len=128)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)



In [9]:
# Training settings
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 6




In [10]:
from tqdm import tqdm

In [11]:

# # Training loop
# for epoch in range(num_epochs):
#     model.train()
#     train_loss = 0
#     train_pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs} [Train]')
#     for batch in train_loader:
#         optimizer.zero_grad()
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         token_type_ids = batch['token_type_ids'].to(device)
#         labels = batch['labels'].to(device)
#         outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels)
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()
#         train_pbar.set_postfix({'loss': train_loss / (train_pbar.n + 1)})

#     # Validation
#     model.eval()
#     val_loss = 0
#     correct = 0
#     total = 0
#     val_pbar = tqdm(val_loader, desc=f'Epoch {epoch+1}/{num_epochs} [Validation]')
#     with torch.no_grad():
#         for batch in val_loader:
#             input_ids = batch['input_ids'].to(device)
#             attention_mask = batch['attention_mask'].to(device)
#             token_type_ids = batch['token_type_ids'].to(device)
#             labels = batch['labels'].to(device)
#             outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels)
#             val_loss += outputs.loss.item()
#             _, predicted = torch.max(outputs.logits, 1)
#             total += labels.size(0)
#             correct += (predicted == labels).sum().item()
#             val_pbar.set_postfix({'loss': val_loss / (val_pbar.n + 1), 'accuracy': 100 * correct / total})

#     print(f'Epoch {epoch+1}/{num_epochs}, Validation Accuracy: {100 * correct / total:.2f}%')



Epoch 1/6 [Train]:   0%|          | 0/40 [00:03<?, ?it/s, loss=0]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 1/6 [Train]:   0%|          | 0/40 [00:04<?, ?it/s, loss=0]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 1/6 [Train]:   0%|          | 0/40 [00:04<?, ?it/s, loss=0]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' tru

Epoch 1/6, Validation Accuracy: 86.62%




Epoch 1/6 [Train]:   0%|          | 0/40 [00:17<?, ?it/s, loss=0]


Epoch 2/6 [Train]:   0%|          | 0/40 [00:00<?, ?it/s, loss=0][A[A

Epoch 2/6 [Train]:   0%|          | 0/40 [00:00<?, ?it/s, loss=0][A[ABe aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Epoch 2/6 [Train]:   0%|          | 0/40 [00:00<?, ?it/s, loss=0][A[A

Epoch 2/6 [Train]:   0%|          | 0/40 [00:01<?, ?it/s, loss=0][A[A

Epoch 2/6 [Train]:   0%|          | 0/40 [00:01<?, ?it/s, loss=0][A[A

Epoch 2/6 [Train]:   0%|          | 0/40 [00:01<?, ?it/s, loss=0][A[A

Epoch 2/6 [Train]:   0%|          | 0/40 [00:02<?, ?it/s, loss=0][A[ABe aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if 

Epoch 2/6, Validation Accuracy: 96.82%



Epoch 3/6 [Train]:   0%|          | 0/40 [00:00<?, ?it/s][ABe aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 2/6 [Train]:   0%|          | 0/40 [00:15<?, ?it/s, loss=0]

Epoch 3/6 [Train]:   0%|          | 0/40 [00:00<?, ?it/s, loss=0][ABe aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.

Epoch 3/6 [Train]:   0%|          | 0/40 [00:00<?, ?it/s, loss=0][ABe aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.

Epoch 3/6 [Train]:   0%|          | 0/40 [00:01<?, ?it/s

Epoch 3/6, Validation Accuracy: 96.82%


Epoch 3/6 [Train]:   0%|          | 0/40 [00:14<?, ?it/s, loss=0]
Epoch 4/6 [Train]:   0%|          | 0/40 [00:00<?, ?it/s, loss=0]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 4/6 [Train]:   0%|          | 0/40 [00:00<?, ?it/s, loss=0]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 4/6 [Train]:   0%|          | 0/40 [00:04<?, ?it/s, loss=0]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting

Epoch 4/6, Validation Accuracy: 98.09%




Epoch 5/6 [Train]:   0%|          | 0/40 [00:00<?, ?it/s][A[ABe aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 4/6 [Train]:   0%|          | 0/40 [00:14<?, ?it/s, loss=0]


Epoch 5/6 [Train]:   0%|          | 0/40 [00:00<?, ?it/s, loss=0][A[A

Epoch 5/6 [Train]:   0%|          | 0/40 [00:00<?, ?it/s, loss=0][A[A

Epoch 5/6 [Train]:   0%|          | 0/40 [00:00<?, ?it/s, loss=0][A[ABe aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Epoch 5/6 [Train]:   0%|          | 0/40 [00:01<?, ?it/s, loss=0][A[A

Epoch 5/6 [Train]:   0%|          | 0/40 [00:01<?, ?it/s, loss=0][A[A

Epoch 5/6 [Train]:   0%|          | 0/40 [00:02<?,

Epoch 5/6, Validation Accuracy: 95.54%



Epoch 6/6 [Train]:   0%|          | 0/40 [00:00<?, ?it/s][ABe aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 5/6 [Train]:   0%|          | 0/40 [00:14<?, ?it/s, loss=0]

Epoch 6/6 [Train]:   0%|          | 0/40 [00:00<?, ?it/s, loss=0][A
Epoch 6/6 [Train]:   0%|          | 0/40 [00:00<?, ?it/s, loss=0][A
Epoch 6/6 [Train]:   0%|          | 0/40 [00:01<?, ?it/s, loss=0][A
Epoch 6/6 [Train]:   0%|          | 0/40 [00:01<?, ?it/s, loss=0][A
Epoch 6/6 [Train]:   0%|          | 0/40 [00:01<?, ?it/s, loss=0][A
Epoch 6/6 [Train]:   0%|          | 0/40 [00:02<?, ?it/s, loss=0][A
Epoch 6/6 [Train]:   0%|          | 0/40 [00:02<?, ?it/s, loss=0][A
Epoch 6/6 [Train]:   0%|          | 0/40 [00:02<?, ?it/s, loss=0][A
Epoch 6/6 [Train]:   0%|          | 0/40 [00:03<?, ?it/s, loss=0][A
Epoch 6/6 [Train]:   0%| 

Epoch 6/6, Validation Accuracy: 97.45%


In [None]:
# Save the model
# torch.save(model.state_dict(), 'aspect_sentiment_model.pth')

In [243]:
def load_saved_model(model_path):
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
    model.load_state_dict(torch.load(model_path,map_location=torch.device('cpu')))
    model.to('cpu')
    model.eval()
    return model

In [244]:
loaded_model = load_saved_model('./10/aspect_sentiment_model.pth')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(model_path,map_location=torch.device('cpu')))


In [247]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import re

def predict_sentiments(text, model, tokenizer, label_encoder, device):
    aspects = ['battery', 'display', 'design', 'performance', 'camera']
    results = {}

    model.eval()

    for aspect in aspects:
        # Check if the aspect is mentioned in the text
        aspect_pattern = r'\b' + re.escape(aspect) + r'\b'
        if re.search(aspect_pattern, text, re.IGNORECASE):
            # Find the sentence containing the aspect
            sentences = re.split(r'(?<=[.!?])\s+', text)
            relevant_sentence = next((s for s in sentences if re.search(aspect_pattern, s, re.IGNORECASE)), text)

            # Prepare input for the model
            inputs = tokenizer.encode_plus(
                aspect,
                relevant_sentence,
                add_special_tokens=True,
                max_length=128,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_token_type_ids=True,
                return_tensors='pt'
            )

            # Make prediction
            with torch.no_grad():
                outputs = model(
                    inputs['input_ids'].to(device),
                    attention_mask=inputs['attention_mask'].to(device),
                    token_type_ids=inputs['token_type_ids'].to(device)
                )

            _, predicted = torch.max(outputs.logits, 1)
            sentiment = label_encoder.inverse_transform([predicted.item()])[0]

            results[aspect] = sentiment

    return results

# Assuming you have already initialized these:
# model = BertForSequenceClassification.from_pretrained('path_to_your_saved_model')
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# label_encoder = LabelEncoder()  # This should be the same encoder used during training
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model.to(device)

# predictions = predict_sentiments(text, model, tokenizer, label_encoder, device)

# for aspect, sentiment in predictions.items():
#     print(f"Aspect: {aspect}, Sentiment: {sentiment}")

In [255]:
text = "battery doesn't last long"

predictions = predict_sentiments(text, loaded_model, tokenizer, label_encoder, device='cpu')

for aspect, sentiment in predictions.items():
    print(f"Aspect: {aspect}, Sentiment: {sentiment}")

Aspect: battery, Sentiment: negative


In [256]:
text = "battery short life"

predictions = predict_sentiments(text, loaded_model, tokenizer, label_encoder, device='cpu')

for aspect, sentiment in predictions.items():
    print(f"Aspect: {aspect}, Sentiment: {sentiment}")

Aspect: battery, Sentiment: negative


In [257]:
text = "design is very old"

predictions = predict_sentiments(text, loaded_model, tokenizer, label_encoder, device='cpu')

for aspect, sentiment in predictions.items():
    print(f"Aspect: {aspect}, Sentiment: {sentiment}")

Aspect: design, Sentiment: negative


In [258]:
text = "design is latest technology"

predictions = predict_sentiments(text, loaded_model, tokenizer, label_encoder, device='cpu')

for aspect, sentiment in predictions.items():
    print(f"Aspect: {aspect}, Sentiment: {sentiment}")

Aspect: design, Sentiment: positive


In [259]:
text = "battery is drains very fast with short battery life"

predictions = predict_sentiments(text, loaded_model, tokenizer, label_encoder, device='cpu')

for aspect, sentiment in predictions.items():
    print(f"Aspect: {aspect}, Sentiment: {sentiment}")

Aspect: battery, Sentiment: negative


In [260]:
text = "picture from rear camera is crystal clear"

predictions = predict_sentiments(text, loaded_model, tokenizer, label_encoder, device='cpu')

for aspect, sentiment in predictions.items():
    print(f"Aspect: {aspect}, Sentiment: {sentiment}")

Aspect: camera, Sentiment: positive


In [261]:
text = "picture from rear camera is blur"

predictions = predict_sentiments(text, loaded_model, tokenizer, label_encoder, device='cpu')

for aspect, sentiment in predictions.items():
    print(f"Aspect: {aspect}, Sentiment: {sentiment}")

Aspect: camera, Sentiment: negative


In [262]:
text = "it becomes too too too hot ,iam so disappointed with this brand . there is no difference between 4g and 5g . battery was weak iam charging twice a day,for basic level of usage itself."

predictions = predict_sentiments(text, loaded_model, tokenizer, label_encoder, device='cpu')

for aspect, sentiment in predictions.items():
    print(f"Aspect: {aspect}, Sentiment: {sentiment}")

Aspect: battery, Sentiment: negative


In [263]:
text = "having big one major problem it's very fast battery draining"

predictions = predict_sentiments(text, loaded_model, tokenizer, label_encoder, device='cpu')

for aspect, sentiment in predictions.items():
    print(f"Aspect: {aspect}, Sentiment: {sentiment}")

Aspect: battery, Sentiment: negative


In [264]:
text = '''Realme p1 5g is the one of the best smart phone under 15k.. camera it's ok
Battery 🔋 maximum 1days
Display is amoled so nice 👍
Design very nice
Performance.. mediatek 7050 is very powerful.. free fire 60fps....and...Bgmi.. smooth extreme..💪💪.. I Love 💗 this phone..'''

predictions = predict_sentiments(text, loaded_model, tokenizer, label_encoder, device='cpu')

for aspect, sentiment in predictions.items():
    print(f"Aspect: {aspect}, Sentiment: {sentiment}")

Aspect: battery, Sentiment: positive
Aspect: display, Sentiment: positive
Aspect: design, Sentiment: positive
Aspect: performance, Sentiment: positive
Aspect: camera, Sentiment: positive


In [265]:
text = ""

predictions = predict_sentiments(text, loaded_model, tokenizer, label_encoder, device='cpu')

for aspect, sentiment in predictions.items():
    print(f"Aspect: {aspect}, Sentiment: {sentiment}")

In [266]:
text = '''
But battery backup very poor
Atleast one day not stand by battery backup
Very fast drain the battery
Worst battery
Not go to buy this phone'''

predictions = predict_sentiments(text, loaded_model, tokenizer, label_encoder, device='cpu')

for aspect, sentiment in predictions.items():
    print(f"Aspect: {aspect}, Sentiment: {sentiment}")

Aspect: battery, Sentiment: negative


In [267]:
import spacy

# Load the spaCy English model
nlp = spacy.load("en_core_web_sm")

def split_text(text):
    # First, split by newline characters
    split_by_newlines = text.split('\n')
    all_sentences = []
    
    for part in split_by_newlines:
        if part.strip():  # Ignore empty parts
            all_sentences.extend(split_on_conjunctions(part))
    
    return all_sentences

def split_on_conjunctions(text):
    doc = nlp(text)
    sentences = []
    current_sentence = []
    
    for token in doc:
        current_sentence.append(token.text)
        
        # If we encounter a coordinating conjunction or new sentence, split
        if token.dep_ == 'cc' or token.text == '.':
            sentences.append(' '.join(current_sentence).strip())
            current_sentence = []
    
    # Add the final sentence if any
    if current_sentence:
        sentences.append(' '.join(current_sentence).strip())
    
    return sentences

# Test case
text = '''Design - Awesome Looking and Dimensity 7050 5G Processor - Good  Amoled Display - Decent
Sound Experience is also Good but Camera Quality is superb'''

sentences = split_text(text)

# Output each split sentence
for idx, sentence in enumerate(sentences):
    print(f"Sentence {idx+1}: {sentence}")


Sentence 1: Design - Awesome Looking and
Sentence 2: Dimensity 7050 5 G Processor - Good   Amoled Display - Decent
Sentence 3: Sound Experience is also Good but
Sentence 4: Camera Quality is superb


In [295]:
def predict_sentences(text):
#     text = preprocess_text(text)
    
    sentences = split_text(text)
    
    rating =3
    c=0
    
    for text in sentences :
        
        print(text)
        print()
        
        predictions = predict_sentiments(text, loaded_model, tokenizer, label_encoder, device='cpu')

        for aspect, sentiment in predictions.items():
            print(f"Aspect: {aspect}, Sentiment: {sentiment}")
            
            if sentiment == 'positive':
                rating +=1
            elif sentiment == 'negative':
                rating -=1
            else:
                rating +=0.5
                
            c+=1
    print('Rating : ', rating+(5-c))

In [269]:
text = '''Design - Awesome Looking
Dimensity 7050 5G Processor - Good
Amoled Display - Decent
Sound Experience is also Good
Camera Quality - superB
Smoothly Running All Apps.'''
predict_sentences(text)

design   awesome looking
dimensity       g processor   good
amoled display   decent
sound experience is also good
camera quality   superb
smoothly running all apps 
design awesome looking dimensity g processor good amoled display decent sound experience also good camera quality superb smoothly running apps

Aspect: display, Sentiment: positive
Aspect: design, Sentiment: positive
Aspect: camera, Sentiment: positive
Rating :  5


In [270]:
text = '''it becomes too too too hot ,iam so disappointed with this brand . there is no difference between 4g and 5g . battery was weak iam charging twice a day,for basic level of usage itself.'''
predict_sentences(text)

it becomes too too too hot  iam so disappointed with this brand   there is no difference between  g and  g   battery was weak iam charging twice a day for basic level of usage itself 
becomes hot iam disappointed brand no difference g g battery weak iam charging twice day basic level usage

Aspect: battery, Sentiment: negative
Rating :  3


In [None]:
text = '''SERIOUSLY IT'S TOO GOOD.
GAME IS SMOOTH IN BGMI BASIS NO.NO.NO..IT'S OVER SMOOTH.
YOU FEEL ALWAYS COOL YOUR PHONE..CAMERA QUALITY..OHO.OHO.OHO..NICE'''
predict_sentences(text)

In [296]:
text = '''Nice phone this price range
Camera quality is not that good
Very bad display
Battery back poor'''
predict_sentences(text)

Nice phone this price range

Camera quality is not that good

Aspect: camera, Sentiment: negative
Very bad display

Aspect: display, Sentiment: negative
Battery back poor

Aspect: battery, Sentiment: negative
Rating :  2
