In [None]:
!pip install transformers scikit-learn pandas numpy matplotlib seaborn nltk gensim pyLDAvis



In [None]:
# Install necessary packages
!pip install transformers datasets tqdm



In [None]:
import inspect

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments


# Read the reviews file; `lines=True` means one JSON object per line (JSON Lines).
file_path = '/content/All_Beauty.jsonl'
reviews_df = pd.read_json(file_path, lines=True)

# Work on a fixed-seed random subsample of at most 3000 reviews. Capping `n`
# at the frame length keeps DataFrame.sample from raising ValueError when the
# file holds fewer rows than that.
reviews_df = reviews_df.sample(n=min(3000, len(reviews_df)), random_state=42)

# Quick look at the sampled rows.
print(reviews_df.head())

# Pretrained DistilBERT checkpoint with a 3-label sequence-classification head.
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=3)

def tokenize_function(examples):
    """Tokenize ``examples['text']`` with the module-level ``tokenizer``.

    Sequences are truncated to, and padded out to, a fixed length of 512 tokens.

    Args:
        examples: Mapping-like object exposing a ``'text'`` entry.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    fixed_length_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 512,
    }
    return tokenizer(examples['text'], **fixed_length_options)


# Hold out 20% of the sampled reviews for evaluation; the fixed seed keeps the
# split reproducible across runs.
train_df, test_df = train_test_split(
    reviews_df,
    test_size=0.2,
    random_state=42,
)


def rating_to_sentiment(rating):
    """Convert a star rating into a sentiment class id.

    Args:
        rating: Numeric rating.

    Returns:
        2 (positive) when the rating is 4 or higher, 1 (neutral) when it is
        exactly 3, and 0 (negative) for every other value.
    """
    if rating >= 4:
        return 2  # Positive
    # Only an exact 3 counts as neutral; everything else falls through to negative.
    return 1 if rating == 3 else 0

# Derive the integer class label from each review's star rating. `assign`
# builds a new frame instead of writing a column into the objects handed back
# by train_test_split, which avoids pandas' SettingWithCopyWarning and keeps
# this step safe to re-run.
train_df = train_df.assign(sentiment=train_df['rating'].apply(rating_to_sentiment))
test_df = test_df.assign(sentiment=test_df['rating'].apply(rating_to_sentiment))


class ReviewsDataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to an indexable
            collection holding one entry per example. Values may be tensors or
            plain Python sequences.
        labels: Indexable collection of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, including a ``'labels'`` entry."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labels)

    @staticmethod
    def _to_tensor(value):
        """Return an independent tensor copy of ``value``.

        ``torch.tensor(existing_tensor)`` emits a copy-construct UserWarning on
        every call, so tensors are copied with ``clone().detach()`` instead;
        anything else goes through ``torch.tensor`` as before.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)


# Both splits are tokenized with the same settings: pad to the longest sequence
# in the split, truncate at 512 tokens, and return PyTorch tensors.
_encode_options = dict(padding=True, truncation=True, max_length=512, return_tensors="pt")
train_encodings = tokenizer(train_df['text'].tolist(), **_encode_options)
test_encodings = tokenizer(test_df['text'].tolist(), **_encode_options)

# Pair each split's encodings with its sentiment labels.
train_dataset = ReviewsDataset(encodings=train_encodings, labels=train_df['sentiment'].tolist())
test_dataset = ReviewsDataset(encodings=test_encodings, labels=test_df['sentiment'].tolist())


# Use the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)


# transformers renamed `evaluation_strategy` to `eval_strategy` and later
# dropped the old spelling. The notebook installs an unpinned release, so pick
# whichever keyword the installed TrainingArguments actually accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Evaluate and checkpoint once per epoch so the best checkpoint (by
# eval_accuracy) can be restored when training finishes.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=200,
    learning_rate=3e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    report_to="none",
    **{_eval_strategy_key: "epoch"},
)


def compute_metrics(p):
    """Compute classification accuracy for a Trainer evaluation pass.

    The arrays are processed directly in numpy: moving them onto the training
    device just to take an argmax (and back again for scoring) adds transfers
    without changing the result, and tied this function to a global ``device``.

    Args:
        p: Pair ``(predictions, labels)``; ``predictions`` holds per-class
            scores along the last axis and ``labels`` the true class ids.

    Returns:
        ``{"eval_accuracy": fraction of examples whose highest-scoring class
        equals the label}`` as a Python float.
    """
    predictions, labels = p
    predicted_classes = np.argmax(np.asarray(predictions), axis=-1)
    accuracy = float(np.mean(predicted_classes == np.asarray(labels)))
    return {"eval_accuracy": accuracy}


# Wire the model, data and metric function into the Hugging Face Trainer.
trainer = Trainer(
    model=model, args=training_args,
    train_dataset=train_dataset, eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Run fine-tuning.
trainer.train()

# Persist the model weights first, then the tokenizer files, side by side so
# the directory can be reloaded as a unit.
for _artifact in (model, tokenizer):
    _artifact.save_pretrained('./sentiment_model')


In [None]:
import shutil

# Zip the saved model directory; make_archive appends the ".zip" suffix to base_name.
shutil.make_archive(
    base_name='/content/sentiment_model',
    format='zip',
    root_dir='./sentiment_model',
)

# Hand the archive to the browser (Colab-only helper).
from google.colab import files
files.download('/content/sentiment_model.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
def predict_sentiment(input_text):
    """Classify a piece of text with the fine-tuned module-level ``model``.

    Args:
        input_text: Text to classify.

    Returns:
        One of ``'Negative'``, ``'Neutral'`` or ``'Positive'``.
    """
    # Inference must run with dropout disabled; without this the prediction can
    # vary between calls if the model was left in training mode.
    model.eval()

    # Follow the model to whichever device it currently lives on instead of
    # trusting a separately maintained global.
    model_device = next(model.parameters()).device

    # Same 512-token truncation limit that was used to encode the training data.
    encoding = tokenizer(
        input_text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512,
    )
    encoding = {key: val.to(model_device) for key, val in encoding.items()}

    # No gradients are needed for a forward-only pass.
    with torch.no_grad():
        logits = model(**encoding).logits

    # The index of the highest logit is the class id (0, 1 or 2).
    predicted_class = torch.argmax(logits, dim=-1).item()
    return ['Negative', 'Neutral', 'Positive'][predicted_class]

# Smoke test: classify one hand-written review and show the predicted label.
input_text = "This is a excellent product"
predicted_sentiment = predict_sentiment(input_text)
print("Sentiment of input text: {}".format(predicted_sentiment))
