<a href="https://colab.research.google.com/github/junaidp/InternalControlServer/blob/master/Sentimental_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import pandas as pd
import re
import nltk
import os
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from datasets import Dataset
import torch
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from google.colab import drive
from sklearn.metrics import accuracy_score, f1_score

In [9]:
# Download the NLTK stop-word corpus into the local nltk_data directory.
nltk.download('stopwords')
# English stop words kept as a set: clean_text tests every token for membership.
stop_words = set(stopwords.words('english'))
# Single shared Porter stemmer instance, used by clean_text.
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Loading the data (expects `movie.csv` to be present in `/content`)

In [10]:
# CSV with the reviews; it must provide a 'text' column and a 'label' column.
file_path = '/content/movie.csv'
data = pd.read_csv(file_path)

# Split the frame into the model input (review text) and the target (label).
texts, labels = data['text'], data['label']

Cleaning texts

In [11]:
def clean_text(text):
    """Normalise one review for tokenization.

    Steps: lower-case, strip every character that is not a-z or whitespace,
    drop tokens found in the module-level ``stop_words`` set, and stem the
    remaining tokens with the module-level ``stemmer``.

    Args:
        text: Raw review text. Non-string values (for example the NaN/None
            that pandas uses for empty CSV cells) are treated as empty text
            instead of raising ``AttributeError`` on ``.lower()``.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when nothing is left.
    """
    if not isinstance(text, str):
        return ''
    text = re.sub(r'[^a-z\s]', '', text.lower())
    # Filter and stem in a single pass instead of joining and re-splitting.
    return ' '.join(
        stemmer.stem(word) for word in text.split() if word not in stop_words
    )

# Run clean_text over every review; the cleaned Series replaces `texts`.
texts = texts.apply(clean_text)

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable between runs.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    random_state=42,
    test_size=0.2,
)

# Report how many examples ended up in each partition.
print("Training set size:", len(train_texts))
print("Testing set size:", len(test_texts))

KeyboardInterrupt: 

Tokenizing the texts

In [None]:
# WordPiece tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_text(text):
    """Encode a single text with the module-level ``tokenizer``.

    The text is truncated/padded to exactly 128 tokens and the result is
    returned as PyTorch tensors (``return_tensors='pt'``).
    """
    encoding = tokenizer(
        text,
        max_length=128,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )
    return encoding


# One encoding per example, in the same order as the corresponding text Series.
train_encodings = list(map(tokenize_text, train_texts))
test_encodings = list(map(tokenize_text, test_texts))

Using the Hugging Face datasets library to create a Dataset object

In [None]:
def _encodings_to_dataset(encodings, labels):
    """Build a ``Dataset`` with 'input_ids', 'attention_mask' and 'labels' columns.

    ``encodings`` is an iterable of per-example tokenizer outputs and ``labels``
    is the matching pandas Series. Each tensor is squeezed and converted to a
    plain list (presumably dropping the size-1 batch dimension that
    ``return_tensors='pt'`` adds for a single text — TODO confirm shape).
    """
    input_ids = []
    attention_masks = []
    for enc in encodings:
        input_ids.append(enc['input_ids'].squeeze().tolist())
        attention_masks.append(enc['attention_mask'].squeeze().tolist())
    return Dataset.from_dict({
        'input_ids': input_ids,
        'attention_mask': attention_masks,
        'labels': labels.tolist(),
    })


train_dataset = _encodings_to_dataset(train_encodings, train_labels)
test_dataset = _encodings_to_dataset(test_encodings, test_labels)

Define the Model

In [None]:

# Load the pretrained 'bert-base-uncased' weights with a sequence-classification
# head configured for two output labels.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Training the Model

In [None]:
# Set the WANDB_DISABLED flag before the Trainer is created so that
# Weights & Biases reporting stays off for this run.
os.environ.update({'WANDB_DISABLED': 'true'})
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is taken as the
            predicted class).

    Returns:
        Dict with the keys ``'eval_accuracy'`` and ``'eval_f1'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        'eval_accuracy': accuracy_score(y_true, y_pred),
        'eval_f1': f1_score(y_true, y_pred, average='weighted'),
    }


# The evaluation-schedule argument of TrainingArguments was renamed from
# `evaluation_strategy` to `eval_strategy`, and recent transformers releases
# reject the old keyword. Use whichever name the installed version declares so
# this cell runs on both old and new releases.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./results',            # checkpoints are written here
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,                  # learning-rate warm-up steps
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=250,
    # Evaluate every 250 steps. This equals the value eval_steps falls back to
    # (logging_steps) when unset; it is spelled out because
    # load_best_model_at_end needs save_steps to be a multiple of eval_steps.
    eval_steps=250,
    save_steps=1000,
    load_best_model_at_end=True,
    # Must match a key returned by compute_metrics.
    metric_for_best_model="eval_accuracy",
    **{_eval_strategy_arg: "steps"},
)


# Wire the model, hyper-parameters, datasets and metric function together.
# The held-out test split doubles as the evaluation set during training.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

# Start fine-tuning.
trainer.train()

Evaluating results

In [None]:
# Run an evaluation pass on the Trainer's eval_dataset and print the returned
# metrics (these include the values produced by compute_metrics).
results = trainer.evaluate()
print(results)

Testing the model

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)
# Inference mode: disables dropout so the prediction is deterministic.
model.eval()

# The model was trained on clean_text() output (lower-cased, stop words
# removed, stemmed), so the same preprocessing is applied here to avoid a
# train/inference mismatch.
sample_text = clean_text("movie was bad!")
inputs = tokenizer(
    sample_text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=128,  # same cap as used when encoding the training data
).to(device)

# No gradients are needed for a forward-only pass; skipping them saves memory.
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# Index of the highest-scoring class for the single input sentence.
predicted_class = logits.argmax(-1)
print(predicted_class.item())