<a href="https://colab.research.google.com/github/ishankhurana27/-Delivery-Time-Predictor/blob/main/Twitter_Sentiment_(DistilBERT).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score

import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer,AutoModelForSequenceClassification

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

# Preprocessing

In [None]:
# Load the raw tweets, naming the two data columns 'text' and 'target'.
# NOTE(review): `names=` is given without `header=`, and the preview below shows
# the file's own header strings ('clean_text', 'category') arriving as the first
# data row; a later cell discards that row.
column_names=['text','target']
twitter_data=pd.read_csv('/content/Twitter_Data.csv',names=column_names)
twitter_data.head()

Unnamed: 0,text,target
0,clean_text,category
1,when modi promised “minimum government maximum...,-1
2,talk all the nonsense and continue all the dra...,0
3,what did just say vote for modi welcome bjp t...,1
4,asking his supporters prefix chowkidar their n...,1


In [None]:
twitter_data.isnull().sum()

Unnamed: 0,0
text,4
target,7


In [None]:
twitter_data=twitter_data.dropna()

In [None]:
twitter_data.isnull().sum()

Unnamed: 0,0
text,0
target,0


In [None]:
twitter_data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1,72249
0,55211
-1,35509
category,1


In [None]:
stemmer=PorterStemmer()

In [None]:
_STOPWORD_CACHE={}

def _english_stopwords():
  """Return the NLTK English stop words as a frozenset, built once on first use."""
  if 'english' not in _STOPWORD_CACHE:
    _STOPWORD_CACHE['english']=frozenset(stopwords.words('english'))
  return _STOPWORD_CACHE['english']

def stemming(text):
  """Reduce a text to a space-joined string of stemmed, non-stop-word tokens.

  Steps: replace every character that is not an ASCII letter with a space,
  lower-case, split on whitespace, drop English stop words, stem each remaining
  word with the module-level ``stemmer``.

  Args:
    text: input string.

  Returns:
    The stemmed words joined by single spaces ('' when nothing survives).
  """
  # Membership tests run against a cached set instead of rebuilding the
  # stop-word list for every single word.
  stop_words=_english_stopwords()
  # 'A-Z' (not 'A-z'): the wider range would also let [ \ ] ^ _ ` through.
  letters_only=re.sub('[^a-zA-Z]',' ',text).lower()
  return ' '.join(stemmer.stem(word) for word in letters_only.split() if word not in stop_words)


In [None]:
# stemming -> takes around 6 min
twitter_data['stemmed_data']=twitter_data['text'].apply(stemming)

In [None]:
# Discard the first row: per the earlier preview and value counts it holds the
# CSV's own header strings ('clean_text', 'category') rather than a tweet.
twitter_data=twitter_data.tail(-1)

In [None]:
# Save the stemmed frame to disk; the next cell reads it back.
# index=False keeps the row index out of the file; otherwise it is written as an
# unnamed column and comes back as 'Unnamed: 0' when the CSV is read again.
twitter_data.to_csv("preprocessed_data.csv",index=False)

In [None]:
twitter_data=pd.read_csv("/content/preprocessed_data.csv")

In [None]:
twitter_data.head()

Unnamed: 0.1,Unnamed: 0,text,target,stemmed_data
0,1,when modi promised “minimum government maximum...,-1,modi promis minimum govern maximum govern expe...
1,2,talk all the nonsense and continue all the dra...,0,talk nonsens continu drama vote modi
2,3,what did just say vote for modi welcome bjp t...,1,say vote modi welcom bjp told rahul main campa...
3,4,asking his supporters prefix chowkidar their n...,1,ask support prefix chowkidar name modi great s...
4,5,answer who among these the most powerful world...,1,answer among power world leader today trump pu...


In [None]:
twitter_data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1,72249
0,55211
-1,35509


In [None]:
# Remove every row whose target is 0 (presumably the neutral class — TODO confirm
# against the dataset description), leaving only the -1 and 1 rows.
# NOTE(review): run this exactly once and before the later cell that remaps -1
# to 0; re-running it after that remap would delete the remapped rows as well.
twitter_data=twitter_data.drop(twitter_data[twitter_data['target']==0].index)

In [None]:
twitter_data['target'].value_counts()

NameError: name 'twitter_data' is not defined

In [None]:
# Recode the -1 label as 0 in the target column.
twitter_data['target']=twitter_data['target'].replace({-1:0})

In [None]:
twitter_data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1,72249
0,35509


In [None]:
twitter_data.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
text,0
target,0
stemmed_data,0


In [None]:
twitter_data['stemmed_data']=twitter_data['stemmed_data'].fillna(' ')

In [None]:
twitter_data.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
text,0
target,0
stemmed_data,0


In [None]:
X=twitter_data['stemmed_data'].tolist()
y=twitter_data['target'].tolist()


In [None]:
X[0],y[0]

('modi promis minimum govern maximum govern expect begin difficult job reform state take year get justic state busi exit psu templ',
 0)

In [None]:
class customDataset(Dataset):
  """Dataset that pairs each text with its label and tokenises on access.

  Args:
    texts: indexable collection of inputs (each is passed through ``str``).
    labels: indexable collection of labels, aligned with ``texts`` by position.
    tokenizer: callable invoked as ``tokenizer(text, truncation=True,
      padding="max_length", max_length=max_len)``; its result must support
      lookup of 'input_ids' and 'attention_mask'.
    max_len: value forwarded to the tokenizer as ``max_length`` (default 512).
  """

  def __init__(self,texts,labels,tokenizer,max_len=512):
    self.texts,self.labels=texts,labels
    self.tokenizer,self.max_len=tokenizer,max_len

  def __len__(self):
    """Number of samples, taken from the length of ``texts``."""
    return len(self.texts)

  def __getitem__(self, index) :
    """Return the tokenizer output and label tensor for sample ``index``."""
    raw_text=str(self.texts[index])
    target=torch.tensor(self.labels[index])

    encoded=self.tokenizer(
        raw_text,
        truncation=True,
        padding="max_length",
        max_length=self.max_len,
    )

    sample={key: encoded[key] for key in ('input_ids','attention_mask')}
    sample['labels']=target
    return sample

In [None]:
torch.__version__

'2.6.0+cu124'

In [None]:
torch.cuda.is_available()

False

In [None]:
checkpoint='distilbert-base-uncased'
# Use the GPU when the runtime has one; otherwise fall back to the CPU.
device='cuda' if torch.cuda.is_available() else 'cpu'

# Tokenizer and model are loaded from the same checkpoint; num_labels=2 asks for
# a two-class sequence-classification model, which is then moved to `device`.
tokenizer= AutoTokenizer.from_pretrained(checkpoint)
model=AutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=2).to(device)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
dataset=customDataset(X,y,tokenizer)

In [None]:
# Split sample *indices* rather than the dataset object itself: handing the
# dataset to train_test_split makes it index every selected item, which runs the
# tokenizer over the whole corpus up front and keeps every padded encoding in
# memory. Subset views call dataset[i] only when an item is actually requested.
train_idx,test_idx=train_test_split(np.arange(len(dataset)),test_size=0.2,random_state=42)
train_data=torch.utils.data.Subset(dataset,train_idx.tolist())
test_data=torch.utils.data.Subset(dataset,test_idx.tolist())

In [None]:
dataset[0]

{'input_ids': [101,
  16913,
  2072,
  20877,
  2483,
  6263,
  21208,
  4555,
  21208,
  5987,
  4088,
  3697,
  3105,
  5290,
  2110,
  2202,
  2095,
  2131,
  2074,
  2594,
  2110,
  3902,
  2072,
  6164,
  8827,
  2226,
  8915,
  8737,
  2140,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


In [None]:
print(dataset[0].keys())
dataset[0].values()

dict_keys(['input_ids', 'attention_mask', 'labels'])


dict_values([[101, 16913, 2072, 20877, 2483, 6263, 21208, 4555, 21208, 5987, 4088, 3697, 3105, 5290, 2110, 2202, 2095, 2131, 2074, 2594, 2110, 3902, 2072, 6164, 8827, 2226, 8915, 8737, 2140, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
def compute_metrics(example):
  """Compute accuracy and weighted F1 for one set of predictions.

  Args:
    example: object exposing ``predictions`` (per-class scores; the predicted
      class is the argmax over the last axis) and ``label_ids`` (true labels).

  Returns:
    dict with keys 'accuracy' and 'f1'.
  """
  labels=example.label_ids
  preds=example.predictions.argmax(-1)
  f1=f1_score(labels,preds,average='weighted')
  # Both metrics are scored against the same `preds` array.
  acc=accuracy_score(labels,preds)

  return {'accuracy': acc,"f1":f1}


In [None]:
pip install transformers[torch]

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0->transformers[torch])
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0->transformers[torch])
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0->transformers[torch])
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0->transformers[torch])
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0->transformers[torch])
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0->transformers[torch])
  Downloading nvidia_cufft_cu12

In [None]:
from transformers import Trainer, TrainingArguments
batch_size=16  # used for both the train and eval per-device batch sizes below
# NOTE(review): despite the name, this string is later passed to
# trainer.save_model() and shutil.make_archive() as a directory path.
model_name="distilbert-base-uncased"

# Training configuration: output_dir 'output', a single epoch at a learning
# rate of 2e-5, and eval_strategy='epoch'.
args=TrainingArguments(
    output_dir='output',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=2e-5,
    num_train_epochs=1,
    eval_strategy='epoch'
)

In [None]:
# Bundle the model, training arguments, train/test splits, metric function and
# tokenizer into a Trainer.
# NOTE(review): constructing this Trainer emitted a warning in the recorded run;
# presumably the `tokenizer=` keyword is deprecated in the installed transformers
# version — confirm and switch to the replacement argument if so.
trainer=Trainer(
    model=model,
    args=args,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

  trainer=Trainer(


In [None]:
trainer.train()

# Saving and downloading the model

In [None]:
trainer.save_model(model_name)

In [None]:
# zipping
# Archive the directory named by `model_name` as a zip. The first argument is
# the literal base name 'model_name', so the file produced is 'model_name.zip'.
import shutil
shutil.make_archive('model_name','zip',model_name)

In [None]:
# downloading
from google.colab import files

# shutil.make_archive appends the format's extension, so the archive on disk is
# 'model_name.zip'; there is no file called just 'model_name'.
files.download('model_name.zip')

In [None]:
# unzipping
import shutil
shutil.unpack_archive('model_name.zip','/content/model')

# Testing model


In [None]:
tok=AutoTokenizer.from_pretrained('/content/model')
mod=AutoModelForSequenceClassification.from_pretrained('/content/model')

NameError: name 'AutoTokenizer' is not defined

In [None]:
# Display names for the two class indices the model predicts.
id2label={0:'Negative',1:'Positive'}

In [None]:
def get_sentiment(text):
  """Classify one piece of text with the reloaded tokenizer/model (`tok`, `mod`).

  Args:
    text: input string.

  Returns:
    dict with 'sentiment' (the ``id2label`` name of the highest-scoring class)
    and 'probability' (that class's softmax probability as a float).
  """
  # NOTE(review): the training texts were the stemmed, stop-word-filtered
  # strings, whereas this encodes the text as given — confirm whether the same
  # preprocessing should be applied here.
  # truncation=True lets the tokenizer cut over-long inputs instead of passing
  # them to the model at full length.
  input_ids=tok.encode(text,return_tensors='pt',truncation=True)

  # Inference only: no gradients are needed for the forward pass.
  with torch.no_grad():
    output=mod(input_ids)

  pred=torch.nn.functional.softmax(output.logits,dim=-1)
  prob=torch.max(pred).item()
  idx=torch.argmax(pred).item()
  sentiment=id2label[idx]

  return {"sentiment":sentiment,"probability": prob}

In [None]:
get_sentiment("I will not go home but will party.")

{'sentiment': 'Positive', 'probability': 0.7758506536483765}