In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m60.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m74.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m87.3 MB/s[0m eta [36m0:00:00[0m
Col

In [None]:
import transformers
from transformers import BertModel, BertTokenizer
import torch

# --- Configuration -----------------------------------------------------------
# Name of the pretrained checkpoint used for both the tokenizer and the encoder.
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
# Every input is padded / truncated to this many tokens.
MAX_LEN = 512

tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Use the first GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [None]:
from torch import nn
class SentimentClassifier(nn.Module):
  """BERT encoder followed by dropout and a linear classification layer.

  params:
  n_classes:int Number of output classes (size of the final linear layer)
  """

  def __init__(self, n_classes):
    super().__init__()
    # The attribute names (bert / drop / out) become the state_dict key
    # prefixes, so they must match the names used when the weights were saved.
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return the raw class scores (logits) for a batch of encoded inputs."""
    bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    # Element 1 of the BertModel output is the pooled representation.
    pooled = bert_outputs[1]
    return self.out(self.drop(pooled))

In [None]:
MODEL_PATH = '/content/drive/MyDrive/trained_model/bert_base_news_sentiment_v1.pth'

# Two output classes: index 0 and index 1.
model = SentimentClassifier(2)
# map_location lets a checkpoint that was saved on a GPU load on a CPU-only
# runtime as well (device already falls back to CPU when CUDA is unavailable).
# NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
# This notebook only runs inference: eval() switches off dropout so that
# predictions are deterministic instead of varying from run to run.
model = model.to(device).eval()

In [None]:
# Sample headline used to try the model on a single input.
news = "Huge scam in YES Bank for many years"

In [None]:
# Tokenize the sample headline: pad / truncate to MAX_LEN tokens and return
# PyTorch tensors (input ids + attention mask, no token type ids).
encoding = tokenizer.encode_plus(
    news,
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    add_special_tokens=True,
    return_attention_mask=True,
    return_token_type_ids=False,
    return_tensors='pt',
)


In [None]:
%%time
# Move the encoded sample to the same device as the model.
input_ids = encoding['input_ids'].to(device)
attention_mask = encoding['attention_mask'].to(device)

# Inference only: no_grad avoids building the autograd graph, which saves
# memory and time on a forward pass through BERT.
with torch.no_grad():
    output = model(input_ids, attention_mask)
# torch.max over dim=1 returns (max values, argmax indices); keep the indices.
_, prediction = torch.max(output, dim=1)

class_names = ['NEGETIVE', 'POSITIVE']
print(f'Review text: {news}')
# .item() turns the one-element tensor into a plain int before list indexing.
print(f'Sentiment  : {class_names[prediction.item()]}')

Review text: Huge scam in YES Bank for many years
Sentiment  : NEGETIVE
CPU times: user 927 ms, sys: 100 ms, total: 1.03 s
Wall time: 1.5 s


In [None]:
# Predicted class index as a plain Python number (the index used into class_names).
prediction.item()

0

In [None]:
import time
def get_sentiment(text):
    '''
    Classify a single piece of text with the loaded sentiment model.

    params:
    text:str Input text

    return:
    -1 when the model predicts class 0, 1 when it predicts class 1
    (None for any other class index)
    '''
    # Pad / truncate to MAX_LEN tokens and get PyTorch tensors back.
    encoded = tokenizer.encode_plus(
        text,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        add_special_tokens=True,
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='pt',
    )
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    # Forward pass without gradient tracking.
    with torch.no_grad():
        logits = model(ids, mask)

    # torch.max(..., dim=1) returns (values, indices); element 1 is the argmax.
    predicted_class = torch.max(logits, dim=1)[1].item()
    # Map class index -> signed sentiment score.
    return {0: -1, 1: 1}.get(predicted_class)


In [None]:
import pandas as pd
# Raw headlines as exported to Drive; kept untouched so this cell can be
# re-run without reloading or double-processing the data.
news_raw = pd.read_csv(r'/content/drive/MyDrive/dataset/news_dataset1.csv')

# Parse the timestamps as UTC and keep only the calendar day (YYYY-MM-DD string).
date_formatted = pd.to_datetime(news_raw['Date'], utc=True).dt.strftime('%Y-%m-%d')

# Drop the columns that are not needed and put the formatted date back as the
# last column under the name 'Date'. No in-place mutation is used.
news_df1 = (
    news_raw
    .drop(columns=['slno', 'Description', 'Date'])
    .assign(Date=date_formatted)
)

# ISO-formatted date strings sort chronologically, so a plain string comparison
# is enough here. .copy() makes the filtered frame independent of the original,
# so adding columns to it later does not raise SettingWithCopyWarning.
news_df1 = news_df1.loc[news_df1['Date'] > '2007-09-16'].copy()
news_df1

Unnamed: 0,Title,Date
0,"ATMs to become virtual bank branches, accept d...",2020-05-26
1,IDFC First Bank seniors to forgo 65% of bonus ...,2020-05-26
2,"Huge scam in YES Bank for many years, says Enf...",2020-05-25
3,"Bank of Maharashtra sanctioned Rs 2,789 cr in ...",2020-05-24
4,DCB Bank's profit before tax declines 37.6% to...,2020-05-23
...,...,...
37322,SC to decide on banks' prepayment fee,2007-09-17
37323,Clients line up for funds at Northern Rock,2007-09-17
37324,Tax outgo likely to drain liquidity; Re may dip,2007-09-17
37325,ICICI Lombard bid for IOC cover raises a stink,2007-09-17


In [None]:
# Spot-check: titles of the headlines dated 2020-05-25
# (row mask and column selected in a single .loc call).
print(news_df1.loc[news_df1['Date'] == '2020-05-25', 'Title'])

2    Huge scam in YES Bank for many years, says Enf...
Name: Title, dtype: object


In [None]:
# Collect every headline, then score them one by one with get_sentiment.
news = list(news_df1['Title'])
news_sentiment = list(map(get_sentiment, news))

In [None]:
# get_sentiment returns plain Python ints (-1 / 1), which have no .cpu()
# method, so calling it on every entry raises AttributeError on a fresh
# kernel. Only unwrap entries that are actually tensors (e.g. results left
# over from an older version of the helper); .item() works on any device.
news_df1['sentiment'] = [
    s.item() if isinstance(s, torch.Tensor) else s
    for s in news_sentiment
]

In [None]:
# Display the frame, now including the `sentiment` column.
news_df1

Unnamed: 0,Title,Date,sentiment
0,"ATMs to become virtual bank branches, accept d...",2020-05-26,1
1,IDFC First Bank seniors to forgo 65% of bonus ...,2020-05-26,1
2,"Huge scam in YES Bank for many years, says Enf...",2020-05-25,1
3,"Bank of Maharashtra sanctioned Rs 2,789 cr in ...",2020-05-24,1
4,DCB Bank's profit before tax declines 37.6% to...,2020-05-23,1
...,...,...,...
37322,SC to decide on banks' prepayment fee,2007-09-17,1
37323,Clients line up for funds at Northern Rock,2007-09-17,1
37324,Tax outgo likely to drain liquidity; Re may dip,2007-09-17,1
37325,ICICI Lombard bid for IOC cover raises a stink,2007-09-17,1


In [None]:
# Write the headlines together with their sentiment scores back to Drive.
# NOTE(review): to_csv writes the row index as an extra unnamed column by
# default -- confirm downstream readers expect it, or pass index=False.
news_df1.to_csv(r'/content/drive/MyDrive/dataset/news_dataset1_with_sentiment.csv')