<a href="https://colab.research.google.com/github/hitz02/hitz02.github.io/blob/master/Sentiment_Analysis_Using_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [0]:
# !pip install transformers

In [0]:
df = pd.read_csv('K8_Reviews.csv')

In [0]:
df.head()

Unnamed: 0,sentiment,review
0,1,Good but need updates and improvements
1,0,"Worst mobile i have bought ever, Battery is dr..."
2,1,when I will get my 10% cash back.... its alrea...
3,1,Good
4,0,The worst phone everThey have changed the last...


In [0]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14675 entries, 0 to 14674
Data columns (total 2 columns):
sentiment    14675 non-null int64
review       14675 non-null object
dtypes: int64(1), object(1)
memory usage: 229.4+ KB


In [0]:
batch_1 = df[:2000]

In [0]:
batch_1['sentiment'].value_counts()

0    1053
1     947
Name: sentiment, dtype: int64

In [0]:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel,ppb.DistilBertTokenizer,'distilbert-base-uncased')

In [0]:
#load pretrained weights/tokenizer

tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [0]:
tokenized = batch_1['review'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [0]:
tok_val = [i[:200] for i in tokenized.values]

In [0]:
max_len = 200

# for i in tokenized.values:
#   if len(i) > max_len:
#     max_len = len(i)

In [0]:
max_len

200

In [0]:
#Padding
padded = np.array([i + [0]*(max_len - len(i)) for i in tok_val])

In [0]:
padded.shape

(2000, 200)

In [0]:
padded[0]

array([  101,  2204,  2021,  2342, 14409,  1998,  8377,   102,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [0]:
#Masking

attention_mask = np.where(padded != 0,1,0)

In [0]:
attention_mask.shape

(2000, 200)

In [0]:
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

In [0]:
with torch.no_grad():
  last_hidden_states = model(input_ids,attention_mask = attention_mask)

In [0]:
last_hidden_states[0].shape

torch.Size([2000, 200, 768])

In [0]:
features = last_hidden_states[0][:,0,:].numpy()

In [0]:
features.shape

(2000, 768)

In [0]:
labels = batch_1['sentiment']

In [0]:
X_train,X_test,y_train,y_test = train_test_split(features,labels)

In [0]:
lr_clf = LogisticRegression()
lr_clf.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
lr_clf.score(X_test,y_test)

0.862

In [0]:
from sklearn.dummy import DummyClassifier

In [0]:
clf = DummyClassifier()

In [0]:
scores = cross_val_score(clf,X_train,y_train)

In [0]:
scores.mean()

0.5106666666666667

In [0]:
scores.std()*2

0.022231109334044107