<a href="https://colab.research.google.com/github/linzhub/oasis/blob/main/Sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [3]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader,TensorDataset

In [5]:
df = pd.read_csv("/content/Twitter_Data.csv.zip")

In [6]:
df.head()


Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [8]:
df.shape

(162980, 2)

In [9]:
df = df.iloc[:5000,:]

In [10]:
df.shape

(5000, 2)

In [11]:
df.isna().sum()


Unnamed: 0,0
clean_text,1
category,0


In [13]:
df = df.dropna()
df.isna().sum()

Unnamed: 0,0
clean_text,0
category,0


In [14]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 4999 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   clean_text  4999 non-null   object 
 1   category    4999 non-null   float64
dtypes: float64(1), object(1)
memory usage: 117.2+ KB


In [15]:
df.describe()


Unnamed: 0,category
count,4999.0
mean,0.206441
std,0.773841
min,-1.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [17]:
from nltk.corpus import stopwords
import re

In [19]:
def func(x):
    x = re.sub("[^a-zA-Z]"," ",x)
    x = x.split()
    x = [i.strip() for i in x if i not in set(stopwords.words('english'))]
    return " ".join(x)

In [22]:
df.category = df.category.astype('int')

In [24]:
df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1
1,talk all the nonsense and continue all the dra...,0
2,what did just say vote for modi welcome bjp t...,1
3,asking his supporters prefix chowkidar their n...,1
4,answer who among these the most powerful world...,1


In [26]:
from collections import Counter

In [27]:
vals = df.clean_text.to_list()
texts = [i.split(" ") for i in vals]

In [29]:
def build_vocab(tokenized_data):
    all_words = [word for sentence in tokenized_data for word in sentence]
    word_counts = Counter(all_words)
    vocab = {'<PAD>': 0, '<UNK>': 1}
    vocab.update({word: idx + 2 for idx, (word, _) in enumerate(word_counts.items())})
    return vocab

In [31]:
vocab = build_vocab(texts)


In [32]:
def encode_text(tokenized_data, vocab):
    encoded_data = []
    for sentence in tokenized_data:
        encoded_sentence = [vocab.get(word, vocab['<UNK>']) for word in sentence]
        encoded_data.append(encoded_sentence)
    return encoded_data

encoded_data = encode_text(texts, vocab)
print("Encoded data:", encoded_data[0])

Encoded data: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 12, 16, 17, 18, 19, 20, 21, 22, 16, 23, 24, 25, 26, 24, 23, 27, 28, 24, 29]


In [33]:
import torch
from torch.nn.utils.rnn import pad_sequence

sequence_lengths = [len(seq) for seq in encoded_data]

encoded_tensors = [torch.tensor(seq) for seq in encoded_data]
padded_data = pad_sequence(encoded_tensors, batch_first=True, padding_value=vocab['<PAD>'])


In [35]:
target = df.category

In [37]:
class LSTMSentimentModel(nn.Module):
    def __init__(self,vocab_size,embed_dim,hid_dim,out_dim):
        super(LSTMSentimentModel,self).__init__()
        self.embedding = nn.Embedding(vocab_size,embed_dim,padding_idx=vocab['<PAD>'])
        self.lstm = nn.LSTM(embed_dim,hid_dim,batch_first=True)
        self.fc = nn.Linear(hid_dim,out_dim)

    def forward(self,text,lengths):
        embedded = self.embedding(text)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded,lengths,batch_first=True,enforce_sorted=False)
        packed_output,(hidden,cell) = self.lstm(packed_embedded)
        hidden = hidden[-1]
        output = self.fc(hidden)
        return output

In [39]:
labels = torch.tensor(target,dtype=torch.float)

In [40]:
lengths_tensor = torch.tensor(sequence_lengths)
dataset = TensorDataset(padded_data,lengths_tensor,labels)

In [41]:
dataloader = DataLoader(dataset,batch_size=2,shuffle=True)

In [42]:
vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 256
output_dim = 1

In [43]:
model = LSTMSentimentModel(vocab_size,embedding_dim,hidden_dim,output_dim)

In [46]:
citerion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters())

In [47]:
n_epochs = 5
for epoch in range(n_epochs):
    model.train()
    epoch_loss = 0
    for text_batch,lengths_batch,labels_batch in dataloader:
        optimizer.zero_grad()
        pred = model(text_batch,lengths_batch).squeeze(1)
        loss = citerion(pred,labels_batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(dataloader)
    print(f"Epoch {epoch+1} , Loss :{ avg_loss: .4f}")

Epoch 1 , Loss : 0.1602
Epoch 2 , Loss :-4.8136
Epoch 3 , Loss :-23.3084
Epoch 4 , Loss :-54.6896
Epoch 5 , Loss :-88.9462


In [48]:
text = "i look very good "
text = [vocab.get(i.lower(),vocab["<UNK>"]) for i in text.split(" ")]
sss = len(text)
text = torch.tensor(text).unsqueeze(0)


In [49]:
with torch.no_grad():
     op = model(text,lengths=[sss])


In [50]:
op = torch.sigmoid(op).round()


In [51]:
op

tensor([[1.]])