In [2]:
! pip install nltk
! pip install scikit-learn



In [3]:
import numpy as np
import pandas as pd

In [50]:
# Load the sentiment dataset from a hard-coded path (presumably a Google Drive mount in Colab — confirm).
# on_bad_lines='skip' makes pandas drop malformed rows without raising, so the number of
# rows loaded can be smaller than the number of lines in the CSV.
sentiment_data = pd.read_csv('/content/drive/MyDrive/NLP_Theory_DA/SentimentAnalysisDataset.csv', on_bad_lines='skip')

In [5]:
sentiment_data.head()

Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText
0,1,0,Sentiment140,is so sad for my APL frie...
1,2,0,Sentiment140,I missed the New Moon trail...
2,3,1,Sentiment140,omg its already 7:30 :O
3,4,0,Sentiment140,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,Sentiment140,i think mi bf is cheating on me!!! ...


In [6]:
len(sentiment_data)

1578612

In [75]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
import nltk

In [10]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [11]:
X = sentiment_data['SentimentText']
y = sentiment_data['Sentiment']

# Train-Test Split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 32)

# Data Pre-Processing (Removing Stop Words, and Vectorization of Data)

In [13]:
sw = set(stopwords.words('english'))

In [14]:
tokenize = WordPunctTokenizer()

In [15]:
def formatting_textual_details(text):
    """Normalise one raw text string before vectorisation.

    The text is split with the module-level ``tokenize`` tokenizer. A token is kept only
    when it is purely alphabetic and its lowercase form is not in the module-level
    stop-word set ``sw``.

    Args:
        text: The raw string to clean.

    Returns:
        The kept tokens, lowercased and joined with single spaces (an empty string
        when nothing survives the filter).
    """
    kept = []
    for token in tokenize.tokenize(text):
        lowered = token.lower()
        # Drop punctuation, numbers and mixed tokens first, then stop words.
        if token.isalpha() and lowered not in sw:
            kept.append(lowered)
    return ' '.join(kept)

In [20]:
X_train = X_train.apply(formatting_textual_details)
X_test = X_test.apply(formatting_textual_details)

In [21]:
# Build TF-IDF features from the cleaned text.
# The vocabulary and IDF weights are learned from the training split only (fit_transform);
# the test split is then projected with that already-fitted vectorizer (transform).
vectorize = TfidfVectorizer()
X_train_vectorized = vectorize.fit_transform(X_train)
X_test_vectorized = vectorize.transform(X_test)

# Naive Bayes Classifier

In [22]:
NB_Model = MultinomialNB()
NB_Model.fit(X_train_vectorized, y_train)

In [23]:
test_sentiment = 'It is a good day!'
test_sentiment_data = {'Sentiment': [test_sentiment], 'Value': [1]}
test_sentiment_df = pd.DataFrame(test_sentiment_data)
x = test_sentiment_df['Sentiment']
test_sentiment1 = x.apply(formatting_textual_details)
test_sentiment2 = vectorize.transform(test_sentiment1)
test_sentiment_pred = NB_Model.predict(test_sentiment2)
print(test_sentiment_pred[0])

1


In [24]:
test_sentiment = 'It is a bad day!'
test_sentiment_data = {'Sentiment': [test_sentiment], 'Value': [1]}
test_sentiment_df = pd.DataFrame(test_sentiment_data)
x = test_sentiment_df['Sentiment']
test_sentiment1 = x.apply(formatting_textual_details)
test_sentiment2 = vectorize.transform(test_sentiment1)
test_sentiment_pred = NB_Model.predict(test_sentiment2)
print(test_sentiment_pred[0])

0


In [25]:
y_pred = NB_Model.predict(X_test_vectorized)

In [26]:
accuracy = accuracy_score(y_test, y_pred)

In [27]:
print("Naive Bayes Accuracy is: ", accuracy * 100)

Naive Bayes Accuracy is:  76.09991046994831


In [28]:
cm = confusion_matrix(y_test, y_pred)

In [29]:
print("Naive Bayes Confusion Matrix: ")
print(cm)

Naive Bayes Confusion Matrix: 
[[185584  50615]
 [ 62572 174813]]


In [30]:
classification_report = classification_report(y_test, y_pred)

In [31]:
print("Naive Bayes Classification Report is: ")
print(classification_report)

Naive Bayes Classification Report is: 
              precision    recall  f1-score   support

           0       0.75      0.79      0.77    236199
           1       0.78      0.74      0.76    237385

    accuracy                           0.76    473584
   macro avg       0.76      0.76      0.76    473584
weighted avg       0.76      0.76      0.76    473584



# SVM Classifier

In [32]:
from sklearn.svm import SVC

In [36]:
# Fit a support vector classifier with an RBF kernel on the TF-IDF training features.
# NOTE(review): scikit-learn documents SVC fit time as scaling at least quadratically with
# the number of samples. The training split here is 70% of ~1.58M rows, so this cell may be
# impractically slow; LinearSVC or SGDClassifier are the documented large-data alternatives —
# confirm whether a linear model is acceptable for this comparison.
SVM_Model = SVC(kernel='rbf')
SVM_Model.fit(X_train_vectorized, y_train)

In [37]:
test_sentiment = 'It is a good day!'
test_sentiment_data = {'Sentiment': [test_sentiment], 'Value': [1]}
test_sentiment_df = pd.DataFrame(test_sentiment_data)
x = test_sentiment_df['Sentiment']
test_sentiment1 = x.apply(formatting_textual_details)
test_sentiment2 = vectorize.transform(test_sentiment1)
test_sentiment_pred = SVM_Model.predict(test_sentiment2)
print(test_sentiment_pred[0])

1


In [38]:
test_sentiment = 'It is a bad day!'
test_sentiment_data = {'Sentiment': [test_sentiment], 'Value': [1]}
test_sentiment_df = pd.DataFrame(test_sentiment_data)
x = test_sentiment_df['Sentiment']
test_sentiment1 = x.apply(formatting_textual_details)
test_sentiment2 = vectorize.transform(test_sentiment1)
test_sentiment_pred = SVM_Model.predict(test_sentiment2)
print(test_sentiment_pred[0])

0


In [40]:
y_pred = SVM_Model.predict(X_test_vectorized)

In [41]:
accuracy = accuracy_score(y_test, y_pred)

In [42]:
print("SVM Accuracy is: ", accuracy * 100)

SVM Accuracy is:  72.0663821900211


In [43]:
cm = confusion_matrix(y_test, y_pred)

In [44]:
print("SVM Confusion Matrix: ")
print(cm)

SVM Confusion Matrix: 
[[170482  62491]
 [ 69846 170765]]


In [46]:
classification_report = classification_report(y_test, y_pred)

In [47]:
print("SVM Classification Report is: ")
print(classification_report)

SVM Classification Report is: 
              precision    recall  f1-score   support

            0      0.71      0.73      0.72    232973
            1      0.73      0.71      0.72    240611

     accuracy                          0.72    473584
    macro avg      0.72      0.72      0.72    473584
 weighted avg      0.72      0.72      0.72    473584


# Bi-LSTM Classifier

In [48]:
! pip install tensorflow



In [49]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.tokenize import word_tokenize

In [51]:
X = sentiment_data['SentimentText']
y = sentiment_data['Sentiment']

In [52]:
le = LabelEncoder()
y = le.fit_transform(y)

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 32)

In [54]:
sw = set(stopwords.words('english'))

In [55]:
def formatting_textual_details(text):
    """Clean one raw text string for the Bi-LSTM tokenizer.

    This redefinition replaces the earlier helper of the same name. It splits the text
    with NLTK's ``word_tokenize`` instead of the ``WordPunctTokenizer`` instance, and
    otherwise applies the same filter: keep purely alphabetic tokens whose lowercase
    form is not in the module-level stop-word set ``sw``.

    Args:
        text: The raw string to clean.

    Returns:
        The kept tokens, lowercased and joined with single spaces.
    """
    alphabetic = (tok for tok in word_tokenize(text) if tok.isalpha())
    lowered = (tok.lower() for tok in alphabetic)
    return ' '.join(tok for tok in lowered if tok not in sw)

In [56]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [57]:
X_train = X_train.apply(formatting_textual_details)
X_test = X_test.apply(formatting_textual_details)

In [58]:
tokenizer = Tokenizer(num_words = 10000)
tokenizer.fit_on_texts(X_train)

In [59]:
X_train_sequential = tokenizer.texts_to_sequences(X_train)
X_test_sequential = tokenizer.texts_to_sequences(X_test)

In [60]:
X_train_padded = pad_sequences(X_train_sequential, maxlen = 50, padding = 'post')
X_test_padded = pad_sequences(X_test_sequential, maxlen = 50, padding = 'post')

In [61]:
# Binary sentiment classifier: embedding -> two stacked bidirectional LSTMs -> sigmoid unit.
# input_dim = 10000 matches the Tokenizer's num_words, and input_length = 50 matches the
# maxlen used when padding the sequences.
BiLSTM_Model = Sequential([
    Embedding(input_dim = 10000, output_dim = 100, input_length = 50),
    # return_sequences = True hands the full per-timestep output to the second LSTM layer.
    Bidirectional(LSTM(64, return_sequences = True)),
    Bidirectional(LSTM(32)),
    # Single sigmoid output, paired with the binary_crossentropy loss used at compile time.
    Dense(1, activation = 'sigmoid'),
])

In [62]:
BiLSTM_Model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [63]:
BiLSTM_Model.fit(X_train_padded, y_train, epochs = 5, batch_size = 64, validation_split = 0.3)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7acf3c2fe3b0>

In [68]:
# Smoke test: run one hand-written positive sentence through the Bi-LSTM pipeline.
test_sentiment = 'It is a good day!'
test_sentiment_data = {'Sentiment': [test_sentiment], 'Value': [1]}
test_sentiment_df = pd.DataFrame(test_sentiment_data)
x = test_sentiment_df['Sentiment']
test_sentiment1 = x.apply(formatting_textual_details)
test_sentiment2 = tokenizer.texts_to_sequences(test_sentiment1)
# Pad exactly as the training data was padded (maxlen = 50, padding = 'post'), so the
# model receives the same fixed-length input layout it was trained on.
test_sentiment2 = pad_sequences(test_sentiment2, maxlen = 50, padding = 'post')
test_sentiment_pred = BiLSTM_Model.predict(test_sentiment2)
# The model outputs a sigmoid probability; rounding turns it into the 0/1 class label.
print(test_sentiment_pred[0].round())

1


In [69]:
# Smoke test: run one hand-written negative sentence through the Bi-LSTM pipeline.
test_sentiment = 'It is a bad day!'
test_sentiment_data = {'Sentiment': [test_sentiment], 'Value': [1]}
test_sentiment_df = pd.DataFrame(test_sentiment_data)
x = test_sentiment_df['Sentiment']
test_sentiment1 = x.apply(formatting_textual_details)
# The Bi-LSTM consumes integer token sequences from the Keras tokenizer, not the TF-IDF
# matrix produced by `vectorize` (that one belongs to the Naive Bayes / SVM models).
test_sentiment2 = tokenizer.texts_to_sequences(test_sentiment1)
# Pad exactly as the training data was padded (maxlen = 50, padding = 'post').
test_sentiment2 = pad_sequences(test_sentiment2, maxlen = 50, padding = 'post')
test_sentiment_pred = BiLSTM_Model.predict(test_sentiment2)
# `.round()` on the prediction gives the 0/1 label; the previous `, round()` called the
# built-in with no arguments and raised TypeError.
print(test_sentiment_pred[0].round())

0


In [70]:
y_pred = BiLSTM_Model.predict(X_test_padded)



In [71]:
accuracy = accuracy_score(y_test, y_pred.round())

In [72]:
print("Bi-LSTM Accuracy is: ", accuracy * 100)

Bi-LSTM Accuracy is:  78.29571944998142


In [73]:
cm = confusion_matrix(y_test, y_pred.round())

In [74]:
print("Bi-LSTM Confusion Matrix: ")
print(cm)

Bi-LSTM Confusion Matrix: 
[[184891  51308]
 [ 51480 185905]]


In [76]:
classification_report = classification_report(y_test, y_pred.round())

In [77]:
print("Bi-LSTM Classification Report is: ")
print(classification_report)

Bi-LSTM Classification Report is: 
              precision    recall  f1-score   support

           0       0.78      0.78      0.78    236199
           1       0.78      0.78      0.78    237385

    accuracy                           0.78    473584
   macro avg       0.78      0.78      0.78    473584
weighted avg       0.78      0.78      0.78    473584

