<a href="https://colab.research.google.com/github/harshitneverdebugs/Sentiment-Analysis/blob/main/SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the dataset CSV is read from there below.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
pip install tensorflow

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from numpy import array

from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dropout, Dense, Flatten, GlobalMaxPooling1D, Embedding, Conv1D, LSTM
from sklearn.model_selection import train_test_split

In [None]:
# Location of the IMDB reviews CSV on the mounted Drive.
DATA_PATH = "/content/drive/MyDrive/IMDB Dataset.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
import seaborn as sns
# Bar chart of how many rows carry each value of the 'sentiment' column.
sns.countplot(x='sentiment', data=df)

In [None]:
# Matches a single HTML tag: '<', one or more non-'>' characters, then '>'.
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    '''Replaces every HTML tag in `text` with a single space.

    A space (rather than an empty string) is substituted so that words
    separated only by a tag, e.g. "great<br />movie", are not glued together
    into one token. Surrounding whitespace is left untouched.

    Args:
        text: the string to clean.

    Returns:
        The string with each tag replaced by one space character.
    '''
    return TAG_RE.sub(' ', text)

In [None]:
import nltk
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads from.
nltk.download('stopwords')

In [None]:
_STOPWORDS_PATTERN = None  # compiled lazily on first use, then shared by every call


def _get_stopwords_pattern():
    '''Returns the compiled English stop-word regex, building it only once.'''
    global _STOPWORDS_PATTERN
    if _STOPWORDS_PATTERN is None:
        # Reading the stop-word list and joining it into one alternation is
        # loop-invariant work, so it is done a single time instead of per review.
        _STOPWORDS_PATTERN = re.compile(
            r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*'
        )
    return _STOPWORDS_PATTERN


def preprocess_text(sen):
    '''Cleans one raw review string for tokenization.

    Steps, in order: lowercase, strip HTML tags via remove_tags, replace every
    non-letter character with a space, drop single letters that are surrounded
    by whitespace, collapse whitespace runs to one space, and finally delete
    English stop words (together with the whitespace that follows them).

    Args:
        sen: the raw review text.

    Returns:
        The cleaned text as a single string.
    '''
    sentence = sen.lower()
    sentence = remove_tags(sentence)
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence)

    return _get_stopwords_pattern().sub('', sentence)


In [None]:
# Clean every review in the 'review' column; X keeps the original row order.
sentences = list(df['review'])
X = [preprocess_text(sen) for sen in sentences]

In [None]:
# Spot-check one cleaned review.
X[5]

In [None]:
# Binary labels: 1 for "positive", 0 for any other value of the 'sentiment' column.
Y = np.array([1 if label == "positive" else 0 for label in df['sentiment']])

In [None]:
# Hold out 20% of the reviews for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Build the vocabulary from the training reviews only; the test reviews are
# encoded with the same fitted tokenizer.
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(X_train)

# NOTE(review): X_train / X_test are overwritten here (text -> integer sequences),
# so re-running this cell without first re-running the split cell would fit the
# tokenizer on already-encoded data.
X_train = word_tokenizer.texts_to_sequences(X_train)
X_test = word_tokenizer.texts_to_sequences(X_test)

In [None]:
# Number of rows needed in the embedding matrix: one per word index plus one
# extra row (presumably for index 0, which Keras' Tokenizer does not assign
# to any word -- confirm against the Tokenizer docs).
vocab_length = len(word_tokenizer.word_index) + 1
vocab_length

In [None]:
# Every review is brought to exactly `maxlen` token ids; shorter ones are
# padded at the end ('post').
maxlen = 100
X_train = pad_sequences(X_train, maxlen=maxlen, padding='post')
X_test = pad_sequences(X_test, maxlen=maxlen, padding='post')

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

In [None]:
!unzip glove*.zip

In [None]:
# List the working directory and print its path (presumably a sanity check
# that the GloVe files were extracted here).
!ls
!pwd

In [None]:
from numpy import asarray
from numpy import zeros

# Maps each token in the GloVe file to its float32 vector.
embeddings_dictionary = dict()

# The context manager closes the file even if a line fails to parse.
with open('glove.6B.100d.txt', encoding="utf8") as glove_file:
  for line in glove_file:
    # Each line is whitespace-separated: the token first, then its components.
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions

In [None]:
# One 100-dimensional row per tokenizer index; rows for words that have no
# GloVe vector stay all-zero.
embedding_matrix = zeros((vocab_length, 100))
for token, row in word_tokenizer.word_index.items():
  glove_vector = embeddings_dictionary.get(token)
  if glove_vector is None:
    continue
  embedding_matrix[row] = glove_vector

In [None]:
# Expected to be (vocab_length, 100), matching the zeros() call that created it.
embedding_matrix.shape

Simple Neural Network

In [None]:
# Baseline: frozen GloVe embeddings, flattened, into a single sigmoid unit.
embedding_layer = Embedding(
    vocab_length,
    100,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=False,
)
snn_model = Sequential([
    embedding_layer,
    Flatten(),
    Dense(1, activation='sigmoid'),
])

In [None]:
snn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
snn_model.summary()

In [None]:
# Train for 6 epochs; validation_split sets aside 20% of the training data for
# the per-epoch validation metrics.
snn_model_history = snn_model.fit(
    X_train,
    Y_train,
    batch_size=128,
    epochs=6,
    verbose=1,
    validation_split=0.2,
)

In [None]:
# Evaluate on the held-out test split; the cell below reads score[0] as the
# test score and score[1] as the test accuracy.
score=snn_model.evaluate(X_test, Y_test, verbose=1)

In [None]:
# Report the first two entries of `score`, one per line.
for label, value in zip(("Test score:", "Test accuracy:"), score):
    print(label, value)

In [None]:
import matplotlib.pyplot as plt

# One figure per metric. The second curve in each figure is the 'val_*' series
# recorded during fit() on its validation_split hold-out -- not the test set --
# so the legend labels it 'validation'.
for metric, title, ylabel in (('acc', 'model accuracy', 'accuracy'),
                              ('loss', 'model loss', 'loss')):
    plt.plot(snn_model_history.history[metric])
    plt.plot(snn_model_history.history['val_' + metric])

    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

Convolutional Neural Network

In [None]:
# Import from tensorflow.keras, like every other layer in this notebook, so the
# standalone `keras` package is not mixed with the tf.keras namespace.
from tensorflow.keras.layers import Conv1D

In [None]:
# 1-D CNN: frozen GloVe embeddings, 128 filters of width 5, global max pooling,
# then a single sigmoid unit.
embedding_layer = Embedding(
    vocab_length,
    100,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=False,
)
cnn_model = Sequential([
    embedding_layer,
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(1, activation='sigmoid'),
])

In [None]:
cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
cnn_model.summary()

In [None]:
# Train for 6 epochs; validation_split sets aside 20% of the training data for
# the per-epoch validation metrics.
cnn_model_history = cnn_model.fit(
    X_train,
    Y_train,
    batch_size=128,
    epochs=6,
    verbose=1,
    validation_split=0.2,
)

In [None]:
# Evaluate on the held-out test split; the cell below reads score[0] as the
# test score and score[1] as the test accuracy.
score = cnn_model.evaluate(X_test, Y_test, verbose=1)

In [None]:
# Report the first two entries of `score`, one per line.
for label, value in zip(("Test Score:", "Test Accuracy:"), score):
    print(label, value)

In [None]:
import matplotlib.pyplot as plt

# One figure per metric. The second curve in each figure is the 'val_*' series
# recorded during fit() on its validation_split hold-out -- not the test set --
# so the legend labels it 'validation'.
for metric, title, ylabel in (('acc', 'model accuracy', 'accuracy'),
                              ('loss', 'model loss', 'loss')):
    plt.plot(cnn_model_history.history[metric])
    plt.plot(cnn_model_history.history['val_' + metric])

    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

Recurrent Neural Network (LSTM)

In [None]:
# Recurrent model: frozen GloVe embeddings, one 128-unit LSTM, then a single
# sigmoid unit.
embedding_layer = Embedding(
    vocab_length,
    100,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=False,
)
lstm_model = Sequential([
    embedding_layer,
    LSTM(128),
    Dense(1, activation='sigmoid'),
])

In [None]:
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
lstm_model.summary()

In [None]:
# Train for 6 epochs; validation_split sets aside 20% of the training data for
# the per-epoch validation metrics.
lstm_model_history = lstm_model.fit(
    X_train,
    Y_train,
    batch_size=128,
    epochs=6,
    verbose=1,
    validation_split=0.2,
)

In [None]:
# Evaluate on the held-out test split; the cell below reads score[0] as the
# test score and score[1] as the test accuracy.
score = lstm_model.evaluate(X_test, Y_test, verbose=1)

In [None]:
# Report the first two entries of `score`, one per line.
for label, value in zip(("Test Score:", "Test Accuracy:"), score):
    print(label, value)

In [None]:
import matplotlib.pyplot as plt

# One figure per metric. The second curve in each figure is the 'val_*' series
# recorded during fit() on its validation_split hold-out -- not the test set --
# so the legend labels it 'validation'.
for metric, title, ylabel in (('acc', 'model accuracy', 'accuracy'),
                              ('loss', 'model loss', 'loss')):
    plt.plot(lstm_model_history.history[metric])
    plt.plot(lstm_model_history.history['val_' + metric])

    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()