<a href="https://colab.research.google.com/github/kirupapremakannan/Sentimental-Analysis_HackmastersUnited/blob/main/sentimental-analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read() call while downloading.
CHUNK_SIZE = 40960
# Comma-separated list of '<dataset directory>:<percent-encoded download URL>' entries;
# the download loop below splits on ',' and ':' and URL-decodes the second part.
# NOTE(review): the URL is a signed link carrying X-Goog-Date=20240430T094204Z and
# X-Goog-Expires=259200, so it has presumably expired and must be regenerated — confirm.
DATA_SOURCE_MAPPING = 'imdb-dataset-of-50k-movie-reviews:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F134715%2F320111%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240430%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240430T094204Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D4078757e086060ae17a73dc7d297746d89b94fbd775d4b1cde7b5a138e3dfa0b1160cb56f9fccc170f265d970c00bb9b9eedf6daccd0bd1e39a0d81006c9cbe0eef89393d0f8671aa90f0f5d469baecf142dc2f9a3ca59c6726c710583b46afc5e760450ece0069423c8aafc5f3e68f3633f264b9d0aedef1443e53831113ad60bb1d7ee3825cbfd6713d5cc0b295f35500c1a1d33e1f61b1e0afdb31fbc851d41c4fe49e04753936a515821dd84e6e446640070385d565195b1c6a273131727588384670562a202bef4e526f444a4e488eeba548319de9bbc3db492285a8162fc7f7ccec5f98dcf7d181c760c0ca12900568e3f7b2dfaaa43683d982f75db97'

# Directory the datasets are extracted into (one sub-directory per data source).
KAGGLE_INPUT_PATH='/kaggle/input'
# Working directory created alongside the input directory.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced by any of the visible cells.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every '<directory>:<encoded url>' entry of DATA_SOURCE_MAPPING into a
# temporary file and unpack it (zip or tar) under KAGGLE_INPUT_PATH/<directory>.
# A source that cannot be fetched is reported and skipped so the remaining
# sources are still processed.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only; everything after it is the encoded URL.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The header is absent for chunked responses; treat that as "size unknown"
            # instead of crashing on int(None).
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            while True:
                data = fileres.read(CHUNK_SIZE)
                if not data:
                    break
                dl += len(data)
                tfile.write(data)
                if total_length > 0:
                    # 50-character progress bar, clamped in case the server sends
                    # more bytes than it announced.
                    done = min(50, int(50 * dl / total_length))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes to disk: the tar branch re-opens the file by name
            # and would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here would
                # overwrite the module and break the next tar data source.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Load the IMDB review dataset extracted by the download cell and preview
# the first rows.
imdb_csv_path = '/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'
reviews = pd.read_csv(imdb_csv_path)
reviews.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Encode the sentiment column as integers: 'positive' -> 1, anything else -> 0.
# The guard keeps the cell idempotent: once the column is numeric, comparing it
# to 'positive' again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(reviews['sentiment']):
    reviews['sentiment'] = np.where(reviews['sentiment'] == 'positive', 1, 0)


In [None]:
# Pull the raw review texts and their encoded sentiment out of the frame as
# NumPy arrays (model inputs and targets respectively).
sentences, labels = (
    reviews[column].to_numpy() for column in ('review', 'sentiment')
)

In [None]:
# Hold out 25% of the reviews for testing. The fixed random_state makes the
# split (and therefore the fitted vocabulary and all later results) identical
# on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    sentences, labels, test_size=0.25, random_state=42
)
print("Training Data Input Shape: ", X_train.shape)
print("Training Data Output Shape: ", y_train.shape)
print("Testing Data Input Shape: ", X_test.shape)
print("Testing Data Output Shape: ", y_test.shape)

Training Data Input Shape:  (37500,)
Training Data Output Shape:  (37500,)
Testing Data Input Shape:  (12500,)
Testing Data Output Shape:  (12500,)


In [None]:
# Tokenizer settings: cap on the number of words it is configured with, and
# the placeholder token passed as `oov_token`.
vocab_size = 10000
oov_tok = "<OOV>"
tokenizer = Tokenizer(
    oov_token=oov_tok,
    num_words=vocab_size,
)


In [None]:
# Fit the tokenizer on the training reviews only, then report how many
# documents it saw and the configured `num_words` value.
tokenizer.fit_on_texts(X_train)
for caption, value in (
    ("Number of Documents: ", tokenizer.document_count),
    ("Number of Words: ", tokenizer.num_words),
):
    print(caption, value)

Number of Documents:  37500
Number of Words:  10000


In [None]:
# Show only the 20 largest entries of `word_counts` instead of dumping the
# whole vocabulary dictionary, which floods the notebook output.
sorted(tokenizer.word_counts.items(), key=lambda item: item[1], reverse=True)[:20]

OrderedDict([('eliza', 24),
             ('elizabeth', 318),
             ('moorman', 1),
             ('is', 158286),
             ('a', 242405),
             ('farm', 238),
             ('girl', 3884),
             ('from', 30292),
             ('the', 500649),
             ('country', 1340),
             ('coming', 1619),
             ('to', 201096),
             ('city', 1737),
             ('looking', 3795),
             ('for', 65610),
             ('love', 9636),
             ('she', 17922),
             ('has', 24778),
             ('met', 414),
             ('man', 8331),
             ('that', 102573),
             ('told', 1640),
             ('her', 25974),
             ('of', 217333),
             ('an', 32370),
             ('astrologist', 1),
             ('who', 30307),
             ('will', 13563),
             ('show', 9423),
             ('stars', 2369),
             ('this', 113215),
             ('journey', 670),
             ('souls', 196),
             ('put', 353

In [None]:

# Show only the 20 largest entries of `word_docs` instead of dumping the whole
# dictionary, which floods the notebook output.
sorted(tokenizer.word_docs.items(), key=lambda item: item[1], reverse=True)[:20]

defaultdict(int,
            {'things': 4460,
             'jones': 478,
             'away': 3608,
             'coming': 1506,
             'journey': 544,
             'his': 15930,
             'wretched': 91,
             'that': 29933,
             'is': 33531,
             'individuals': 190,
             'all': 19366,
             'farm': 197,
             'twisted': 301,
             'protect': 225,
             'finding': 502,
             'everyday': 246,
             'moorman': 1,
             'for': 26675,
             'weird': 844,
             'might': 3581,
             'to': 35219,
             'an': 18323,
             'beyond': 1269,
             'scare': 276,
             'or': 15457,
             'city': 1284,
             'it': 31882,
             'who': 16472,
             'when': 13446,
             'stars': 2091,
             'lee': 609,
             'her': 9398,
             'himself': 2614,
             'at': 19257,
             'full': 2370,
             'de

In [None]:
# Convert every training review into the tokenizer's integer sequence and
# preview the start of the first one (the full sequence is hundreds of ids long).
train_sequences = tokenizer.texts_to_sequences(X_train)
train_sequences[0][:20]

[1,
 2346,
 1,
 7,
 4,
 2957,
 254,
 37,
 2,
 693,
 565,
 6,
 2,
 535,
 259,
 17,
 113,
 60,
 46,
 1886,
 4,
 128,
 13,
 46,
 560,
 41,
 5,
 33,
 1,
 36,
 80,
 116,
 41,
 2,
 401,
 12,
 7,
 4,
 1263,
 5,
 3411,
 1,
 7,
 275,
 6,
 2,
 2171,
 68,
 60,
 966,
 41,
 293,
 113,
 52,
 60,
 1095,
 89,
 39,
 22,
 2706,
 2706,
 953,
 1330,
 7,
 4,
 4615,
 3652,
 266,
 6,
 167,
 318,
 9,
 25,
 2034,
 1,
 34,
 116,
 251,
 82,
 277,
 765,
 5,
 259,
 30,
 180,
 31,
 2,
 136,
 1,
 7,
 131,
 259,
 17,
 113,
 3,
 2706,
 7,
 266,
 6,
 2958,
 2,
 1,
 13,
 25,
 4089,
 465,
 21,
 12,
 7,
 4,
 2308,
 63,
 5,
 259,
 677,
 2,
 596,
 3,
 1509,
 905,
 9,
 2,
 605,
 2750,
 180,
 2,
 399,
 7,
 5661,
 3,
 1,
 357,
 5,
 3396,
 2,
 6647,
 351,
 1,
 236,
 2449,
 244,
 2,
 838,
 1,
 19,
 2,
 2621,
 742,
 3,
 2,
 220,
 6216,
 95,
 12,
 18,
 282,
 148,
 10,
 7,
 4,
 18,
 13,
 5249,
 2,
 4976,
 5,
 2,
 937,
 5984,
 3,
 8506,
 3293,
 12,
 28,
 1,
 37,
 2,
 2670,
 19,
 415,
 282,
 148]

In [None]:
# Bring every training sequence to exactly `sequence_length` ids: shorter
# reviews are padded at the end, longer ones are cut off at the end.
sequence_length = 200
train_padded = pad_sequences(
    train_sequences,
    maxlen=sequence_length,
    padding='post',
    truncating='post',
)


In [None]:
# Encode the held-out reviews with the tokenizer fitted on the training data
# and pad/truncate them the same way as the training sequences.
test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(
    test_sequences,
    maxlen=sequence_length,
    padding='post',
    truncating='post',
)


In [None]:
# Hyperparameters passed to the Embedding layer (second argument) and to the
# LSTM layer in the following cells.
embedding_dim, lstm_out = 16, 32

# Empty sequential model; layers are appended one per cell below.
model = Sequential()

In [None]:
# First layer: embedding built from the vocabulary size, embedding dimension
# and sequence length. Re-running this cell appends the layer again.
embedding_layer = Embedding(vocab_size, embedding_dim, input_length=sequence_length)
model.add(embedding_layer)

In [None]:
# Recurrent layer: an LSTM with `lstm_out` units wrapped in Bidirectional.
# Re-running this cell appends the layer again.
recurrent_layer = LSTM(lstm_out)
model.add(Bidirectional(recurrent_layer))

In [None]:
# Hidden fully connected layer with 10 ReLU units.
# Re-running this cell appends the layer again.
hidden_layer = Dense(10, activation='relu')
model.add(hidden_layer)

In [None]:
# Output layer: a single sigmoid unit, matching the 0/1 sentiment labels.
# Re-running this cell appends the layer again.
output_layer = Dense(1, activation='sigmoid')
model.add(output_layer)

In [None]:
# Configure training: binary cross-entropy loss for the single sigmoid output,
# the Adam optimizer, and accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer overview of the model assembled in the cells above.
model.summary()

In [None]:
# Training callbacks.
# The checkpoint path must name a file ending in `.keras`; passing the bare
# working directory raises "The filepath provided must end in `.keras`".
checkpoint_filepath = os.path.join(os.getcwd(), 'best_model.keras')
# Keep only the full model (not just weights) with the lowest validation loss.
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)
# EarlyStopping is created with patience=2 and is listed ahead of the checkpoint.
callbacks = [EarlyStopping(patience=2), model_checkpoint_callback]

ValueError: The filepath provided must end in `.keras` (Keras model format). Received: filepath=/kaggle/working