In [1]:
#import zipfile
#import os

# Path to your zip file
#zip_file = "Sentiment_Analysis.zip"
#extract_to = "Sentiment_Analysis"  # Folder to extract into

# Create folder if not exists
#os.makedirs(extract_to, exist_ok=True)

# Unzip the file
#with zipfile.ZipFile(zip_file, 'r') as zip_ref:
 #   zip_ref.extractall(extract_to)

#print("✅ Dataset extracted to:", extract_to)

In [2]:
# 1.Import necessary libraries
import os
import re
import glob
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import classification_report
import pickle

# Initialize pandarallel for faster processing
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

# Download stopwords from NLTK
import nltk
nltk.download('stopwords')

# Create a set of English stopwords
stop_words = set(stopwords.words('english'))

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


[nltk_data] Downloading package stopwords to C:\Users\Lakshatha
[nltk_data]     S\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
#2.Base dataset path (adjust if needed)
base_path = "Sentiment_Analysis/Sentiment_Analysis"

# Paths for train/test/unsupervised folders
train_pos = os.path.join(base_path, "train/pos")
train_neg = os.path.join(base_path, "train/neg")
train_unsup = os.path.join(base_path, "train/unsup")
test_pos = os.path.join(base_path, "test/pos")
test_neg = os.path.join(base_path, "test/neg")

# Function to load labeled data
def load_labeled_reviews(folder, label):
    """Load all reviews from folder and assign a label."""
    files = glob.glob(os.path.join(folder, "*.txt"))
    data = []
    for f in files:
        with open(f, encoding='utf-8') as file:
            data.append((file.read(), label))
    return data

# Function to load unsupervised (unlabeled) data
def load_unsupervised_data(folder):
    """Load all reviews from folder without labels."""
    files = glob.glob(os.path.join(folder, "*.txt"))
    texts = []
    for f in files:
        with open(f, encoding='utf-8') as file:
            texts.append(file.read())
    return texts

# Function to clean text
def clean_text(text):
    """Lowercase, remove HTML, punctuation, numbers, and stopwords."""
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)                
    text = re.sub(r'[^a-z\s]', '', text)             
    text = re.sub(r'\s+', ' ', text).strip()         
    tokens = text.split()
    tokens = [t for t in tokens if t not in stop_words]
    return ' '.join(tokens)



In [4]:

#3.Load all labeled and unlabeled data
print("Loading labeled train and test data...")
train_pos_data = load_labeled_reviews(train_pos, 1)
train_neg_data = load_labeled_reviews(train_neg, 0)
test_pos_data = load_labeled_reviews(test_pos, 1)
test_neg_data = load_labeled_reviews(test_neg, 0)

print("Loading unsupervised train data...")
unsup_texts = load_unsupervised_data(train_unsup)

# Combine labeled data into DataFrames
train_data = train_pos_data + train_neg_data
test_data = test_pos_data + test_neg_data

train_df = pd.DataFrame(train_data, columns=["review", "label"])
test_df = pd.DataFrame(test_data, columns=["review", "label"])

print(f"Train set size: {train_df.shape}")
print(f"Test set size: {test_df.shape}")
print(f"Unsupervised samples count: {len(unsup_texts)}")

Loading labeled train and test data...
Loading unsupervised train data...
Train set size: (25000, 2)
Test set size: (25000, 2)
Unsupervised samples count: 50000


In [8]:
# 4.Clean all datasets
# 4. Clean all datasets without pandarallel (portable version)

print("Cleaning train reviews...")
train_df['cleaned'] = train_df['review'].apply(clean_text)  # Normal apply instead of parallel

print("Cleaning test reviews...")
test_df['cleaned'] = test_df['review'].apply(clean_text)    # Normal apply

print("Cleaning unsupervised reviews...")
unsup_cleaned = [clean_text(text) for text in unsup_texts]   # List comprehension

print("Cleaning complete.")


Cleaning train reviews...
Cleaning test reviews...
Cleaning unsupervised reviews...
Cleaning complete.


In [9]:
# 5.Train Word2Vec embeddings on unsupervised data
print("Training Word2Vec model on unsupervised data...")
tokenized_unsup = [text.split() for text in unsup_cleaned]

w2v_model = Word2Vec(
    sentences=tokenized_unsup,
    vector_size=100,
    window=5,
    min_count=5,
    workers=os.cpu_count(),
    epochs=5
)

w2v_model.save("word2vec_unsup.model")
print("Word2Vec model trained and saved.")

Training Word2Vec model on unsupervised data...
Word2Vec model trained and saved.


In [10]:

# 6.Tokenizer for converting text to numeric sequences
print("Preparing tokenizer and sequences...")
all_texts = pd.concat([train_df['cleaned'], test_df['cleaned']])

tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_texts)

# Convert to sequences and pad
X_train_seq = tokenizer.texts_to_sequences(train_df['cleaned'])
X_test_seq = tokenizer.texts_to_sequences(test_df['cleaned'])

max_len = 200
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

# Labels
y_train = train_df['label'].values
y_test = test_df['label'].values

print(f"Tokenizer vocab size: {len(tokenizer.word_index) + 1}")


Preparing tokenizer and sequences...
Tokenizer vocab size: 214479


In [11]:
# 7.Create embedding matrix from Word2Vec model
print("Creating embedding matrix...")
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100

embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, idx in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[idx] = w2v_model.wv[word]
    else:
        embedding_matrix[idx] = np.random.normal(size=(embedding_dim,))

print("Embedding matrix created.")

Creating embedding matrix...
Embedding matrix created.


In [13]:
# 8. Build a BiLSTM model for sentiment classification
print("Building BiLSTM model...")

model = Sequential([
    Embedding(input_dim=vocab_size,
              output_dim=embedding_dim,
              weights=[embedding_matrix],
              input_length=max_len,      # ✅ Added so model knows input size
              trainable=False),
    Bidirectional(LSTM(128)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Build model explicitly to show correct shapes
model.build(input_shape=(None, max_len))  # ✅ This builds the model before summary

# Show model summary
model.summary()


Building BiLSTM model...




In [14]:
# 9.Train initial model with callbacks
print("Training initial model...")
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-6)

history = model.fit(
    X_train_pad,
    y_train,
    validation_split=0.1,
    epochs=10,
    batch_size=64,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)

Training initial model...
Epoch 1/10
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m209s[0m 539ms/step - accuracy: 0.7426 - loss: 0.5316 - val_accuracy: 0.8552 - val_loss: 0.3219 - learning_rate: 0.0010
Epoch 2/10
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m183s[0m 521ms/step - accuracy: 0.8130 - loss: 0.4368 - val_accuracy: 0.8572 - val_loss: 0.3039 - learning_rate: 0.0010
Epoch 3/10
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m189s[0m 537ms/step - accuracy: 0.8559 - loss: 0.3445 - val_accuracy: 0.8348 - val_loss: 0.3872 - learning_rate: 0.0010
Epoch 4/10
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m186s[0m 528ms/step - accuracy: 0.8705 - loss: 0.3154 - val_accuracy: 0.8724 - val_loss: 0.3307 - learning_rate: 0.0010
Epoch 5/10
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 542ms/step - accuracy: 0.8830 - loss: 0.2828 - val_accuracy: 0.8688 - val_loss: 0.3422 - learning_rate: 5.0000e-04


In [15]:
# 10.Predict on all unsupervised data and use all confident predictions
print("Semi-supervised learning: predicting on unsupervised data...")

unsup_seq = tokenizer.texts_to_sequences(unsup_cleaned)
unsup_pad = pad_sequences(unsup_seq, maxlen=max_len, padding='post')

unsup_preds = model.predict(unsup_pad).flatten()

# Select all confident predictions
conf_pos_idx = np.where(unsup_preds > 0.9)[0]
conf_neg_idx = np.where(unsup_preds < 0.1)[0]

print(f"Confident positive samples found: {len(conf_pos_idx)}")
print(f"Confident negative samples found: {len(conf_neg_idx)}")

# Add confident samples to training data
new_texts = [unsup_cleaned[i] for i in np.concatenate([conf_pos_idx, conf_neg_idx])]
new_labels = [1]*len(conf_pos_idx) + [0]*len(conf_neg_idx)

new_seq = tokenizer.texts_to_sequences(new_texts)
new_pad = pad_sequences(new_seq, maxlen=max_len, padding='post')
new_labels = np.array(new_labels)

X_train_pad = np.vstack([X_train_pad, new_pad])
y_train = np.concatenate([y_train, new_labels])

print("Retraining model with extended data...")
history2 = model.fit(
    X_train_pad,
    y_train,
    validation_split=0.1,
    epochs=5,
    batch_size=64,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)],
    verbose=1
)

Semi-supervised learning: predicting on unsupervised data...
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 78ms/step
Confident positive samples found: 9305
Confident negative samples found: 14528
Retraining model with extended data...
Epoch 1/5
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m379s[0m 551ms/step - accuracy: 0.9215 - loss: 0.2093 - val_accuracy: 0.9992 - val_loss: 0.0155
Epoch 2/5
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m373s[0m 543ms/step - accuracy: 0.9282 - loss: 0.1891 - val_accuracy: 0.9965 - val_loss: 0.0349
Epoch 3/5
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m374s[0m 545ms/step - accuracy: 0.9332 - loss: 0.1762 - val_accuracy: 0.9994 - val_loss: 0.0243
Epoch 4/5
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m379s[0m 551ms/step - accuracy: 0.9341 - loss: 0.1689 - val_accuracy: 0.9971 - val_loss: 0.0451


In [16]:
# 11.Evaluate model performance
print("Evaluating final model...")
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f"Test accuracy: {accuracy:.4f}")

y_pred = (model.predict(X_test_pad) > 0.5).astype(int)
print(classification_report(y_test, y_pred))

Evaluating final model...
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 87ms/step - accuracy: 0.8662 - loss: 0.3436
Test accuracy: 0.8638
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 84ms/step
              precision    recall  f1-score   support

           0       0.86      0.87      0.86     12500
           1       0.87      0.86      0.86     12500

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000



In [17]:
# 12.Save the trained model and tokenizer for future use
print("Saving model and tokenizer...")
model.save('sentiment_bilstm.h5')

with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

print("Saved successfully.")




Saving model and tokenizer...
Saved successfully.


In [18]:
# 13.Load model and tokenizer (use when re-running notebook later)
print("Loading saved model and tokenizer...")
loaded_model = load_model('sentiment_bilstm.h5')

with open('tokenizer.pkl', 'rb') as f:
    loaded_tokenizer = pickle.load(f)

print("Loaded successfully.")


# Function to predict sentiment of a custom text
def predict_sentiment(text):
    cleaned = clean_text(text)
    seq = loaded_tokenizer.texts_to_sequences([cleaned])
    pad_seq = pad_sequences(seq, maxlen=max_len, padding='post')
    prediction = loaded_model.predict(pad_seq)[0][0]
    sentiment = "Positive" if prediction > 0.5 else "Negative"
    return sentiment, prediction


Loading saved model and tokenizer...




Loaded successfully.


In [19]:
# 14.Example predictions
examples = [
    "The movie was absolutely fantastic! Great acting and story.",
    "This was the worst film I have ever seen. Total waste of time."
]

for ex in examples:
    sentiment, score = predict_sentiment(ex)
    print(f"Review: {ex}")
    print(f"Predicted Sentiment: {sentiment} (Score: {score:.4f})\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 850ms/step
Review: The movie was absolutely fantastic! Great acting and story.
Predicted Sentiment: Positive (Score: 0.9939)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
Review: This was the worst film I have ever seen. Total waste of time.
Predicted Sentiment: Negative (Score: 0.0019)

