<a href="https://colab.research.google.com/github/keerthanab2201/Sentiment-Analysis-using-Deep-Learning/blob/main/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## CNN-LSTM Model

## Data Collection

In [None]:
# Mount Google Drive so files under /content/drive/MyDrive can be read by later cells
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

In [None]:
# Load csv file and preview
# pandas must be bound to the alias `pd`: the read_csv call below (and every
# later cell) refers to it by that name, so a bare `import pandas` raises
# NameError on a fresh kernel.
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/datasets/Amazon-Product-Reviews-Sentiment-Analysis-in-Python-Dataset.csv")
print(df.head())

In [None]:
# Save as a JSON Lines file (records format, one JSON object per line)
json_path = "amazon_reviews_data.json"
df.to_json(json_path, orient="records", lines=True)
# Report the path that was actually written so the message cannot drift from the file name
print(f"✅ Conversion complete: Saved as {json_path}")

## Data Pre-Processing

* lowercase
* stopword removal
* punctuation removal
* one word review removal
* contraction expansion
* tokenization
* part of speech tagging

In [None]:
# installing dependencies
# Third-party packages imported by the later cells: contractions, textblob,
# gensim and bs4 (beautifulsoup4). Versions are not pinned here.
!pip install contractions textblob gensim beautifulsoup4
# Run TextBlob's corpora downloader module
!python -m textblob.download_corpora

In [None]:
# Download the NLTK resources used by the preprocessing pipeline and build the
# English stopword set.
import nltk
# Imported here because the notebook's shared import cell comes after this one;
# without it `stopwords` is undefined when the notebook is run top to bottom.
from nltk.corpus import stopwords

# "punkt_tab" and "averaged_perceptron_tagger_eng" are the resource names that
# recent NLTK releases look up for word_tokenize / pos_tag; the older names are
# kept so the cell also works on earlier releases.
for resource in (
    "punkt",
    "punkt_tab",
    "stopwords",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
):
    nltk.download(resource)

stop_words = set(stopwords.words("english"))

In [None]:
# Step 2: Import Modules
import re
import pandas as pd
import nltk
import numpy as np
import gensim
from bs4 import BeautifulSoup
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from contractions import fix as expand_contractions
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Read the line-delimited JSON export back into a DataFrame and list its columns
json_source = "amazon_reviews_data.json"
df = pd.read_json(json_source, lines=True)
print(df.columns)

In [None]:
# Keep only the review text and its sentiment label, discard rows where either
# is missing, then normalize the column names to "text" / "rating".
text_col, label_col = "Review", "Sentiment"
selected = df.loc[:, [text_col, label_col]]
df = selected.dropna()
df.columns = ["text", "rating"]

In [None]:
# Define preprocessing function
def preprocess_pipeline(text):
    """Clean a single review and derive its tokens, POS tags and polarity score.

    Steps, in order: lowercase -> expand contractions -> tokenize -> drop
    stopwords -> strip punctuation -> reject reviews left with at most one
    token -> re-tokenize the cleaned text -> POS-tag -> TextBlob polarity.

    Contractions are expanded on the raw string *before* tokenization.
    ``word_tokenize`` splits "don't" into "do" / "n't" and the punctuation
    step then strips the apostrophe, so expanding afterwards never sees the
    contraction and leaves fragments such as "nt" in the cleaned text.

    Relies on the notebook-level ``stop_words`` set.

    Args:
        text: Raw review; any value is coerced with ``str()``.

    Returns:
        A dict with keys ``"clean_text"`` (str), ``"tokens"`` (list of str),
        ``"pos_tags"`` (output of ``nltk.pos_tag``) and ``"score"`` (TextBlob
        polarity), or ``None`` when at most one token survives cleaning.
    """
    # 1. Lowercasing
    text = str(text).lower()

    # 2. Contraction expansion (must precede tokenization, see docstring)
    text = expand_contractions(text)

    # 3. Tokenization + stopword removal
    tokens = [t for t in word_tokenize(text) if t not in stop_words]

    # 4. Remove punctuation, then drop tokens that became empty
    tokens = [re.sub(r"[^\w\s]", "", t) for t in tokens]
    tokens = [t for t in tokens if t.strip() != ""]

    # 5. Remove one-word (or empty) reviews
    if len(tokens) <= 1:
        return None

    # Rejoin tokens for the text-level steps below
    text = " ".join(tokens)

    # 6. Tokenization (again) on the cleaned text
    tokens = word_tokenize(text)

    # 7. Part-of-Speech (POS) Tagging
    pos_tags = nltk.pos_tag(tokens)

    # 8. Score Generation using TextBlob Sentiment
    polarity_score = TextBlob(text).sentiment.polarity  # -1 to 1

    return {
        "clean_text": text,
        "tokens": tokens,
        "pos_tags": pos_tags,
        "score": polarity_score
    }

In [None]:
# Run the pipeline on every review; rows for which it returned None
# (reviews with at most one token left) are dropped.
processed = df["text"].apply(preprocess_pipeline)
has_result = processed.notnull()
df = df.loc[has_result].copy()
# Assign positionally (plain array) so the kept results line up row by row
df["processed"] = processed.loc[has_result].to_numpy()

In [None]:
# Pull the cleaned text, the labels and the polarity scores out of the frame
# as three parallel Python lists (same row order).
processed_records = df["processed"]
texts = [record["clean_text"] for record in processed_records]
labels = df["rating"].tolist()
scores = [record["score"] for record in processed_records]

In [None]:
# 9. Word Embeddings (input preparation)
# Keras Tokenizer converts raw text into integer sequences (each word maps to a
# unique integer index) that can later be fed to Keras layers such as Embedding.
MAX_VOCAB = 10000    # only the MAX_VOCAB most frequent words keep their own index
MAX_LEN = 200        # every sequence is padded/truncated to this many tokens
OOV_TOKEN = "<OOV>"  # explicit placeholder; an empty string would add a "" key to word_index

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token=OOV_TOKEN)
tokenizer.fit_on_texts(texts)

# Words outside the vocabulary are mapped to the OOV token's index
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index

# Sequence padding: shorter sequences get trailing zeros so all rows share one length.
padded_sequences = pad_sequences(sequences, maxlen=MAX_LEN, padding='post')

In [None]:
# Unpack the GloVe archive from Drive into ./glove.
# Only members that are not already on disk are extracted, so re-running the
# cell does not unpack the whole archive again.
import os
import zipfile

GLOVE_ZIP = "/content/drive/MyDrive/glove.6B.zip"
GLOVE_DIR = "glove"

with zipfile.ZipFile(GLOVE_ZIP, "r") as zip_ref:
    missing_members = [
        name for name in zip_ref.namelist()
        if not os.path.exists(os.path.join(GLOVE_DIR, name))
    ]
    if missing_members:
        zip_ref.extractall(GLOVE_DIR, members=missing_members)

In [None]:
# Load GloVe and Create Embedding Matrix
# GloVe (Global Vectors for Word Representation) is an unsupervised method that
# maps words to numerical vectors (embeddings) capturing semantic relationships.

embedding_dim = 50  # matching model spec

# Read the file whose vectors were trained at the required size. Slicing the
# first 50 components off the 100-d vectors does not give the 50-d embeddings.
glove_path = f"glove/glove.6B.{embedding_dim}d.txt"

embedding_index = {}
with open(glove_path, encoding="utf8") as f:
    for line in f:
        values = line.split()
        # Expect "<word> v1 ... v_dim"; skip blank or malformed lines so one bad
        # row cannot break the matrix assignment below.
        if len(values) != embedding_dim + 1:
            continue
        embedding_index[values[0]] = np.asarray(values[1:], dtype="float32")

# Row i holds the vector of the word whose tokenizer index is i; row 0 and any
# word missing from GloVe stay all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector