## About

This is NLP project for ISY course.

## Dataset

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

nltk.download('stopwords')
nltk.download('punkt')

df = pd.read_csv('/content/AmazonReview.csv')

# Remove missing values
df = df.dropna()

# Map sentiment scores to 'positive', 'neutral', 'negative'
df['Sentiment'] = np.where(df['Sentiment'] >= 4, 'positive', np.where(df['Sentiment'] == 3, 'neutral', 'negative'))

# Convert sentiment to categorical labels
label_encoder = LabelEncoder()
df['Sentiment'] = label_encoder.fit_transform(df['Sentiment'])

# Remove stopwords and perform preprocessing
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    words = word_tokenize(text)
    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

# Apply preprocessing to the Review column
df['Review'] = df['Review'].apply(preprocess_text)

# Split the dataset into training and testing sets
train_data, test_data, train_labels, test_labels = train_test_split(
    df['Review'], df['Sentiment'], test_size=0.2, random_state=42
)

# Tokenize and pad sequences
max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(train_data)

train_sequences = tokenizer.texts_to_sequences(train_data)
test_sequences = tokenizer.texts_to_sequences(test_data)

train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post', truncating='post')

# Build a neural network model
model = keras.Sequential([
    layers.Embedding(max_words, 100, input_length=max_len),
    layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
    layers.Bidirectional(layers.LSTM(64)),
    layers.Dense(64, activation='relu'),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(train_padded, train_labels, epochs=10, validation_split=0.2)

# Evaluate the model
test_loss, test_acc = model.evaluate(test_padded, test_labels)
print(f"\nTest Accuracy: {test_acc * 100:.2f}%")

# Make predictions on sample sentences
sample_sentences = ["This product is amazing!", "I don't like it at all.", "It's neither great nor terrible."]
sample_sentences = [preprocess_text(sentence) for sentence in sample_sentences]
sample_sequences = tokenizer.texts_to_sequences(sample_sentences)
sample_padded = pad_sequences(sample_sequences, maxlen=max_len, padding='post', truncating='post')

predictions = model.predict(sample_padded)
predicted_labels = np.argmax(predictions, axis=1)

# Decode predicted labels back to sentiment classes
predicted_sentiments = label_encoder.inverse_transform(predicted_labels)

# Display the results
for sentence, sentiment in zip(sample_sentences, predicted_sentiments):
    print(f"Review: {sentence} | Predicted Sentiment: {sentiment}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Test Accuracy: 64.08%
Review: product amazing | Predicted Sentiment: positive
Review: dont like | Predicted Sentiment: negative
Review: neither great terrible | Predicted Sentiment: negative


In [None]:
# all data preprocessing should be done

## Model

In [None]:
# Create a Neural Network

## Train Model

In [None]:
# Train your model here

## Results and Analysis

In [None]:
# Use diagrams and reasonings for analysis