In [None]:
import pandas as pd
import numpy as np
import re
import unidecode
import string
import contractions
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense

In [None]:
# Load the dataset
df = pd.read_csv('Reviews.csv')

# Drop rows with missing 'Score' values
df = df.dropna(subset=['Score'])

# Filter only positive (4, 5) and negative (1, 2) reviews, excluding neutral reviews (3 stars)
filtered_data = df[df['Score'].isin([1, 2, 4, 5])]

# Drop duplicates to ensure unique Product IDs
unique_products = filtered_data.drop_duplicates(subset=['ProductId'])

# Randomly sample 20,000 rows from the filtered dataset
sampled_data = unique_products.sample(n=20000, random_state=42)

# Save the sampled dataset to a new CSV file (optional)
sampled_data.to_csv('filtered_reviews_20000.csv', index=False)

print(f"Dataset created with {len(sampled_data)} reviews from unique products.")
print(sampled_data.head())

In [None]:
# Count negative reviews (Score = 1 or 2)
negative_count = sampled_data[sampled_data['Score'].isin([1, 2])].shape[0]
print(f"Number of negative reviews: {negative_count}")

# Count positive reviews (Score = 4 or 5)
positive_count = sampled_data[sampled_data['Score'].isin([4, 5])].shape[0]
print(f"Number of positive reviews: {positive_count}")

# Verify the total number of rows
total_reviews = sampled_data.shape[0]
print(f"Total reviews in the dataset: {total_reviews}")
print(f"Sum of positive and negative reviews: {negative_count + positive_count}")

In [None]:
# Save the sampled dataset to the project directory (e.g., 'data/' folder)
sampled_data.to_csv('filtered_reviews_20000.csv', index=False)

In [None]:
# Create a new DataFrame with the required structure
updated_data = sampled_data[['Text', 'Score']].copy()

# Add a new 'id' column with incremental values
updated_data.insert(0, 'id', range(1, len(updated_data) + 1))

# Convert scores: 4 and 5 to 1, 1 and 2 to 0
updated_data['Score'] = updated_data['Score'].map({4: 1, 5: 1, 1: 0, 2: 0})

# Rename the 'Text' column to 'comments'
updated_data.rename(columns={'Text': 'Reviews'}, inplace=True)

# Save the updated dataset to a new file (optional)
updated_data.to_csv('updated_reviews.csv', index=False)

# Display the first few rows
print(updated_data.head())

In [None]:
# Create a copy of the dataset
preprocessed_data = updated_data.copy()

# Define a spelling correction mapping dictionary
spelling_correction_mapping = contraction_mapping = {
    "ive": "i've", "dont": "don't", "cant": "can't", "wont": "won't", "im": "i'm", "youre": "you're",
    "theyre": "they're", "isnt": "isn't", "arent": "aren't", "wasnt": "wasn't", "werent": "weren't",
    "havent": "haven't", "hasnt": "hasn't", "hadnt": "hadn't", "wouldnt": "wouldn't", "doesnt": "doesn't",
    "didnt": "didn't", "couldnt": "couldn't", "shouldnt": "shouldn't", "mightnt": "mightn't", "mustnt": "mustn't",
    "whos": "who's", "whats": "what's", "wheres": "where's", "whens": "when's", "hows": "how's", "ill": "i'll",
    "youll": "you'll", "hell": "he'll", "theyll": "they'll", "itll": "it'll", "thatll": "that'll", "youd": "you'd",
    "hed": "he'd", "theyd": "they'd", "thatd": "that'd", "youve": "you've", "weve": "we've", "theyve": "they've",
    "shouldve": "should've", "couldve": "could've", "lets": "let's", "aint": "ain't"
}


# Define a combined preprocessing function
def preprocess_text(text):

    # Step 1: Text Normalization
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()

    # Step 2: Correct Spelling Errors
    words = text.split()
    corrected_words = [spelling_correction_mapping.get(word, word) for word in words]
    text = " ".join(corrected_words)
    
    # Step 3: Expand Contractions using the contractions library
    text = contractions.fix(text)
    
    # Step 4: Remove Emails
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '', text)
    
    # Step 5: Remove HTML Tags
    text = re.sub(r'<.*?>', '', text)
    
    # Step 6: Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    
    # Step 7: Handle Accented Characters
    text = unidecode.unidecode(text)
        
    return text

# Apply preprocessing function to overwrite the Reviews column
preprocessed_data['Reviews'] = preprocessed_data['Reviews'].apply(preprocess_text)

# Save the final preprocessed dataset
preprocessed_data.to_csv('preprocessed_data.csv', index=False)

print("Preprocessing completed. Dataset saved as 'preprocessed_data.csv'.")


In [None]:
rating = preprocessed_data['Score'].values.tolist()
review = preprocessed_data['Reviews'].values.tolist()

In [None]:
portion = int(len(rating)*0.8)

review_train = review[:portion]
review_test = review[portion:]
rating_train = rating[:portion]
rating_test = rating[portion:]

In [273]:
# Sample reviews
review2 = preprocessed_data['Reviews'].tolist()

# Tokenizer without num_words to analyze coverage
tokenizer = Tokenizer()
tokenizer.fit_on_texts(review2)

# Analyze word coverage
word_counts = tokenizer.word_counts
sorted_counts = sorted(word_counts.values(), reverse=True)
cumulative_coverage = np.cumsum(sorted_counts) / sum(sorted_counts)

# Determine the number of words covering 95% of the dataset
num_words_95 = np.argmax(cumulative_coverage >= 0.95) + 1
print(f"Number of words covering 95% of the dataset: {num_words_95}")

# Calculate cumulative coverage
total_tokens = sum(sorted_counts)  # Total number of tokens in the dataset
top_4500_coverage = sum(sorted_counts[:4500]) / total_tokens * 100

print(f"The top 4500 words cover {top_4500_coverage:.2f}% of the total tokens.")

Number of words covering 95% of the dataset: 4792
The top 4500 words cover 94.69% of the total tokens.


In [323]:
# Tokenize the text data for training
num_words = 5000
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(review)

In [324]:
len(tokenizer.word_index)

34458

In [327]:
# Tokenize the text data for training
review_train_tokens = tokenizer.texts_to_sequences(review_train)
review_test_tokens = tokenizer.texts_to_sequences(review_test)

In [329]:
len(review_train[1699].split())

1988

In [331]:
len(review_train_tokens[1699])

1776

In [333]:
num_tokens = [len(tokens) for tokens in review_train_tokens + review_test_tokens]
num_tokens = np.array(num_tokens)

In [335]:
np.mean(num_tokens)

83.52325

In [337]:
np.max(num_tokens)

1776

In [339]:
np.argmax(num_tokens)

1699

In [341]:
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
max_tokens = int(max_tokens)
max_tokens

256

In [343]:
np.sum(num_tokens < max_tokens) / len(num_tokens)

0.96185

In [345]:
review_train_pad = pad_sequences(review_train_tokens, maxlen=max_tokens)
review_test_pad = pad_sequences(review_test_tokens, maxlen=max_tokens)

In [297]:
np.array(review_train_tokens[800])

array([   1,  259,   47,  596,   26,    6,  475,   27,   19,   96,   11,
       2666,   65,   27,   24,   47,   17,   13,    5,    7,   81,    6,
        655,   18,    1, 2010])

In [None]:
review_train_pad[800]

In [None]:
review_train_pad.shape

In [None]:
review_test_pad.shape

In [347]:
# Tokenize the text data for training
index = tokenizer.word_index
index_word = dict([(value, key) for (key, value) in index.items()])

In [349]:
# Define a function to convert tokenized text back to human-readable text
def decode_review(tokens):
    return ' '.join([index_word.get(i, '?') for i in tokens])

In [351]:
review_train[800]

'the people we sent these to liked them but did not rave about them. so we are concluding that it is best to stick with the cashews'

In [353]:
# Define a function to convert tokenized text back to human-readable text
decode_review(review_train_tokens[800])

'the people we sent these to liked them but did not rave about them so we are that it is best to stick with the cashews'

In [153]:
# Load GloVe embeddings into a dictionary
glove_embeddings = {}
with open('glove.6B/glove.6B.100d.txt', "r", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.array(values[1:], dtype="float32")
        glove_embeddings[word] = vector
print("GloVe embeddings loaded!")

GloVe embeddings loaded!


In [355]:

glove_vocab = set(glove_embeddings.keys())

# num_words sınırına göre ilk kelimeleri kontrol et
missing_words = [
    word for word, index in tokenizer.word_index.items()
    if index < num_words and word not in glove_vocab
]

# Eksik kelimeleri görüntüle
print(f"Total words in vocab: {num_words}")
print(f"Missing words in the first {num_words}: {len(missing_words)}")
print("Missing words examples:", missing_words[:10])  # İlk 10 eksik kelime


Total words in vocab: 5000
Missing words in the first 5000: 72
Missing words examples: ["amazon's", "bob's", "dog's", "newman's", "joe's", 'eacute', "nature's", "sam's", "company's", "earth's"]


In [357]:
# Initialize embedding matrix
embedding_dim = 100
vocab_size = num_words + 1  # `word_index` should be defined earlier during tokenization
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# UNK token için rastgele embedding başlatma
unk_embedding = np.random.uniform(-0.25, 0.25, embedding_dim)

In [359]:
vocab_size

5001

In [361]:
# Fill the embedding matrix
print("Creating embedding matrix...")
missing_words = 0
for word, idx in tokenizer.word_index.items():  # use word_index from tokenizer
    if idx >= vocab_size:
        continue
    embedding_vector = glove_embeddings.get(word)
    
    if embedding_vector is not None:
        embedding_matrix[idx] = embedding_vector  # Words found in GloVe
    else:
        embedding_matrix[idx] = unk_embedding  # Words not found in GloVe, use UNK embedding
        missing_words += 1  # Words not found in GloVe

print(f"Embedding matrix created! Missing words: {missing_words}")

Creating embedding matrix...
Embedding matrix created! Missing words: 72


In [363]:
embedding_matrix.shape

(5001, 100)

In [185]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D

In [393]:
# Define the model
model = Sequential()

# Embedding layer with pre-trained GloVe embeddings
model.add(Embedding(
    input_dim=vocab_size,  # Vocabulary size (including padding token)
    output_dim=100,  # GloVe embedding dimension
    weights=[embedding_matrix],  # Pre-trained embedding matrix
    input_length=max_tokens,  # Maximum sequence length
    trainable=True  # Freeze embeddings
))

# SpatialDropout1D for regularization
#model.add(SpatialDropout1D(0.2))

# First LSTM layer
#model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))

# First LSTM layer
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))

# Second LSTM layer
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=False))

# Dense layer for further feature extraction
#model.add(Dense(64, activation='relu'))
#model.add(Dropout(0.5))

# Output layer for sentiment classification
model.add(Dense(1, activation='sigmoid'))  # Sigmoid for binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()

rating_train = np.array(rating_train)

# Train the model
history = model.fit(
    review_train_pad,
    rating_train,
    batch_size=16,
    epochs=5,
    validation_split=0.2,
    verbose=1
)

# Evaluate the model
#loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
#print(f"Test Loss: {loss}")
#print(f"Test Accuracy: {accuracy}")




Epoch 1/5
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 91ms/step - accuracy: 0.8438 - loss: 0.4047 - val_accuracy: 0.8778 - val_loss: 0.2736
Epoch 2/5
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 87ms/step - accuracy: 0.9100 - loss: 0.2144 - val_accuracy: 0.9091 - val_loss: 0.2202
Epoch 3/5
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 85ms/step - accuracy: 0.9484 - loss: 0.1388 - val_accuracy: 0.9003 - val_loss: 0.2411
Epoch 4/5
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 85ms/step - accuracy: 0.9604 - loss: 0.1096 - val_accuracy: 0.9169 - val_loss: 0.2308
Epoch 5/5
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 85ms/step - accuracy: 0.9734 - loss: 0.0807 - val_accuracy: 0.9109 - val_loss: 0.2696


In [193]:
rating_test = np.array(rating_test)

In [409]:
# Evaluate the model on the test dataset
evaluation_result = model.evaluate(review_test_pad, rating_test, verbose=1)

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 50ms/step - accuracy: 0.9313 - loss: 0.2239


In [411]:
# Print results
print(f"Test Loss: {evaluation_result[0]:.4f}")
print(f"Test Accuracy: {evaluation_result[1]:.4f}")

Test Loss: 0.2045
Test Accuracy: 0.9352


In [405]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Evaluate the model on the test set
test_predictions = model.predict(review_test_pad, verbose=1)
test_predictions = (test_predictions > 0.5).astype(int)

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 50ms/step


In [413]:
# Calculate metrics
accuracy = accuracy_score(rating_test, test_predictions)
precision = precision_score(rating_test, test_predictions)
recall = recall_score(rating_test, test_predictions)
f1 = f1_score(rating_test, test_predictions)

# Print the metrics
print("Test Set Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# Print detailed classification report
print("\nClassification Report:")
print(classification_report(rating_test, test_predictions))


Test Set Performance:
Accuracy: 0.9353
Precision: 0.9513
Recall: 0.9744
F1-Score: 0.9627

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.70      0.75       569
           1       0.95      0.97      0.96      3431

    accuracy                           0.94      4000
   macro avg       0.89      0.84      0.86      4000
weighted avg       0.93      0.94      0.93      4000



In [395]:
# Example test sentences in English
test_sentences = [
    "The product is absolutely amazing, much better than I expected!",
    "My order arrived very quickly, and the packaging was excellent. Thank you!",
    "I'm very satisfied with this service, and I will definitely use it again.",
    "The food was delicious and reasonably priced for its quality.",
    "The camera on this phone is outstanding, even in low light conditions.",
    "Unfortunately, the product arrived damaged, and I couldn't use it.",
    "The delivery was extremely late, and customer service was unhelpful.",
    "The quality is very poor, and it's not worth the price at all.",
    "The restaurant was filthy, and the service was very slow.",
    "The product I received was completely different from what I ordered."
]


# Tokenize ve pad etme
test_tokens = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_tokens, maxlen=max_tokens)

# Modelle tahmin yapma
predictions = model.predict(test_padded)

# Tahmin edilen sınıfları çıkarma (0 veya 1)
predicted_classes = (predictions > 0.5).astype(int).flatten()

# Sonuçları yazdırma
for i, sentence in enumerate(test_sentences):
    print(f"Cümle: {sentence}")
    print(f"Tahmin edilen sınıf: {'Pozitif' if predicted_classes[i] == 1 else 'Negatif'}")
    print(f"Model çıktısı (olasılık): {predictions[i][0]:.4f}")
    print("-" * 50)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 185ms/step
Cümle: The product is absolutely amazing, much better than I expected!
Tahmin edilen sınıf: Pozitif
Model çıktısı (olasılık): 0.9897
--------------------------------------------------
Cümle: My order arrived very quickly, and the packaging was excellent. Thank you!
Tahmin edilen sınıf: Pozitif
Model çıktısı (olasılık): 0.9996
--------------------------------------------------
Cümle: I'm very satisfied with this service, and I will definitely use it again.
Tahmin edilen sınıf: Pozitif
Model çıktısı (olasılık): 0.9984
--------------------------------------------------
Cümle: The food was delicious and reasonably priced for its quality.
Tahmin edilen sınıf: Pozitif
Model çıktısı (olasılık): 0.9982
--------------------------------------------------
Cümle: The camera on this phone is outstanding, even in low light conditions.
Tahmin edilen sınıf: Pozitif
Model çıktısı (olasılık): 0.9985
--------------------------------