In [4]:
# Step 1: Import Libraries and Load Data
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import emoji
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Load the dataset
file_path = '1429_1.csv'
df = pd.read_csv(file_path)

# Step 2: Basic Data Exploration
print("Basic Info:")
print(df.info())
print("\nMissing values:")
print(df.isnull().sum())
print("\nFirst few rows of the dataset:")
print(df.head())

# Step 3: Filter Relevant Columns
df_filtered = df[['reviews.text', 'reviews.rating']]
df_filtered = df_filtered.dropna(subset=['reviews.text', 'reviews.rating'])

# Step 4: Download Necessary NLTK Resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Step 5: Define Text Cleaning Functions
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^A-Za-z\s]', '', text)
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

df_filtered['cleaned_text'] = df_filtered['reviews.text'].apply(clean_text)

# Step 6: Additional Feature Engineering
df_filtered['word_count'] = df_filtered['cleaned_text'].apply(lambda x: len(x.split()))
df_filtered['avg_word_length'] = df_filtered['cleaned_text'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()) if x.split() else 0)

# Define a function to categorize ratings into sentiment
def categorize_rating(rating):
    if rating in [1, 2, 3]:
        return 'Negative'
    elif rating == 4:
        return 'Neutral'
    elif rating == 5:
        return 'Positive'

df_filtered['sentiment'] = df_filtered['reviews.rating'].apply(categorize_rating)

# Map the sentiment labels to numerical values
df_filtered['sentiment_label'] = df_filtered['sentiment'].map({'Negative': 0, 'Neutral': 1, 'Positive': 2})

# Step 7: TF-IDF Feature Engineering
tfidf = TfidfVectorizer(max_features=500)
tfidf_matrix = tfidf.fit_transform(df_filtered['cleaned_text'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())

# Concatenate TF-IDF features with the DataFrame
df_filtered = pd.concat([df_filtered.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)

# Prepare the feature set
X = df_filtered.drop(columns=['reviews.text', 'reviews.rating', 'sentiment', 'cleaned_text', 'sentiment_label'])
y = df_filtered['sentiment_label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 8: SVM Model Training and Evaluation
# Initialize and train the SVM model with a linear kernel
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Display results
print("Test Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)
print("Confusion Matrix:\n", conf_matrix)

# Save the final processed DataFrame
df_filtered.to_csv('processed_reviews.csv', index=False)

Basic Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20383 entries, 0 to 20382
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    20383 non-null  object 
 1   name                  20383 non-null  object 
 2   asins                 20381 non-null  object 
 3   brand                 20383 non-null  object 
 4   categories            20383 non-null  object 
 5   keys                  20383 non-null  object 
 6   manufacturer          20383 non-null  object 
 7   reviews.date          20370 non-null  object 
 8   reviews.dateAdded     18691 non-null  object 
 9   reviews.dateSeen      20383 non-null  object 
 10  reviews.didPurchase   1 non-null      object 
 11  reviews.doRecommend   19939 non-null  object 
 12  reviews.id            1 non-null      float64
 13  reviews.numHelpful    19953 non-null  float64
 14  reviews.rating        20355 non-null  float64
 15  reviews

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Test Accuracy: 0.6814050601817735
Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.26      0.37       308
           1       0.53      0.12      0.20      1129
           2       0.69      0.97      0.81      2634

    accuracy                           0.68      4071
   macro avg       0.62      0.45      0.46      4071
weighted avg       0.64      0.68      0.61      4071

Confusion Matrix:
 [[  81   55  172]
 [  33  141  955]
 [  11   71 2552]]


In [2]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.0-py3-none-any.whl (586 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.0


In [6]:
# Step 1: Import Libraries and Load Data
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import emoji
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Load the dataset
file_path = '1429_1.csv'
df = pd.read_csv(file_path)

# Step 2: Basic Data Exploration
print("Basic Info:")
print(df.info())
print("\nMissing values:")
print(df.isnull().sum())
print("\nFirst few rows of the dataset:")
print(df.head())

# Step 3: Filter Relevant Columns
df_filtered = df[['reviews.text', 'reviews.rating']]
df_filtered = df_filtered.dropna(subset=['reviews.text', 'reviews.rating'])

# Step 4: Download Necessary NLTK Resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Step 5: Define Text Cleaning Functions
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^A-Za-z\s]', '', text)
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

df_filtered['cleaned_text'] = df_filtered['reviews.text'].apply(clean_text)

# Step 6: Additional Feature Engineering
df_filtered['word_count'] = df_filtered['cleaned_text'].apply(lambda x: len(x.split()))
df_filtered['avg_word_length'] = df_filtered['cleaned_text'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()) if x.split() else 0)

# Define a function to categorize ratings into sentiment
def categorize_rating(rating):
    if rating in [1, 2, 3]:
        return 'Negative'
    elif rating == 4:
        return 'Neutral'
    elif rating == 5:
        return 'Positive'

df_filtered['sentiment'] = df_filtered['reviews.rating'].apply(categorize_rating)

# Map the sentiment labels to numerical values
df_filtered['sentiment_label'] = df_filtered['sentiment'].map({'Negative': 0, 'Neutral': 1, 'Positive': 2})

# Step 7: TF-IDF Feature Engineering
tfidf = TfidfVectorizer(max_features=500)
tfidf_matrix = tfidf.fit_transform(df_filtered['cleaned_text'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())

# Concatenate TF-IDF features with the DataFrame
df_filtered = pd.concat([df_filtered.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)

# Prepare the feature set
X = df_filtered.drop(columns=['reviews.text', 'reviews.rating', 'sentiment', 'cleaned_text', 'sentiment_label'])
y = df_filtered['sentiment_label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 8: Neural Network Model Training and Evaluation
# Define a simple feedforward neural network
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(3, activation='softmax'))  # Output layer with 3 classes

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy}")

# Predictions
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)

# Evaluate the model performance
classification_rep = classification_report(y_test, y_pred_classes)
conf_matrix = confusion_matrix(y_test, y_pred_classes)

# Display results
print("Classification Report:\n", classification_rep)
print("Confusion Matrix:\n", conf_matrix)

# Save the final processed DataFrame
df_filtered.to_csv('processed_reviews.csv', index=False)

  df = pd.read_csv(file_path)


Basic Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34660 entries, 0 to 34659
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    34660 non-null  object 
 1   name                  27900 non-null  object 
 2   asins                 34658 non-null  object 
 3   brand                 34660 non-null  object 
 4   categories            34660 non-null  object 
 5   keys                  34660 non-null  object 
 6   manufacturer          34660 non-null  object 
 7   reviews.date          34621 non-null  object 
 8   reviews.dateAdded     24039 non-null  object 
 9   reviews.dateSeen      34660 non-null  object 
 10  reviews.didPurchase   1 non-null      object 
 11  reviews.doRecommend   34066 non-null  object 
 12  reviews.id            1 non-null      float64
 13  reviews.numHelpful    34131 non-null  float64
 14  reviews.rating        34627 non-null  float64
 15  reviews

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.6724 - loss: 0.8268 - val_accuracy: 0.6952 - val_loss: 0.6879
Epoch 2/10
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.6980 - loss: 0.6998 - val_accuracy: 0.7029 - val_loss: 0.6672
Epoch 3/10
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.7110 - loss: 0.6716 - val_accuracy: 0.7078 - val_loss: 0.6630
Epoch 4/10
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7200 - loss: 0.6564 - val_accuracy: 0.7108 - val_loss: 0.6565
Epoch 5/10
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7217 - loss: 0.6457 - val_accuracy: 0.7107 - val_loss: 0.6597
Epoch 6/10
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7317 - loss: 0.6309 - val_accuracy: 0.7157 - val_loss: 0.6559
Epoch 7/10
[1m866/866[0m 

In [23]:
# Step 1: Import Libraries and Load Data
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import emoji
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# Load the dataset
file_path = '1429_1.csv'
df = pd.read_csv(file_path)

# Step 2: Basic Data Exploration
print("Basic Info:")
print(df.info())
print("\nMissing values:")
print(df.isnull().sum())
print("\nFirst few rows of the dataset:")
print(df.head())

# Step 3: Filter Relevant Columns
df_filtered = df[['reviews.text', 'reviews.rating']]
df_filtered = df_filtered.dropna(subset=['reviews.text', 'reviews.rating'])

# Step 4: Download Necessary NLTK Resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Step 5: Define Text Cleaning Functions
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^A-Za-z\s]', '', text)
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

df_filtered['cleaned_text'] = df_filtered['reviews.text'].apply(clean_text)

# Step 6: Additional Feature Engineering
df_filtered['word_count'] = df_filtered['cleaned_text'].apply(lambda x: len(x.split()))
df_filtered['avg_word_length'] = df_filtered['cleaned_text'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()) if x.split() else 0)

# Define a function to categorize ratings into sentiment
def categorize_rating(rating):
    if rating in [1, 2, 3]:
        return 'Negative'
    elif rating == 4:
        return 'Neutral'
    elif rating == 5:
        return 'Positive'

df_filtered['sentiment'] = df_filtered['reviews.rating'].apply(categorize_rating)

# Map the sentiment labels to numerical values
df_filtered['sentiment_label'] = df_filtered['sentiment'].map({'Negative': 0, 'Neutral': 1, 'Positive': 2})

# Step 7: TF-IDF Feature Engineering
tfidf = TfidfVectorizer(max_features=500)
tfidf_matrix = tfidf.fit_transform(df_filtered['cleaned_text'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())

# Concatenate TF-IDF features with the DataFrame
df_filtered = pd.concat([df_filtered.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)

# Prepare the feature set
X = df_filtered.drop(columns=['reviews.text', 'reviews.rating', 'sentiment', 'cleaned_text', 'sentiment_label'])
y = df_filtered['sentiment_label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 8: Neural Network Model Training and Evaluation

model = Sequential()

# Input layer and first hidden layer
model.add(Dense(128, activation='selu', input_shape=(X_train.shape[1],)))
model.add(BatchNormalization())
model.add(Dropout(0.2))

# Second hidden layer
model.add(Dense(64, activation='selu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))

# Third hidden layer
model.add(Dense(32, activation='selu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))

# Output layer with 3 classes using softmax activation for multi-class classification
model.add(Dense(3, activation='softmax'))

# Compile the model with an appropriate optimizer and loss function
model.compile(optimizer=Adam(learning_rate=0.00001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])


# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy}")

# Predictions
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)

# Evaluate the model performance
classification_rep = classification_report(y_test, y_pred_classes)
conf_matrix = confusion_matrix(y_test, y_pred_classes)

# Display results
print("Classification Report:\n", classification_rep)
print("Confusion Matrix:\n", conf_matrix)

# Save the final processed DataFrame
df_filtered.to_csv('processed_reviews.csv', index=False)

  df = pd.read_csv(file_path)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Basic Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34660 entries, 0 to 34659
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    34660 non-null  object 
 1   name                  27900 non-null  object 
 2   asins                 34658 non-null  object 
 3   brand                 34660 non-null  object 
 4   categories            34660 non-null  object 
 5   keys                  34660 non-null  object 
 6   manufacturer          34660 non-null  object 
 7   reviews.date          34621 non-null  object 
 8   reviews.dateAdded     24039 non-null  object 
 9   reviews.dateSeen      34660 non-null  object 
 10  reviews.didPurchase   1 non-null      object 
 11  reviews.doRecommend   34066 non-null  object 
 12  reviews.id            1 non-null      float64
 13  reviews.numHelpful    34131 non-null  float64
 14  reviews.rating        34627 non-null  float64
 15  reviews

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.3725 - loss: 1.3926 - val_accuracy: 0.6751 - val_loss: 0.8778
Epoch 2/100
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.5342 - loss: 1.0314 - val_accuracy: 0.6826 - val_loss: 0.7768
Epoch 3/100
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.6190 - loss: 0.8900 - val_accuracy: 0.6884 - val_loss: 0.7381
Epoch 4/100
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.6466 - loss: 0.8335 - val_accuracy: 0.6929 - val_loss: 0.7073
Epoch 5/100
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.6622 - loss: 0.7978 - val_accuracy: 0.6985 - val_loss: 0.6894
Epoch 6/100
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.6773 - loss: 0.7651 - val_accuracy: 0.7076 - val_loss: 0.6786
Epoch 7/100
[1m866/86

In [28]:
# Step 1: Import Libraries and Load Data
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import emoji
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Load the dataset
file_path = '1429_1.csv'
df = pd.read_csv(file_path)

# Step 2: Basic Data Exploration
print("Basic Info:")
print(df.info())
print("\nMissing values:")
print(df.isnull().sum())
print("\nFirst few rows of the dataset:")
print(df.head())

# Step 3: Filter Relevant Columns
df_filtered = df[['reviews.text', 'reviews.rating']]
df_filtered = df_filtered.dropna(subset=['reviews.text', 'reviews.rating'])

# Step 4: Download Necessary NLTK Resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Step 5: Define Text Cleaning Functions
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^A-Za-z\s]', '', text)
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

df_filtered['cleaned_text'] = df_filtered['reviews.text'].apply(clean_text)

# Step 6: Additional Feature Engineering
df_filtered['word_count'] = df_filtered['cleaned_text'].apply(lambda x: len(x.split()))
df_filtered['avg_word_length'] = df_filtered['cleaned_text'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()) if x.split() else 0)

# Define a function to categorize ratings into sentiment
def categorize_rating(rating):
    if rating in [1, 2, 3]:
        return 'Negative'
    elif rating == 4:
        return 'Neutral'
    elif rating == 5:
        return 'Positive'

df_filtered['sentiment'] = df_filtered['reviews.rating'].apply(categorize_rating)

# Map the sentiment labels to numerical values
df_filtered['sentiment_label'] = df_filtered['sentiment'].map({'Negative': 0, 'Neutral': 1, 'Positive': 2})

# Step 7: TF-IDF Feature Engineering
tfidf = TfidfVectorizer(max_features=500)
tfidf_matrix = tfidf.fit_transform(df_filtered['cleaned_text'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())

# Concatenate TF-IDF features with the DataFrame
df_filtered = pd.concat([df_filtered.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)

# Prepare the feature set
X = df_filtered.drop(columns=['reviews.text', 'reviews.rating', 'sentiment', 'cleaned_text', 'sentiment_label'])
y = df_filtered['sentiment_label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 8: Neural Network Model Training and Evaluation
# Define a simple feedforward neural network
model = Sequential()
model.add(Dense(64, activation='selu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.3))
model.add(Dense(64, activation='selu'))
model.add(Dropout(0.3))
model.add(Dense(3, activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.00005), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy}")

# Predictions
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)

# Evaluate the model performance
classification_rep = classification_report(y_test, y_pred_classes)
conf_matrix = confusion_matrix(y_test, y_pred_classes)

# Display results
print("Classification Report:\n", classification_rep)
print("Confusion Matrix:\n", conf_matrix)

# Save the final processed DataFrame
df_filtered.to_csv('processed_reviews.csv', index=False)

  df = pd.read_csv(file_path)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Basic Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34660 entries, 0 to 34659
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    34660 non-null  object 
 1   name                  27900 non-null  object 
 2   asins                 34658 non-null  object 
 3   brand                 34660 non-null  object 
 4   categories            34660 non-null  object 
 5   keys                  34660 non-null  object 
 6   manufacturer          34660 non-null  object 
 7   reviews.date          34621 non-null  object 
 8   reviews.dateAdded     24039 non-null  object 
 9   reviews.dateSeen      34660 non-null  object 
 10  reviews.didPurchase   1 non-null      object 
 11  reviews.doRecommend   34066 non-null  object 
 12  reviews.id            1 non-null      float64
 13  reviews.numHelpful    34131 non-null  float64
 14  reviews.rating        34627 non-null  float64
 15  reviews

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.4901 - loss: 1.3340 - val_accuracy: 0.6809 - val_loss: 0.7963
Epoch 2/50
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6327 - loss: 0.9686 - val_accuracy: 0.6805 - val_loss: 0.7737
Epoch 3/50
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.6446 - loss: 0.9054 - val_accuracy: 0.6818 - val_loss: 0.7683
Epoch 4/50
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6677 - loss: 0.8484 - val_accuracy: 0.6813 - val_loss: 0.7534
Epoch 5/50
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.6674 - loss: 0.8277 - val_accuracy: 0.6818 - val_loss: 0.7462
Epoch 6/50
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.6772 - loss: 0.7889 - val_accuracy: 0.6815 - val_loss: 0.7316
Epoch 7/50
[1m866/866[0m 

In [31]:
# Step 1: Import Libraries and Load Data
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import emoji
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Load the dataset
file_path = '1429_1.csv'
df = pd.read_csv(file_path)

# Step 2: Basic Data Exploration
print("Basic Info:")
print(df.info())
print("\nMissing values:")
print(df.isnull().sum())
print("\nFirst few rows of the dataset:")
print(df.head())

# Step 3: Filter Relevant Columns
df_filtered = df[['reviews.text', 'reviews.rating']]
df_filtered = df_filtered.dropna(subset=['reviews.text', 'reviews.rating'])

# Step 4: Download Necessary NLTK Resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Step 5: Define Text Cleaning Functions
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^A-Za-z\s]', '', text)
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

df_filtered['cleaned_text'] = df_filtered['reviews.text'].apply(clean_text)

# Step 6: Additional Feature Engineering
df_filtered['word_count'] = df_filtered['cleaned_text'].apply(lambda x: len(x.split()))
df_filtered['avg_word_length'] = df_filtered['cleaned_text'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()) if x.split() else 0)

# Define a function to categorize ratings into sentiment
def categorize_rating(rating):
    if rating in [1, 2, 3]:
        return 'Negative'
    elif rating == 4:
        return 'Neutral'
    elif rating == 5:
        return 'Positive'

df_filtered['sentiment'] = df_filtered['reviews.rating'].apply(categorize_rating)

# Map the sentiment labels to numerical values
df_filtered['sentiment_label'] = df_filtered['sentiment'].map({'Negative': 0, 'Neutral': 1, 'Positive': 2})

# Step 7: TF-IDF Feature Engineering
tfidf = TfidfVectorizer(max_features=500)
tfidf_matrix = tfidf.fit_transform(df_filtered['cleaned_text'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())

# Concatenate TF-IDF features with the DataFrame
df_filtered = pd.concat([df_filtered.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)

# Prepare the feature set
X = df_filtered.drop(columns=['reviews.text', 'reviews.rating', 'sentiment', 'cleaned_text', 'sentiment_label'])
y = df_filtered['sentiment_label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 8: Neural Network Model Training and Evaluation
# Define a simple feedforward neural network
model = Sequential()
model.add(Dense(64, activation='selu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.3))
model.add(Dense(64, activation='selu'))
model.add(Dropout(0.3))
model.add(Dense(3, activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.00005), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=200, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy}")

# Predictions
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)

# Evaluate the model performance
classification_rep = classification_report(y_test, y_pred_classes)
conf_matrix = confusion_matrix(y_test, y_pred_classes)

# Display results
print("Classification Report:\n", classification_rep)
print("Confusion Matrix:\n", conf_matrix)

# Save the final processed DataFrame
df_filtered.to_csv('processed_reviews.csv', index=False)

  df = pd.read_csv(file_path)


Basic Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34660 entries, 0 to 34659
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    34660 non-null  object 
 1   name                  27900 non-null  object 
 2   asins                 34658 non-null  object 
 3   brand                 34660 non-null  object 
 4   categories            34660 non-null  object 
 5   keys                  34660 non-null  object 
 6   manufacturer          34660 non-null  object 
 7   reviews.date          34621 non-null  object 
 8   reviews.dateAdded     24039 non-null  object 
 9   reviews.dateSeen      34660 non-null  object 
 10  reviews.didPurchase   1 non-null      object 
 11  reviews.doRecommend   34066 non-null  object 
 12  reviews.id            1 non-null      float64
 13  reviews.numHelpful    34131 non-null  float64
 14  reviews.rating        34627 non-null  float64
 15  reviews

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/200
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.5364 - loss: 1.2860 - val_accuracy: 0.6837 - val_loss: 0.8045
Epoch 2/200
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6218 - loss: 1.0012 - val_accuracy: 0.6837 - val_loss: 0.7805
Epoch 3/200
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.6457 - loss: 0.9193 - val_accuracy: 0.6837 - val_loss: 0.7664
Epoch 4/200
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.6555 - loss: 0.8669 - val_accuracy: 0.6837 - val_loss: 0.7567
Epoch 5/200
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.6686 - loss: 0.8265 - val_accuracy: 0.6837 - val_loss: 0.7439
Epoch 6/200
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.6750 - loss: 0.8067 - val_accuracy: 0.6841 - val_loss: 0.7252
Epoch 7/200
[1m866/86