In [None]:
# Upload the raw dataset through Colab's upload helper; the next cell reads
# "1429_1.csv" by relative path, so the file has to be present before it runs.
from google.colab import files
uploaded = files.upload()

In [None]:
import pandas as pd

data = pd.read_csv("1429_1.csv")

# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() emitted a stray "None" line after the report.
data.info()  # Check columns and data types
print(data.head())  # View the first few rows

In [None]:
# Data Cleaning
# Step 1: keep only the columns the rest of the notebook works with
relevant_columns = [
    'reviews.text',
    'reviews.rating',
    'categories',
    'brand',
    'name',
]
data = data[relevant_columns]

In [None]:
# Step 2: Handle Missing Values
# A row is unusable without both the review text and its star rating, so any
# row where either of the two is missing is discarded.
critical_fields = ['reviews.text', 'reviews.rating']
data = data.dropna(subset=critical_fields)

In [None]:
# Step 3: Standardize Text
# Lowercase the text, drop every character that is not a letter, digit or
# whitespace, collapse each whitespace run (spaces, tabs, newlines) to a single
# space, and trim both ends.
# The whitespace collapse is the fix: stripping only the ends left the
# "extra spaces" inside the text untouched.
cleaned_text = (
    data['reviews.text']
    .str.lower()
    .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)
data['reviews.text'] = cleaned_text

In [None]:
# Step 4: Deduplicate Entries
# drop_duplicates() is called without `subset`, so two rows count as duplicates
# only when they agree in every retained column.
data = data.drop_duplicates()

In [None]:
# Step 5: Validate Ratings
# Keep only the rows whose `reviews.rating` is one of the five star values.
valid_ratings = [1, 2, 3, 4, 5]
has_valid_rating = data['reviews.rating'].isin(valid_ratings)
data = data[has_valid_rating]

In [None]:
# Reset the index after cleaning: the filters above remove rows and leave gaps
# in the row labels; drop=True discards the old labels instead of keeping them
# as an extra column.
data = data.reset_index(drop=True)

In [None]:
# Persist the cleaned frame; index=False keeps the row labels out of the file
cleaned_csv_path = 'cleaned_reviews.csv'
data.to_csv(cleaned_csv_path, index=False)
print("Cleaned dataset saved.")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Reload the cleaned dataset from the CSV written by the "Save the cleaned data"
# cell, so the cells that follow start from the file on disk.
data = pd.read_csv("cleaned_reviews.csv")

In [None]:
#Preprocessing for Modeling
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from imblearn.over_sampling import SMOTE

# Load the cleaned reviews for the modeling stage (same file the previous cell reads)
data = pd.read_csv("cleaned_reviews.csv")

In [None]:
# Step 1: Address Class Imbalance
# Separate features and target
X = data['reviews.text']    # review text: the model input
y = data['reviews.rating']  # star rating: turned into a sentiment class below

# Map ratings to sentiment categories (positive, neutral, negative)
def map_sentiment(rating):
    """Translate a star rating into a sentiment label.

    Ratings of 4 or more are 'positive', a rating of exactly 3 is 'neutral',
    and every other value is 'negative'.
    """
    if rating >= 4:
        return 'positive'
    if rating == 3:
        return 'neutral'
    return 'negative'

# Encode the target as integers in one pass: rating -> sentiment label -> class id
sentiment_mapping = {'positive': 2, 'neutral': 1, 'negative': 0}
y = y.map(map_sentiment).map(sentiment_mapping)

In [None]:
# Step 2: Text Vectorization
# joblib is imported here because this cell is the first one to use it. Its only
# other import sits in a later cell, so a top-to-bottom run used to fail with a
# NameError at joblib.dump below.
import joblib

# Replace NaN values in 'reviews.text' with an empty string
X = X.fillna("")

# Remove rows where the review text is empty after cleaning
# (the same mask is applied to y so features and labels stay aligned)
non_empty_indices = X.str.strip() != ""
X = X[non_empty_indices]
y = y[non_empty_indices]


# Use TF-IDF Vectorizer for converting text to numerical features
# (at most 5000 features, English stop words removed)
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X_tfidf = tfidf.fit_transform(X)

# Save the TF-IDF vectorizer so that later cells can transform new text with
# the same fitted vocabulary
tfidf_vectorizer_path = "tfidf_vectorizer.pkl"
joblib.dump(tfidf, tfidf_vectorizer_path)
print(f"TF-IDF vectorizer saved to: {tfidf_vectorizer_path}")

In [None]:
# Steps 3-4: Split into Train and Test Sets, then Handle Class Imbalance using SMOTE
# The split comes first so that SMOTE only ever sees training rows. Oversampling
# before splitting lets synthetic points interpolated from test rows end up in
# the training set (and synthetic rows in the test set), which inflates scores.
# stratify=y keeps the class proportions the same in both splits.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42, stratify=y
)

# Balance the training split only; the test split keeps the real class distribution
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_raw, y_train_raw)
X_train, y_train = X_resampled, y_resampled

In [None]:
# Step 5: Save Preprocessed Data
# The four splits are bundled into one tuple so they are stored and reloaded together
train_test_data_path = 'preprocessed_train_test_data.pkl'
train_test_splits = (X_train, X_test, y_train, y_test)
pd.to_pickle(train_test_splits, train_test_data_path)

print(f"Preprocessed data saved to: {train_test_data_path}")

In [None]:
#Machine Learning
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

# Reload the splits written in "Step 5: Save Preprocessed Data"; the tuple is
# unpacked in the order it was pickled. `train_test_data_path` is not set in
# this cell, so that earlier cell must have run first.
X_train, X_test, y_train, y_test = pd.read_pickle(train_test_data_path)

In [None]:
# Step 1: Train Logistic Regression Model
# max_iter=1000 caps the solver iterations; random_state is fixed at 42
logistic_model = LogisticRegression(max_iter=1000, random_state=42)
logistic_model.fit(X_train, y_train)

In [None]:
# Step 2: Train Random Forest Model
# 100 trees (n_estimators); random_state is fixed at 42
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

In [None]:
# Step 3: Train Naive Bayes Model
# MultinomialNB is constructed with its defaults and fit on the same training
# split as the two models above
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

In [None]:
# Step 4: Evaluate Models
def _report_model(model, header):
    """Predict on the test split, print the report and accuracy, and return the predictions."""
    predictions = model.predict(X_test)
    print(header)
    print(classification_report(y_test, predictions))
    print("Accuracy:", accuracy_score(y_test, predictions))
    return predictions


# The predictions are kept under their own names because the confusion-matrix
# cell below plots each of them.
y_pred_logistic = _report_model(logistic_model, "Logistic Regression Report:")
y_pred_rf = _report_model(rf_model, "Random Forest Report:")
y_pred_nb = _report_model(nb_model, "Naive Bayes Report:")

In [None]:
# Step 5: Confusion Matrix Visualization
def plot_confusion_matrix(y_true, y_pred, model_name, class_names=('Negative', 'Neutral', 'Positive')):
    """Draw a labelled confusion-matrix heatmap for one model.

    Args:
        y_true: true class ids.
        y_pred: predicted class ids, aligned with ``y_true``.
        model_name: text appended to the figure title.
        class_names: display names for class ids 0..len(class_names)-1, in
            that order. The default matches the 0/1/2 sentiment encoding.

    The class ids are passed to ``confusion_matrix`` explicitly so the matrix
    always has one row/column per name. Without ``labels`` the matrix only
    covers the classes that actually occur, and the tick labels would then be
    attached to the wrong cells.
    """
    tick_labels = list(class_names)
    class_ids = list(range(len(tick_labels)))
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=tick_labels, yticklabels=tick_labels)
    plt.title(f"Confusion Matrix: {model_name}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

# Plot confusion matrices for all models, in the order they were evaluated
model_predictions = (
    (y_pred_logistic, "Logistic Regression"),
    (y_pred_rf, "Random Forest"),
    (y_pred_nb, "Naive Bayes"),
)
for model_pred, model_label in model_predictions:
    plot_confusion_matrix(y_test, model_pred, model_label)

In [None]:
#saving the original
import joblib

# Save the original Random Forest model
# (the "Practice for web page" cell later loads a file with this same name)
model_path = 'original_rf_model.pkl'
joblib.dump(rf_model, model_path)
print(f"Model saved to {model_path}")


**LSTM**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load cleaned dataset (the CSV written at the end of the data-cleaning section)
data = pd.read_csv("cleaned_reviews.csv")

In [None]:
# Step 1: Load and Preprocess the Dataset
# Map ratings to sentiment categories
def map_sentiment(rating):
    """Return 'positive' for ratings >= 4, 'neutral' for exactly 3, else 'negative'."""
    if rating >= 4:
        label = 'positive'
    else:
        label = 'neutral' if rating == 3 else 'negative'
    return label

# Features: review text, with missing entries replaced by empty strings
X = data['reviews.text'].fillna("")
# Target: rating -> sentiment label -> integer class id
label_to_id = {'positive': 2, 'neutral': 1, 'negative': 0}
y = data['reviews.rating'].map(map_sentiment).map(label_to_id)

# Remove empty reviews (one mask for both, so text and labels stay aligned)
non_empty_indices = X.str.strip().ne("")
X, y = X[non_empty_indices], y[non_empty_indices]

In [None]:
# Step 2: Tokenize and Pad Sequences
vocab_size = 10000  # Limit vocabulary size
max_length = 100    # Limit sequence length

# Fit the tokenizer on the review text; "<OOV>" is its out-of-vocabulary token
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X)

# Encode the text, then pad/truncate at the end ('post') to max_length entries per row
padding_options = {'maxlen': max_length, 'padding': 'post', 'truncating': 'post'}
X_sequences = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_sequences, **padding_options)

In [None]:
# Steps 3-4: Train-Test Split, then Address Class Imbalance on the training split
# Two problems with running SMOTE on the padded sequences before splitting:
#   * SMOTE interpolates between feature vectors, but these features are integer
#     token ids, so a point "between" two reviews is a row of unrelated word ids.
#   * Resampling before the split puts synthetic rows in the test set and lets
#     rows derived from test reviews into the training set.
# So the data is split first (stratified, to keep class proportions equal), and
# the training split alone is balanced by duplicating real minority-class rows.
from imblearn.over_sampling import RandomOverSampler

X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X_padded, y, test_size=0.2, random_state=42, stratify=y
)

oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_train_raw, y_train_raw)
X_train, y_train = X_resampled, y_resampled

In [None]:
# Step 5: Save Preprocessed Data
# One pickle holds all four LSTM splits, in (X_train, X_test, y_train, y_test) order
train_test_data_path = 'preprocessed_train_test_data_lstm.pkl'
lstm_splits = (X_train, X_test, y_train, y_test)
pd.to_pickle(lstm_splits, train_test_data_path)

print(f"Preprocessed data saved to: {train_test_data_path}")

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Step 1: Load Preprocessed Data
# Reads back the (X_train, X_test, y_train, y_test) tuple pickled by the LSTM
# preprocessing cells; the path is restated so this cell does not depend on them
# having run in the same session.
train_test_data_path = 'preprocessed_train_test_data_lstm.pkl'
X_train, X_test, y_train, y_test = pd.read_pickle(train_test_data_path)

In [None]:
# Step 2: Build the LSTM Model
vocab_size = 10000  # Ensure this matches the tokenizer vocab_size
max_length = 100    # Ensure this matches the sequence length used in preprocessing

# Layers are stacked one at a time: embedding -> two bidirectional LSTMs
# (the first returns the full sequence so the second can consume it) ->
# dense head with dropout -> 3-way softmax.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=64, input_length=max_length))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))  # 3 classes: negative, neutral, positive

# Labels are integer class ids, hence the sparse categorical loss
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Step 3: Train the Model
# Training stops once val_loss has gone 3 epochs without improving, and the
# best weights seen are restored.
# NOTE(review): the test split doubles as the validation set here, so early
# stopping is tuned on the same data the model is later reported on.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

training_options = dict(
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=2,
)
history = model.fit(X_train, y_train, **training_options)

In [None]:
# Step 4: Evaluate the Model
# The predicted class is the index of the largest softmax output per row
class_probabilities = model.predict(X_test)
y_pred = class_probabilities.argmax(axis=1)

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Negative', 'Neutral', 'Positive']))

# Confusion Matrix Visualization (drawn on an explicit Axes)
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Negative', 'Neutral', 'Positive'], yticklabels=['Negative', 'Neutral', 'Positive'], ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# Attempt at improvement 3
from tensorflow.keras.optimizers import Adam
# Step 1: Load Preprocessed Data
train_test_data_path = 'preprocessed_train_test_data_lstm.pkl'
X_train, X_test, y_train, y_test = pd.read_pickle(train_test_data_path)

# Step 2: Build the Enhanced LSTM Model with Lower Learning Rate
vocab_size = 10000
# The sequence length is taken from the loaded data instead of being hard-coded.
# The sequences were padded to 100 tokens during preprocessing, so the previous
# value of 150 declared an input length the data never has.
max_length = X_train.shape[1]
embedding_dim = 128

# Same layout as the first LSTM model, with wider layers and lighter dropout
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.3),
    Bidirectional(LSTM(64)),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(3, activation='softmax')
])

# Compile with lower learning rate
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate=0.0005),  # Reduced learning rate
    metrics=['accuracy']
)
model.summary()

In [None]:
# Check the unique labels in training data. np.unique accepts both a pandas
# Series and a NumPy array, so this cell still works when it is re-run after the
# training cell below has replaced y_train with np.array(y_train);
# Series.unique() raised AttributeError in that case.
print(np.unique(y_train))


In [None]:
# Step 3: Train the Model with Class Weights
# Errors on Negative (0) and Neutral (1) count twice as much as on Positive (2)
class_weights = {0: 2.0, 1: 2.0, 2: 1.0}

# Convert Data to NumPy Arrays (all four splits in one pass)
X_train, X_test, y_train, y_test = (
    np.array(split) for split in (X_train, X_test, y_train, y_test)
)

# Stop after 3 epochs without val_loss improvement and restore the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

training_options = dict(
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=32,
    class_weight=class_weights,  # Apply class weights here
    callbacks=[early_stopping],
    verbose=2,
)
history = model.fit(X_train, y_train, **training_options)

In [None]:
# Step 4: Evaluate the Model
y_pred = model.predict(X_test).argmax(axis=1)

# Display names for class ids 0, 1, 2, shared by the report and the plot
sentiment_names = ['Negative', 'Neutral', 'Positive']

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=sentiment_names))

# Confusion Matrix Visualization
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
heatmap_style = dict(annot=True, fmt='d', cmap='Blues')
sns.heatmap(cm, xticklabels=sentiment_names, yticklabels=sentiment_names, **heatmap_style)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
# Save the trained model to an HDF5 file
# `model` is whichever Keras model was built most recently; in notebook order
# that is the enhanced LSTM from the "improvement 3" cells.
model.save("lstm_sentiment_model.h5")
print("Model saved as lstm_sentiment_model.h5")

In [None]:
# Install the Hugging Face packages imported by the Transformers cells below
# (no versions are pinned, so the installed release depends on when this runs)
!pip install transformers datasets

**Transformers**

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd

# Step 1: Load Preprocessed Data
# This is the cleaned review CSV, not the pickled train/test splits used by
# the earlier models.
data_path = "cleaned_reviews.csv"
data = pd.read_csv(data_path)

# Map ratings to sentiment categories
def map_sentiment(rating):
    """Return the integer class id for a star rating.

    2 (positive) for ratings >= 4, 1 (neutral) for exactly 3,
    0 (negative) for everything else.
    """
    if rating >= 4:
        return 2  # Positive
    return 1 if rating == 3 else 0  # Neutral / Negative

data["labels"] = data["reviews.rating"].apply(map_sentiment)
# Keep only the two columns used for fine-tuning and drop rows missing either one
data = data[["reviews.text", "labels"]].dropna()

# Train-test split
from sklearn.model_selection import train_test_split
all_texts = data["reviews.text"].tolist()
all_labels = data["labels"].tolist()
train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts, all_labels, test_size=0.2, random_state=42
)

# Create one Dataset per split, then bundle them under "train"/"test"
train_dataset = Dataset.from_dict(dict(text=train_texts, label=train_labels))
test_dataset = Dataset.from_dict(dict(text=test_texts, label=test_labels))
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})



In [None]:
# Peek at the first test review. Indexing the Dataset directly returns that one
# row, instead of copying the whole split into a DataFrame to read a single value.
test_dataset[0]["text"]

In [None]:
# Step 2: Tokenize the Data
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Every text is truncated and padded to exactly 128 tokens
    (padding="max_length", max_length=128). Returns the tokenizer's output.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

# batched=True passes groups of examples to preprocess_function in one call
encoded_dataset = dataset.map(preprocess_function, batched=True)



In [None]:
# Step 3: Load Pre-Trained Transformer Model
# num_labels=3 matches the 0/1/2 class ids produced by map_sentiment above
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)



In [None]:
# Step 4: Fine-Tune the Model
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        eval_pred: a (logits, labels) pair; the predicted class for each row
            is the argmax of its logits over the last axis.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    weighted_scores = precision_recall_fscore_support(labels, predictions, average="weighted")
    return {
        "accuracy": accuracy_score(labels, predictions),
        "precision": weighted_scores[0],
        "recall": weighted_scores[1],
        "f1": weighted_scores[2],
    }

# Evaluation and checkpointing both happen once per epoch; with
# load_best_model_at_end=True the checkpoint with the best "accuracy"
# (as reported by compute_metrics) is reloaded when training finishes.
# NOTE(review): newer transformers releases are understood to rename
# `evaluation_strategy` to `eval_strategy` - confirm against the installed version,
# since the install cell does not pin one.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none"
)

# The "test" split serves as the evaluation set during training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

# Step 5: Evaluate the Model
metrics = trainer.evaluate()
print(metrics)

# Save the fine-tuned model
# Model and tokenizer go into the same directory so they can be loaded together
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")
print("Model and tokenizer saved to ./fine_tuned_model")

**Practice for web page**

In [None]:
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Load the Random Forest model
model_path = "original_rf_model.pkl"
rf_model = joblib.load(model_path)
print("Random Forest model loaded successfully.")

# Step 2: Load the CSV file of reviews to score
csv_file = "/content/cleaned_reviews.csv"
data = pd.read_csv(csv_file)

# The review text is expected in a column named 'reviews.text'
has_review_column = 'reviews.text' in data.columns
if not has_review_column:
    raise ValueError("The CSV file must have a 'reviews.text' column for the reviews.")

# Step 3: Preprocess the review text
# The vectorizer fitted during training is reloaded so the features line up
# with what the model was trained on
tfidf_path = "/content/tfidf_vectorizer.pkl"  # Replace with the path to your saved TF-IDF vectorizer
tfidf = joblib.load(tfidf_path)
print("TF-IDF vectorizer loaded successfully.")

# Missing reviews become empty strings before being vectorized
review_texts = data['reviews.text'].fillna("")
X_new = tfidf.transform(review_texts)

# Step 4: Make Predictions
predictions = rf_model.predict(X_new)

# Step 5: Map predictions back to sentiment labels and attach them to the frame
sentiment_mapping = {0: "Negative", 1: "Neutral", 2: "Positive"}
predicted_labels = [sentiment_mapping[class_id] for class_id in predictions]
data['Predicted Sentiment'] = predicted_labels

# Display the updated DataFrame
print(data[['reviews.text', 'Predicted Sentiment']])


**Part 2 of Project**

In [None]:
pip install transformers --upgrade

In [None]:
# Remove the installed torch stack and transformers, then reinstall torch from
# the CUDA 11.8 wheel index followed by transformers (no versions are pinned)
!pip uninstall -y torch torchvision torchaudio transformers
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install transformers

In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): no cell in the visible notebook reads from or writes to this
# mount point - confirm whether the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import torch
from transformers import pipeline

# Step 1: Load the Dataset
data = pd.read_csv("cleaned_reviews.csv")

# Ensure necessary columns exist: anything left after removing the columns
# that are present is missing
required_columns = {'categories', 'reviews.rating', 'reviews.text'}
missing_columns = required_columns - set(data.columns)
if missing_columns:
    raise ValueError("The dataset must contain 'categories', 'reviews.rating', and 'reviews.text' columns.")

In [None]:
# Step 2: Preprocessing
# Drop rows missing any required field, then drop reviews that are blank after trimming
required_fields = ['categories', 'reviews.rating', 'reviews.text']
data = data.dropna(subset=required_fields)
has_text = data['reviews.text'].str.strip().ne("")
data = data[has_text]

# Convert ratings to integers (if not already)
data['reviews.rating'] = data['reviews.rating'].astype(int)

In [None]:
# Step 3: Select Top-K Categories (e.g., Top 10 by review count)
K = 10
category_counts = data['categories'].value_counts()
top_categories = category_counts.nlargest(K).index
# Keep only the reviews that belong to one of those K categories
in_top_categories = data['categories'].isin(top_categories)
filtered_data = data[in_top_categories]

In [None]:
# Step 4: Group Reviews by Category and Rating
# Each (category, rating) pair gets one row whose text is all of its reviews
# joined with single spaces
reviews_by_group = filtered_data.groupby(['categories', 'reviews.rating'])['reviews.text']
grouped_reviews = reviews_by_group.apply(lambda texts: " ".join(texts)).reset_index()

In [None]:
# Step 5: Summarization Using Generative AI
# Load a pre-trained summarization model (e.g., T5 or GPT-3)
# The pipeline is built on the google/flan-t5-base checkpoint; the trailing
# comment names an alternative checkpoint that is not used here.
summarizer = pipeline("summarization", model="google/flan-t5-base")  # philschmid/flan-t5-base-samsum

# Function to summarize reviews
def summarize_reviews(text, max_length=512):#power of 2
    """Summarize `text` with the module-level `summarizer` pipeline.

    Args:
        text: the text to summarize; the pipeline is asked to truncate it.
        max_length: upper bound on the summary length passed to the pipeline
            (the lower bound is fixed at 30).

    Returns:
        The summary text, or - best effort - a string starting with
        "Error during summarization: " when anything raises, so that one bad
        group does not abort a whole DataFrame.apply run.
    """
    try:
        results = summarizer(text, max_length=max_length, min_length=30, truncation=True)
        first_result = results[0]
        return first_result['summary_text']
    except Exception as e:
        return f"Error during summarization: {str(e)}"

# Apply summarization for each category and rating (one summary per grouped row)
grouped_reviews['summary'] = grouped_reviews['reviews.text'].apply(summarize_reviews)

In [None]:
# Step 6: Save or Display Results
# The file holds one row per (category, rating) group: the joined review text
# plus its summary; index=False keeps the row labels out of the CSV.
output_file = "summarized_reviews.csv"
grouped_reviews.to_csv(output_file, index=False)
print(f"Summarized reviews saved to {output_file}")

In [None]:
from transformers import pipeline

# Load the FLAN-T5 summarization model
# (this builds a new pipeline and rebinds `summarizer`, so the cell does not
# depend on the Step 5 cell having run)
summarizer = pipeline("summarization", model="google/flan-t5-base")

# Save the model and tokenizer
# Both are written to the same directory so they can be reloaded together
save_directory = "./flan_t5_summary_model"
summarizer.model.save_pretrained(save_directory)
summarizer.tokenizer.save_pretrained(save_directory)

print(f"Model saved to {save_directory}")

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Directory where your model and tokenizer are saved
model_directory = "./flan_t5_summary_model"

# Load the saved tokenizer and model from the specified directory
# Note: this rebinds the global names `tokenizer` and `model`, which earlier
# sections of the notebook used for the BERT tokenizer and classifier.
tokenizer = T5Tokenizer.from_pretrained(model_directory)
model = T5ForConditionalGeneration.from_pretrained(model_directory)

def summarize_review(review_text):
    """Summarize one review with the reloaded T5 model.

    The review is prefixed with "summarize: ", encoded (truncated to at most
    512 tokens), passed through beam-search generation (4 beams, 40 to 150
    tokens), and the first generated sequence is decoded without special
    tokens.
    """
    prompt = "summarize: " + review_text
    inputs = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
    generation_options = dict(
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    summary_ids = model.generate(inputs, **generation_options)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Example usage:
# Reads one review from the prompt and prints its summary. The __main__ guard
# is presumably true when the cell runs in a notebook kernel, in which case
# the cell blocks until text is entered - confirm before using Run All.
if __name__ == "__main__":
    review = input("Enter your review: ")
    summary = summarize_review(review)
    print("\nSummary:")
    print(summary)