In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.decomposition import LatentDirichletAllocation
from textblob import TextBlob
from tqdm import tqdm
from google.colab import drive
from joblib import Parallel, delayed
import os
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.lda_model

# Mount Google Drive at /content/drive (the file paths below live under it)
drive.mount('/content/drive')

# Download NLTK data: the VADER lexicon and the Punkt tokenizer models
for nltk_resource in ('vader_lexicon', 'punkt'):
    nltk.download(nltk_resource)

# File paths: raw review TSV and the CSV cache of sentiment-scored reviews
file_path = '/content/drive/MyDrive/amazon_reviews_us_Watches_v1_00.tsv'
sentiment_data_path = '/content/drive/MyDrive/amazon_sentiment_data.csv'

# Load Data
# Reuse the cached, already-scored dataset when it exists; otherwise score the
# raw TSV chunk by chunk in parallel and cache the result for later runs.
if os.path.exists(sentiment_data_path):
    data = pd.read_csv(sentiment_data_path)
else:
    # Parallelized sentiment scoring
    def process_chunk(chunk):
        """Add VADER sentiment columns to one chunk of reviews.

        Args:
            chunk: DataFrame with a ``review_body`` column. It is modified in
                place and also returned.

        Returns:
            The same DataFrame with two extra columns: ``sentiment_score``
            (VADER compound score) and ``sentiment`` (``'positive'`` when the
            score is >= 0.05, otherwise ``'negative'``).
        """
        # One analyzer per chunk, created inside the function that joblib runs.
        sid_obj = SentimentIntensityAnalyzer()

        def sentiment_scores(sentence):
            """Return the VADER compound score of *sentence*."""
            # Missing bodies arrive as non-strings (e.g. NaN); score their
            # string form instead of failing.
            if not isinstance(sentence, str):
                sentence = str(sentence)
            return sid_obj.polarity_scores(sentence)['compound']

        chunk['sentiment_score'] = chunk['review_body'].apply(sentiment_scores)
        chunk['sentiment'] = chunk['sentiment_score'].apply(lambda x: 'positive' if x >= 0.05 else 'negative')
        return chunk

    # Process each chunk. The chunked reader holds the file open, so wrap it
    # in a context manager to close it once every chunk has been consumed.
    # quoting=3 is csv.QUOTE_NONE; malformed lines are skipped.
    with pd.read_csv(file_path, sep='\t', on_bad_lines='skip', quoting=3, chunksize=100000) as chunks:
        data_chunks = Parallel(n_jobs=-1)(delayed(process_chunk)(chunk) for chunk in chunks)
    data = pd.concat(data_chunks, ignore_index=True)
    data.to_csv(sentiment_data_path, index=False)

# Aspect-Based Sentiment Analysis
aspects = ['design', 'durability', 'price', 'brand']

def extract_aspects(review):
    """Label every aspect mentioned in *review* with the review's polarity.

    Matching is a case-insensitive substring test against ``aspects``, so
    "Price" and "priced" both count as a mention of ``'price'``. Polarity is
    computed once for the whole review, so all mentioned aspects share the
    same label.

    Args:
        review: Review text. Non-string values (e.g. NaN) are converted with
            ``str()`` before matching.

    Returns:
        Dict mapping each mentioned aspect to ``'positive'`` (TextBlob
        polarity > 0) or ``'negative'`` (polarity <= 0). Empty when no aspect
        is mentioned.
    """
    if not isinstance(review, str):
        review = str(review)

    # Aspect keywords are lowercase, so compare against a lowercased copy.
    lowered = review.lower()
    mentioned = [aspect for aspect in aspects if aspect in lowered]
    if not mentioned:
        # Skip the TextBlob parse entirely when there is nothing to label.
        return {}

    # Polarity does not depend on the aspect: compute it once, on the
    # original (non-lowercased) text.
    polarity = TextBlob(review).sentiment.polarity
    label = 'positive' if polarity > 0 else 'negative'
    return {aspect: label for aspect in mentioned}

# Run extract_aspects over every review body on all available cores; the
# results come back in input order and are stored as one dict per review.
aspect_jobs = (delayed(extract_aspects)(review) for review in data['review_body'])
data['aspect_sentiments'] = Parallel(n_jobs=-1)(aspect_jobs)

# TF-IDF Vectorization
# Unigrams and bigrams, English stop words removed, vocabulary capped at 10,000 terms.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=10000,
)

# Missing review bodies become empty strings so the vectorizer only sees text.
review_text = data['review_body'].fillna('')
X_train, X_test, y_train, y_test = train_test_split(
    review_text,
    data['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Fit the vocabulary on the training split only, then reuse it for the test split.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Logistic Regression with GridSearch
# Search regularization strength and penalty type with 3-fold cross-validation.
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
}
base_classifier = LogisticRegression(solver='liblinear', class_weight='balanced')
grid_search = GridSearchCV(base_classifier, param_grid, cv=3)
grid_search.fit(X_train_tfidf, y_train)

# Keep the estimator refit with the best parameter combination.
model = grid_search.best_estimator_

# Evaluation
# Score the tuned model on the held-out split; precision, recall and F1 are
# averaged across classes weighted by class support.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Optimized Logistic Regression Results:")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-Score: {f1:.3f}")

# LDA for Topic Modeling
# Fit a 5-topic model on the TF-IDF matrix of the training reviews.
lda = LatentDirichletAllocation(n_components=5, random_state=42, max_iter=10)
lda.fit(X_train_tfidf)

# Fetch the vocabulary once; it is the same for every topic and every word,
# so there is no need to rebuild it for each printed term.
feature_names = tfidf.get_feature_names_out()

print("\nLDA Topics:")
for idx, topic in enumerate(lda.components_):
    # argsort is ascending, so the last ten indices are the ten highest-weighted
    # terms of the topic, listed from lowest to highest weight.
    print(f"Topic {idx}: ", [feature_names[i] for i in topic.argsort()[-10:]])

# pyLDAvis Visualization
# Interactive topic visualization built from the fitted LDA model, the training
# TF-IDF matrix, and the vectorizer that produced that matrix.
pyLDAvis.enable_notebook()
# mds='tsne' is forwarded to prepare(); presumably it selects t-SNE for the
# 2-D layout of topic distances — confirm against the pyLDAvis docs.
panel = pyLDAvis.lda_model.prepare(lda, X_train_tfidf, tfidf, mds='tsne')
# NOTE(review): the return value of display() is not used here; if this is not
# the last expression of a notebook cell the panel may not render — confirm.
pyLDAvis.display(panel)

# Sentiment Distribution Visualization
# Pass the column as `x=`: the first positional parameter of countplot is
# `data`, so a positional Series is not treated as the variable to count.
sns.countplot(x=data['sentiment'])
plt.title('Sentiment Distribution')
plt.show()

# Word Cloud for Negative Reviews
negative_reviews = data[data['sentiment'] == 'negative']['review_body'].astype(str)
# `stopwords` expects a collection of words. A bare string such as 'english'
# is iterated character by character, which filters only single letters and
# leaves real stop words in the cloud. Leaving the argument unset makes
# WordCloud fall back to its built-in English stop-word list.
wordcloud = WordCloud(width=800, height=400).generate(" ".join(negative_reviews))
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud for Negative Reviews')
plt.show()

# Aspect Sentiment Visualization
# Flatten the per-review {aspect: label} dicts into one row per mention,
# keyed by (review index, aspect). Iterating the dicts directly avoids
# building a pandas Series for every single review.
aspect_records = [
    ((row_label, aspect), label)
    for row_label, sentiments in data['aspect_sentiments'].dropna().items()
    for aspect, label in sentiments.items()
]
# Declaring the columns up front keeps the expected schema even when no
# aspect was mentioned in any review.
aspect_data = pd.DataFrame(aspect_records, columns=['Index', 'Aspect Sentiment'])

# Count positive vs. negative labels across all aspect mentions.
aspect_counts = aspect_data['Aspect Sentiment'].value_counts()
sns.barplot(x=aspect_counts.index, y=aspect_counts.values)
plt.title('Aspect-Based Sentiment Distribution')
plt.xlabel('Aspect Sentiment')
plt.ylabel('Count')
plt.show()
